From a68fed6dd56cfe60715a3645ce93e99691b6c98b Mon Sep 17 00:00:00 2001
From: Scare Crowe <84860158+CWDSYSTEMS@users.noreply.github.com>
Date: Tue, 26 Oct 2021 22:35:21 +0500
Subject: [PATCH] T3Q gets dumped cuz he wanks the clowns he cries his lies to.

---
 mesa 3D driver/.gitlab-ci.yml | 38 +- mesa 3D driver/CODEOWNERS | 138 + mesa 3D driver/VERSION | 2 +- mesa 3D driver/bin/khronos-update.py | 82 +- mesa 3D driver/bin/symbols-check.py | 2 +- mesa 3D driver/bin/update-android-headers.sh | 25 +- .../docs/drivers/freedreno/isaspec.rst | 5 +- mesa 3D driver/docs/drivers/llvmpipe.rst | 9 +- mesa 3D driver/docs/drivers/panfrost.rst | 16 +- mesa 3D driver/docs/drivers/zink.rst | 17 + mesa 3D driver/docs/envvars.rst | 102 +- mesa 3D driver/docs/features.txt | 93 +- mesa 3D driver/docs/gallium/context.rst | 2 + .../docs/gallium/cso/rasterizer.rst | 13 +- mesa 3D driver/docs/gallium/screen.rst | 5 + mesa 3D driver/docs/gallium/tgsi.rst | 5 - mesa 3D driver/docs/isl/tiling.rst | 31 + mesa 3D driver/docs/release-calendar.csv | 16 +- mesa 3D driver/docs/relnotes.rst | 8 + mesa 3D driver/docs/relnotes/21.1.8.rst | 105 + mesa 3D driver/docs/relnotes/21.2.2.rst | 293 ++ mesa 3D driver/docs/relnotes/21.2.3.rst | 139 + mesa 3D driver/docs/relnotes/21.2.4.rst | 147 + mesa 3D driver/docs/relnotes/new_features.txt | 8 - mesa 3D driver/include/EGL/eglext.h | 18 +- mesa 3D driver/include/GL/gl.h | 4 +- .../include/GL/internal/dri_interface.h | 160 +- .../android_stub/android/hardware_buffer.h | 7 - .../include/android_stub/android/log.h | 2 - .../android_stub/android/native_window.h | 12 - .../include/android_stub/cutils/compiler.h | 44 + .../include/android_stub/cutils/trace.h | 238 + .../include/android_stub/log/log_main.h | 10 +- .../include/android_stub/ndk/sync.h | 4 - .../android_stub/vndk/hardware_buffer.h | 14 + mesa 3D driver/include/drm-uapi/virtgpu_drm.h | 27 + .../include/vulkan/vulkan_android.h | 15 +- mesa 3D driver/include/vulkan/vulkan_beta.h | 99 +- mesa 3D driver/include/vulkan/vulkan_core.h | 298 +- .../include/vulkan/vulkan_fuchsia.h | 141 + mesa 3D driver/meson.build | 142 +- mesa 3D driver/meson_options.txt | 8 +- .../src/amd/addrlib/inc/addrinterface.h | 51 +- .../src/amd/addrlib/inc/addrtypes.h | 51 +- .../src/amd/addrlib/src/addrinterface.cpp | 51 +- .../src/amd/addrlib/src/amdgpu_asic_addr.h | 51 +- .../src/amd/addrlib/src/chip/r800/si_gb_reg.h | 51 +- .../src/amd/addrlib/src/core/addrcommon.h | 51 +- .../src/amd/addrlib/src/core/addrelemlib.cpp | 51 +- .../src/amd/addrlib/src/core/addrelemlib.h | 52 +- .../src/amd/addrlib/src/core/addrlib.cpp | 51 +- .../src/amd/addrlib/src/core/addrlib.h | 51 +- .../src/amd/addrlib/src/core/addrlib1.cpp | 51 +- .../src/amd/addrlib/src/core/addrlib1.h | 52 +- .../src/amd/addrlib/src/core/addrlib2.cpp | 52 +- .../src/amd/addrlib/src/core/addrlib2.h | 52 +- .../src/amd/addrlib/src/core/addrobject.cpp | 52 +- .../src/amd/addrlib/src/core/addrobject.h | 52 +- .../src/amd/addrlib/src/core/coord.cpp | 51 +- .../src/amd/addrlib/src/core/coord.h | 51 +- .../addrlib/src/gfx10/gfx10SwizzlePattern.h | 53 +- .../amd/addrlib/src/gfx10/gfx10addrlib.cpp | 125 +- .../src/amd/addrlib/src/gfx10/gfx10addrlib.h | 52 +- .../src/amd/addrlib/src/gfx9/gfx9addrlib.cpp | 117 +- .../src/amd/addrlib/src/gfx9/gfx9addrlib.h | 51 +- .../src/amd/addrlib/src/r800/ciaddrlib.cpp | 51 +- .../src/amd/addrlib/src/r800/ciaddrlib.h | 51 +- .../src/amd/addrlib/src/r800/egbaddrlib.cpp | 51 +- .../src/amd/addrlib/src/r800/egbaddrlib.h | 51 +- .../src/amd/addrlib/src/r800/siaddrlib.cpp | 51 +-
.../src/amd/addrlib/src/r800/siaddrlib.h | 51 +- .../amd/ci/deqp-radv-bonaire-aco-skips.txt | 10 - .../src/amd/ci/deqp-radv-hawaii-aco-skips.txt | 10 - .../src/amd/ci/deqp-radv-raven-aco-flakes.txt | 2 + .../src/amd/ci/deqp-radv-raven-aco-skips.txt | 12 +- .../amd/ci/deqp-radv-renoir-aco-flakes.txt | 2 + mesa 3D driver/src/amd/ci/deqp-radv-skips.txt | 9 + .../src/amd/ci/deqp-radv-stoney-aco-skips.txt | 10 - .../amd/ci/deqp-radv-vangogh-aco-fails.txt | 0 .../amd/ci/deqp-radv-vega10-aco-flakes.txt | 2 + mesa 3D driver/src/amd/ci/gitlab-ci.yml | 6 +- mesa 3D driver/src/amd/common/ac_debug.c | 64 +- mesa 3D driver/src/amd/common/ac_exp_param.h | 2 +- mesa 3D driver/src/amd/common/ac_gpu_info.c | 26 +- mesa 3D driver/src/amd/common/ac_gpu_info.h | 2 + mesa 3D driver/src/amd/common/ac_nir.h | 21 +- mesa 3D driver/src/amd/common/ac_nir_cull.c | 9 +- .../amd/common/ac_nir_lower_esgs_io_to_mem.c | 4 +- .../src/amd/common/ac_nir_lower_ngg.c | 422 +- .../amd/common/ac_nir_lower_tess_io_to_mem.c | 12 +- mesa 3D driver/src/amd/common/ac_rgp.c | 29 +- mesa 3D driver/src/amd/common/ac_rgp.h | 2 + .../src/amd/common/ac_rgp_elf_object_pack.c | 10 +- .../src/amd/common/ac_shader_args.h | 4 +- .../src/amd/common/ac_shader_util.c | 70 + .../src/amd/common/ac_shader_util.h | 14 + mesa 3D driver/src/amd/common/ac_surface.c | 149 +- mesa 3D driver/src/amd/common/ac_surface.h | 6 + .../src/amd/common/ac_surface_modifier_test.c | 2 +- mesa 3D driver/src/amd/compiler/README-ISA.md | 10 + mesa 3D driver/src/amd/compiler/README.md | 20 + .../src/amd/compiler/aco_assembler.cpp | 27 +- .../src/amd/compiler/aco_builder_h.py | 2 +- .../src/amd/compiler/aco_insert_NOPs.cpp | 134 +- .../src/amd/compiler/aco_insert_waitcnt.cpp | 9 +- .../compiler/aco_instruction_selection.cpp | 1048 +++-- .../amd/compiler/aco_instruction_selection.h | 6 +- .../aco_instruction_selection_setup.cpp | 277 +- .../src/amd/compiler/aco_interface.cpp | 117 +- .../src/amd/compiler/aco_interface.h | 13 +- mesa 3D driver/src/amd/compiler/aco_ir.cpp | 325 +- mesa 3D driver/src/amd/compiler/aco_ir.h | 142 +- .../amd/compiler/aco_live_var_analysis.cpp | 27 +- .../src/amd/compiler/aco_lower_phis.cpp | 287 +- .../src/amd/compiler/aco_lower_to_cssa.cpp | 38 +- .../amd/compiler/aco_lower_to_hw_instr.cpp | 146 +- .../src/amd/compiler/aco_opcodes.py | 32 +- .../src/amd/compiler/aco_opcodes_cpp.py | 5 - .../amd/compiler/aco_opt_value_numbering.cpp | 4 +- .../src/amd/compiler/aco_optimizer.cpp | 594 ++- .../src/amd/compiler/aco_optimizer_postRA.cpp | 168 +- .../src/amd/compiler/aco_print_asm.cpp | 127 +- .../src/amd/compiler/aco_print_ir.cpp | 93 +- .../amd/compiler/aco_register_allocation.cpp | 906 ++-- .../src/amd/compiler/aco_scheduler.cpp | 37 +- mesa 3D driver/src/amd/compiler/aco_spill.cpp | 345 +- .../src/amd/compiler/aco_statistics.cpp | 9 + .../src/amd/compiler/aco_validate.cpp | 105 +- .../src/amd/compiler/tests/helpers.cpp | 47 +- .../src/amd/compiler/tests/helpers.h | 2 +- .../src/amd/compiler/tests/meson.build | 2 +- .../src/amd/compiler/tests/test_optimizer.cpp | 192 +- .../compiler/tests/test_optimizer_postRA.cpp | 118 + .../src/amd/compiler/tests/test_regalloc.cpp | 143 +- .../src/amd/compiler/tests/test_sdwa.cpp | 124 +- .../amd/compiler/tests/test_to_hw_instr.cpp | 143 +- mesa 3D driver/src/amd/llvm/ac_llvm_build.c | 59 +- mesa 3D driver/src/amd/llvm/ac_llvm_build.h | 7 +- mesa 3D driver/src/amd/llvm/ac_llvm_cull.c | 67 +- mesa 3D driver/src/amd/llvm/ac_llvm_cull.h | 12 +- .../src/amd/llvm/ac_llvm_helper.cpp | 2 +- mesa 3D 
driver/src/amd/llvm/ac_nir_to_llvm.c | 361 +- mesa 3D driver/src/amd/llvm/ac_shader_abi.h | 13 +- mesa 3D driver/src/amd/meson.build | 6 +- .../src/amd/vulkan/layers/radv_sqtt_layer.c | 22 +- mesa 3D driver/src/amd/vulkan/meson.build | 38 +- .../amd/vulkan/radv_acceleration_structure.c | 972 +++- .../amd/vulkan/radv_acceleration_structure.h | 102 + mesa 3D driver/src/amd/vulkan/radv_android.c | 120 +- .../src/amd/vulkan/radv_cmd_buffer.c | 904 +++- .../src/amd/vulkan/radv_constants.h | 14 +- mesa 3D driver/src/amd/vulkan/radv_debug.c | 52 +- mesa 3D driver/src/amd/vulkan/radv_debug.h | 37 +- .../src/amd/vulkan/radv_descriptor_set.c | 59 +- mesa 3D driver/src/amd/vulkan/radv_device.c | 1102 +++-- mesa 3D driver/src/amd/vulkan/radv_formats.c | 404 +- mesa 3D driver/src/amd/vulkan/radv_image.c | 124 +- .../src/amd/vulkan/radv_llvm_helper.cpp | 9 +- .../src/amd/vulkan/radv_llvm_helper.h | 41 + mesa 3D driver/src/amd/vulkan/radv_meta.c | 16 + mesa 3D driver/src/amd/vulkan/radv_meta.h | 14 +- .../src/amd/vulkan/radv_meta_blit.c | 3 + .../src/amd/vulkan/radv_meta_blit2d.c | 11 + .../src/amd/vulkan/radv_meta_buffer.c | 70 +- .../src/amd/vulkan/radv_meta_bufimage.c | 194 +- .../src/amd/vulkan/radv_meta_clear.c | 128 +- .../src/amd/vulkan/radv_meta_copy.c | 51 + .../src/amd/vulkan/radv_meta_copy_vrs_htile.c | 62 +- .../src/amd/vulkan/radv_meta_dcc_retile.c | 69 +- .../src/amd/vulkan/radv_meta_decompress.c | 251 +- .../src/amd/vulkan/radv_meta_fast_clear.c | 24 +- .../src/amd/vulkan/radv_meta_fmask_expand.c | 18 +- .../src/amd/vulkan/radv_meta_resolve.c | 12 +- .../src/amd/vulkan/radv_meta_resolve_cs.c | 46 +- .../src/amd/vulkan/radv_meta_resolve_fs.c | 12 +- .../vulkan/radv_nir_lower_ycbcr_textures.c | 2 +- .../src/amd/vulkan/radv_nir_to_llvm.c | 1169 ++--- mesa 3D driver/src/amd/vulkan/radv_pass.c | 4 +- mesa 3D driver/src/amd/vulkan/radv_pipeline.c | 1143 ++--- .../src/amd/vulkan/radv_pipeline_cache.c | 82 +- .../src/amd/vulkan/radv_pipeline_rt.c | 2354 ++++++++++ mesa 3D driver/src/amd/vulkan/radv_private.h | 405 +- mesa 3D driver/src/amd/vulkan/radv_query.c | 128 +- mesa 3D driver/src/amd/vulkan/radv_shader.c | 878 ++-- mesa 3D driver/src/amd/vulkan/radv_shader.h | 272 +- .../src/amd/vulkan/radv_shader_args.c | 501 +- .../src/amd/vulkan/radv_shader_args.h | 14 +- .../src/amd/vulkan/radv_shader_info.c | 247 +- mesa 3D driver/src/amd/vulkan/radv_sqtt.c | 85 +- mesa 3D driver/src/amd/vulkan/radv_util.c | 57 - mesa 3D driver/src/amd/vulkan/radv_wsi.c | 173 +- .../src/amd/vulkan/radv_wsi_display.c | 187 - mesa 3D driver/src/amd/vulkan/si_cmd_buffer.c | 91 +- .../src/android_stub/cutils_stub.cpp | 24 +- .../src/broadcom/ci/deqp-v3d-rpi4-gles.toml | 2 + .../src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt | 25 +- .../src/broadcom/ci/deqp-vc4-rpi3-fails.txt | 15 +- .../src/broadcom/ci/deqp-vc4-rpi3-gles.toml | 2 + mesa 3D driver/src/broadcom/ci/gitlab-ci.yml | 6 +- .../src/broadcom/ci/piglit-v3d-rpi4-fails.txt | 2 - .../src/broadcom/ci/piglit-vc4-rpi3-fails.txt | 18 +- .../src/broadcom/ci/piglit-vc4-rpi3-skips.txt | 1 + mesa 3D driver/src/broadcom/clif/clif_dump.c | 24 +- mesa 3D driver/src/broadcom/clif/clif_dump.h | 2 +- .../src/broadcom/clif/clif_private.h | 6 + mesa 3D driver/src/broadcom/clif/v3dx_dump.c | 19 + .../src/broadcom/common/v3d_debug.c | 4 + .../src/broadcom/common/v3d_debug.h | 7 +- .../src/broadcom/compiler/nir_to_vir.c | 74 +- .../src/broadcom/compiler/qpu_schedule.c | 8 + .../src/broadcom/compiler/v3d_compiler.h | 26 - .../src/broadcom/compiler/v3d_nir_lower_io.c | 57 + mesa 3D 
driver/src/broadcom/compiler/vir.c | 29 +- .../broadcom/compiler/vir_live_variables.c | 2 +- .../broadcom/compiler/vir_register_allocate.c | 22 +- .../src/broadcom/compiler/vir_to_qpu.c | 15 +- mesa 3D driver/src/broadcom/qpu/qpu_pack.c | 2 +- .../src/broadcom/vulkan/meson.build | 21 +- .../src/broadcom/vulkan/v3dv_cmd_buffer.c | 49 +- .../src/broadcom/vulkan/v3dv_descriptor_set.c | 27 +- .../src/broadcom/vulkan/v3dv_device.c | 143 +- .../src/broadcom/vulkan/v3dv_image.c | 251 +- .../src/broadcom/vulkan/v3dv_meta_clear.c | 26 +- .../src/broadcom/vulkan/v3dv_meta_copy.c | 367 +- .../src/broadcom/vulkan/v3dv_pass.c | 4 +- .../src/broadcom/vulkan/v3dv_pipeline.c | 178 +- .../src/broadcom/vulkan/v3dv_pipeline_cache.c | 6 +- .../src/broadcom/vulkan/v3dv_private.h | 174 +- .../src/broadcom/vulkan/v3dv_query.c | 52 +- .../src/broadcom/vulkan/v3dv_queue.c | 95 +- .../src/broadcom/vulkan/v3dv_uniforms.c | 20 +- mesa 3D driver/src/broadcom/vulkan/v3dv_wsi.c | 203 +- .../src/broadcom/vulkan/v3dvx_cmd_buffer.c | 41 +- .../src/broadcom/vulkan/v3dvx_device.c | 10 +- .../src/broadcom/vulkan/v3dvx_image.c | 37 +- .../src/broadcom/vulkan/v3dvx_meta_common.c | 132 +- .../src/broadcom/vulkan/v3dvx_pipeline.c | 12 +- .../src/broadcom/vulkan/v3dvx_private.h | 20 +- .../src/compiler/builtin_type_macros.h | 44 + mesa 3D driver/src/compiler/clc/clc.c | 314 ++ mesa 3D driver/src/compiler/clc/clc.h | 247 + .../src/compiler/clc/clc_helpers.cpp | 1135 +++++ mesa 3D driver/src/compiler/clc/clc_helpers.h | 104 + mesa 3D driver/src/compiler/clc/meson.build | 66 + mesa 3D driver/src/compiler/glsl/ast.h | 2 + .../src/compiler/glsl/ast_to_hir.cpp | 61 +- .../src/compiler/glsl/builtin_variables.cpp | 3 + .../glsl/gl_nir_link_uniform_initializers.c | 1 + .../src/compiler/glsl/gl_nir_link_uniforms.c | 3 +- .../src/compiler/glsl/gl_nir_linker.h | 3 +- .../src/compiler/glsl/gl_nir_lower_buffers.c | 15 +- .../src/compiler/glsl/gl_nir_lower_images.c | 2 +- .../glsl/gl_nir_lower_samplers_as_deref.c | 9 +- .../src/compiler/glsl/glsl_parser.yy | 2 +- .../src/compiler/glsl/glsl_to_nir.cpp | 18 +- mesa 3D driver/src/compiler/glsl/ir_clone.cpp | 1 + .../glsl/link_uniform_initializers.cpp | 1 + .../src/compiler/glsl/link_varyings.cpp | 6 +- mesa 3D driver/src/compiler/glsl/linker.cpp | 2 +- .../glsl/lower_blend_equation_advanced.cpp | 6 +- .../glsl/opt_dead_builtin_varyings.cpp | 2 +- .../src/compiler/glsl/serialize.cpp | 6 - .../src/compiler/glsl/tests/meson.build | 4 + .../glsl/tests/uniform_initializer_utils.cpp | 3 + mesa 3D driver/src/compiler/glsl_types.cpp | 134 +- mesa 3D driver/src/compiler/glsl_types.h | 16 + .../src/compiler/isaspec/README.rst | 1 + mesa 3D driver/src/compiler/isaspec/decode.c | 793 ++++ mesa 3D driver/src/compiler/isaspec/decode.h | 149 + mesa 3D driver/src/compiler/isaspec/decode.py | 310 ++ mesa 3D driver/src/compiler/isaspec/encode.py | 652 +++ mesa 3D driver/src/compiler/isaspec/isa.py | 553 +++ .../src/compiler/isaspec/meson.build | 24 + mesa 3D driver/src/compiler/meson.build | 28 +- mesa 3D driver/src/compiler/nir/meson.build | 152 +- mesa 3D driver/src/compiler/nir/nir.c | 298 +- mesa 3D driver/src/compiler/nir/nir.h | 159 +- mesa 3D driver/src/compiler/nir/nir_builder.h | 2 + .../src/compiler/nir/nir_builtin_builder.c | 4 +- mesa 3D driver/src/compiler/nir/nir_clone.c | 10 +- .../src/compiler/nir/nir_control_flow.c | 7 +- mesa 3D driver/src/compiler/nir/nir_deref.c | 15 +- .../compiler/nir/nir_divergence_analysis.c | 36 +- .../src/compiler/nir/nir_from_ssa.c | 26 +- 
.../src/compiler/nir/nir_gather_info.c | 27 +- .../src/compiler/nir/nir_intrinsics.py | 20 +- .../src/compiler/nir/nir_linking_helpers.c | 281 +- .../src/compiler/nir/nir_loop_analyze.c | 17 +- .../compiler/nir/nir_lower_alu_to_scalar.c | 8 +- .../src/compiler/nir/nir_lower_amul.c | 62 +- .../compiler/nir/nir_lower_atomics_to_ssbo.c | 14 +- .../src/compiler/nir/nir_lower_bit_size.c | 21 +- .../src/compiler/nir/nir_lower_blend.c | 17 +- .../nir/nir_lower_discard_or_demote.c | 11 +- .../src/compiler/nir/nir_lower_flrp.c | 2 +- .../compiler/nir/nir_lower_gs_intrinsics.c | 48 + .../src/compiler/nir/nir_lower_image.c | 2 +- .../compiler/nir/nir_lower_indirect_derefs.c | 2 +- .../nir/nir_lower_input_attachments.c | 4 +- .../src/compiler/nir/nir_lower_int_to_float.c | 2 + .../src/compiler/nir/nir_lower_io.c | 39 +- .../nir/nir_lower_io_arrays_to_elements.c | 6 +- .../src/compiler/nir/nir_lower_io_to_scalar.c | 6 +- .../nir/nir_lower_io_to_temporaries.c | 4 +- .../src/compiler/nir/nir_lower_io_to_vector.c | 25 +- .../compiler/nir/nir_lower_locals_to_regs.c | 6 +- .../nir/nir_lower_passthrough_edgeflags.c | 4 + .../compiler/nir/nir_lower_phis_to_scalar.c | 23 +- .../nir/nir_lower_readonly_images_to_tex.c | 16 +- .../src/compiler/nir/nir_lower_ssbo.c | 8 +- .../src/compiler/nir/nir_lower_subgroups.c | 10 +- .../compiler/nir/nir_lower_system_values.c | 3 +- .../nir/nir_lower_sysvals_to_varyings.c | 72 + .../src/compiler/nir/nir_lower_tex.c | 91 +- .../src/compiler/nir/nir_lower_ubo_vec4.c | 2 +- .../compiler/nir/nir_lower_uniforms_to_ubo.c | 4 +- .../src/compiler/nir/nir_lower_var_copies.c | 2 +- .../src/compiler/nir/nir_lower_vec_to_movs.c | 8 +- .../src/compiler/nir/nir_opcodes.py | 168 +- .../src/compiler/nir/nir_opt_access.c | 12 +- .../src/compiler/nir/nir_opt_algebraic.py | 161 +- .../src/compiler/nir/nir_opt_comparison_pre.c | 10 +- .../compiler/nir/nir_opt_constant_folding.c | 3 +- mesa 3D driver/src/compiler/nir/nir_opt_gcm.c | 109 +- mesa 3D driver/src/compiler/nir/nir_opt_if.c | 111 +- .../src/compiler/nir/nir_opt_loop_unroll.c | 22 +- .../src/compiler/nir/nir_opt_memcpy.c | 2 +- .../compiler/nir/nir_opt_peephole_select.c | 18 +- .../src/compiler/nir/nir_opt_undef.c | 4 +- .../src/compiler/nir/nir_opt_vectorize.c | 5 +- mesa 3D driver/src/compiler/nir/nir_print.c | 29 +- .../compiler/nir/nir_propagate_invariant.c | 2 + .../src/compiler/nir/nir_range_analysis.c | 26 +- .../src/compiler/nir/nir_schedule.c | 12 + mesa 3D driver/src/compiler/nir/nir_search.c | 3 +- .../src/compiler/nir/nir_search_helpers.h | 21 + .../src/compiler/nir/nir_serialize.c | 86 +- .../src/compiler/nir/nir_split_vars.c | 15 +- mesa 3D driver/src/compiler/nir/nir_sweep.c | 36 +- .../src/compiler/nir/nir_validate.c | 19 +- .../src/compiler/nir/nir_worklist.h | 3 +- .../src/compiler/nir/tests/core_tests.cpp | 19 + .../nir/tests/negative_equal_tests.cpp | 3 +- mesa 3D driver/src/compiler/nir_types.cpp | 118 +- mesa 3D driver/src/compiler/nir_types.h | 15 + mesa 3D driver/src/compiler/shader_enums.c | 24 + mesa 3D driver/src/compiler/shader_enums.h | 5 + mesa 3D driver/src/compiler/shader_info.h | 29 + mesa 3D driver/src/compiler/spirv/nir_spirv.h | 5 - .../compiler/spirv/spirv.core.grammar.json | 84 +- mesa 3D driver/src/compiler/spirv/spirv.h | 5 + mesa 3D driver/src/compiler/spirv/spirv2nir.c | 6 +- .../src/compiler/spirv/spirv_to_nir.c | 344 +- mesa 3D driver/src/compiler/spirv/vtn_alu.c | 361 +- .../src/compiler/spirv/vtn_private.h | 42 + .../src/compiler/spirv/vtn_variables.c | 183 +- 
.../src/egl/drivers/dri2/egl_dri2.c | 40 +- .../src/egl/drivers/dri2/egl_dri2.h | 9 +- .../src/egl/drivers/dri2/platform_device.c | 15 +- .../src/egl/drivers/dri2/platform_drm.c | 2 + .../src/egl/drivers/dri2/platform_wayland.c | 91 +- .../src/egl/drivers/dri2/platform_x11_dri3.c | 1 + mesa 3D driver/src/egl/drivers/wgl/egl_wgl.c | 712 +++ mesa 3D driver/src/egl/drivers/wgl/egl_wgl.h | 57 + mesa 3D driver/src/egl/generate/egl.xml | 24 +- .../src/egl/generate/eglFunctionList.py | 25 + mesa 3D driver/src/egl/generate/egl_other.xml | 27 + mesa 3D driver/src/egl/main/egl.def | 12 + mesa 3D driver/src/egl/main/eglapi.c | 1 + mesa 3D driver/src/egl/main/eglcurrent.c | 9 +- .../src/egl/main/egldispatchstubs.c | 27 + .../src/egl/main/egldispatchstubs.h | 27 + mesa 3D driver/src/egl/main/egldisplay.c | 17 +- mesa 3D driver/src/egl/main/egldisplay.h | 2 + mesa 3D driver/src/egl/main/eglglobals.c | 4 +- mesa 3D driver/src/egl/main/eglglvnd.c | 27 + mesa 3D driver/src/egl/main/egllog.c | 1 - mesa 3D driver/src/egl/main/eglsurface.c | 21 + mesa 3D driver/src/egl/main/eglsurface.h | 2 + mesa 3D driver/src/egl/meson.build | 12 + .../etnaviv/ci/deqp-etnaviv-gc2000-fails.txt | 151 + .../etnaviv/ci/deqp-etnaviv-gc2000-flakes.txt | 39 + mesa 3D driver/src/etnaviv/ci/gitlab-ci.yml | 36 + .../src/etnaviv/drm/etnaviv_device.c | 25 +- .../src/etnaviv/drm/etnaviv_drmif.h | 3 + mesa 3D driver/src/etnaviv/drm/etnaviv_priv.h | 1 + .../.gitlab-ci/reference/afuc_test.asm | 1 - .../freedreno/.gitlab-ci/reference/crash.log | 28 +- ...cification.basic_teximage2d.rgba16f_2d.log | 1 - ...exed.indirect_draw_count.triangle_list.log | 11 +- .../.gitlab-ci/reference/es2gears-a320.log | 1 - .../.gitlab-ci/reference/fd-clouds.log | 23 +- .../.gitlab-ci/reference/glxgears-a420.log | 1 - .../freedreno/.gitlab-ci/reference/shadow.log | 2 - mesa 3D driver/src/freedreno/afuc/disasm.c | 13 +- mesa 3D driver/src/freedreno/afuc/meson.build | 87 +- .../src/freedreno/ci/deqp-freedreno-a307.toml | 2 + .../ci/deqp-freedreno-a530-fails.txt | 20 +- .../ci/deqp-freedreno-a530-flakes.txt | 20 + .../ci/deqp-freedreno-a530-skips.txt | 4 + .../src/freedreno/ci/deqp-freedreno-a530.toml | 2 + .../ci/deqp-freedreno-a630-fails.txt | 293 +- .../ci/deqp-freedreno-a630-flakes.txt | 13 +- .../ci/deqp-freedreno-a630-skips.txt | 8 +- .../ci/deqp-freedreno-a630-vk-full.toml | 1 + .../freedreno/ci/deqp-freedreno-a630-vk.toml | 10 + .../src/freedreno/ci/deqp-freedreno-a630.toml | 17 + mesa 3D driver/src/freedreno/ci/gitlab-ci.yml | 20 +- .../ci/restricted-traces-freedreno.yml | 4 +- .../src/freedreno/ci/traces-freedreno.yml | 48 +- .../src/freedreno/common/freedreno_dev_info.h | 21 + .../src/freedreno/common/freedreno_devices.py | 27 +- .../src/freedreno/computerator/a4xx.c | 348 ++ .../src/freedreno/computerator/a6xx.c | 25 + .../computerator/examples/pvtmem.asm | 14 + .../src/freedreno/computerator/ir3_asm.c | 2 - .../src/freedreno/computerator/main.c | 3 + .../src/freedreno/computerator/main.h | 1 + .../src/freedreno/computerator/meson.build | 2 + mesa 3D driver/src/freedreno/decode/buffers.c | 13 +- mesa 3D driver/src/freedreno/decode/cffdec.c | 5 +- mesa 3D driver/src/freedreno/decode/cffdec.h | 5 + mesa 3D driver/src/freedreno/decode/cffdump.c | 5 +- .../src/freedreno/decode/crashdec.c | 5 + .../src/freedreno/decode/meson.build | 49 + .../src/freedreno/decode/pgmdump2.c | 4 +- mesa 3D driver/src/freedreno/decode/redump.h | 8 +- .../decode/scripts/parse-submits.lua | 4 - mesa 3D driver/src/freedreno/decode/util.h | 23 +- 
.../src/freedreno/drm/freedreno_bo.c | 24 +- .../src/freedreno/drm/freedreno_bo_cache.c | 21 +- .../src/freedreno/drm/freedreno_device.c | 4 + .../src/freedreno/drm/freedreno_drmif.h | 10 +- .../src/freedreno/drm/freedreno_pipe.c | 3 +- .../src/freedreno/drm/freedreno_priv.h | 24 +- .../src/freedreno/drm/freedreno_ringbuffer.h | 7 + mesa 3D driver/src/freedreno/drm/msm_bo.c | 6 +- mesa 3D driver/src/freedreno/drm/msm_device.c | 21 + mesa 3D driver/src/freedreno/drm/msm_pipe.c | 3 - mesa 3D driver/src/freedreno/drm/msm_priv.h | 20 +- .../src/freedreno/drm/msm_ringbuffer.c | 2 +- .../src/freedreno/drm/msm_ringbuffer_sp.c | 27 +- mesa 3D driver/src/freedreno/ds/meson.build | 1 + .../src/freedreno/fdl/fd6_format_table.c | 467 ++ .../src/freedreno/fdl/fd6_format_table.h | 51 + mesa 3D driver/src/freedreno/fdl/fd6_layout.c | 11 +- mesa 3D driver/src/freedreno/fdl/fd6_view.c | 346 ++ .../src/freedreno/fdl/freedreno_layout.h | 66 +- mesa 3D driver/src/freedreno/fdl/meson.build | 3 + .../src/freedreno/ir3/disasm-a3xx.c | 19 +- mesa 3D driver/src/freedreno/ir3/instr-a3xx.h | 29 + mesa 3D driver/src/freedreno/ir3/ir3.c | 76 +- mesa 3D driver/src/freedreno/ir3/ir3.h | 167 +- mesa 3D driver/src/freedreno/ir3/ir3_a4xx.c | 103 +- mesa 3D driver/src/freedreno/ir3/ir3_a6xx.c | 69 +- .../src/freedreno/ir3/ir3_assembler.c | 3 + .../src/freedreno/ir3/ir3_assembler.h | 3 +- mesa 3D driver/src/freedreno/ir3/ir3_cf.c | 7 + .../src/freedreno/ir3/ir3_compiler.c | 6 + .../src/freedreno/ir3/ir3_compiler.h | 8 + .../src/freedreno/ir3/ir3_compiler_nir.c | 248 +- .../src/freedreno/ir3/ir3_context.c | 19 +- .../src/freedreno/ir3/ir3_context.h | 19 +- mesa 3D driver/src/freedreno/ir3/ir3_cp.c | 16 +- mesa 3D driver/src/freedreno/ir3/ir3_cse.c | 45 +- .../src/freedreno/ir3/ir3_legalize.c | 18 +- mesa 3D driver/src/freedreno/ir3/ir3_lexer.l | 1 + .../src/freedreno/ir3/ir3_liveness.c | 10 +- .../freedreno/ir3/ir3_lower_parallelcopy.c | 6 +- .../src/freedreno/ir3/ir3_lower_spill.c | 163 + .../src/freedreno/ir3/ir3_lower_subgroups.c | 4 +- .../src/freedreno/ir3/ir3_merge_regs.c | 33 +- mesa 3D driver/src/freedreno/ir3/ir3_nir.c | 84 +- mesa 3D driver/src/freedreno/ir3/ir3_nir.h | 42 + .../src/freedreno/ir3/ir3_nir_lower_64b.c | 284 ++ .../src/freedreno/ir3/ir3_nir_lower_tess.c | 34 +- .../freedreno/ir3/ir3_nir_lower_tg4_to_tex.c | 2 +- .../ir3/ir3_nir_lower_wide_load_store.c | 118 + .../src/freedreno/ir3/ir3_nir_trig.py | 4 +- mesa 3D driver/src/freedreno/ir3/ir3_parser.y | 21 +- .../src/freedreno/ir3/ir3_postsched.c | 8 +- mesa 3D driver/src/freedreno/ir3/ir3_print.c | 64 +- mesa 3D driver/src/freedreno/ir3/ir3_ra.c | 495 +- mesa 3D driver/src/freedreno/ir3/ir3_ra.h | 42 +- .../freedreno/ir3/ir3_remove_unreachable.c | 122 + mesa 3D driver/src/freedreno/ir3/ir3_sched.c | 8 +- mesa 3D driver/src/freedreno/ir3/ir3_shader.c | 3 +- mesa 3D driver/src/freedreno/ir3/ir3_shader.h | 21 +- mesa 3D driver/src/freedreno/ir3/ir3_spill.c | 1793 ++++++- .../src/freedreno/ir3/ir3_validate.c | 57 +- mesa 3D driver/src/freedreno/ir3/meson.build | 5 +- .../src/freedreno/ir3/tests/disasm.c | 278 +- mesa 3D driver/src/freedreno/isa/encode.c | 26 +- mesa 3D driver/src/freedreno/isa/ir3-cat1.xml | 2 +- mesa 3D driver/src/freedreno/isa/ir3-cat3.xml | 2 +- mesa 3D driver/src/freedreno/isa/ir3-cat5.xml | 17 +- mesa 3D driver/src/freedreno/isa/ir3-cat6.xml | 138 +- .../src/freedreno/isa/ir3-common.xml | 12 +- mesa 3D driver/src/freedreno/isa/ir3-disasm.c | 4 +- mesa 3D driver/src/freedreno/isa/isa.h | 2 +- mesa 3D driver/src/freedreno/isa/meson.build 
| 21 +- mesa 3D driver/src/freedreno/meson.build | 14 +- .../src/freedreno/registers/adreno/a4xx.xml | 19 +- .../src/freedreno/registers/adreno/a5xx.xml | 24 +- .../src/freedreno/registers/adreno/a6xx.xml | 103 +- .../registers/adreno/adreno_common.xml | 9 + mesa 3D driver/src/freedreno/rnn/meson.build | 10 + mesa 3D driver/src/freedreno/rnn/rnndec.c | 7 +- .../src/freedreno/vulkan/meson.build | 37 +- .../src/freedreno/vulkan/tu_android.c | 10 +- .../src/freedreno/vulkan/tu_clear_blit.c | 625 +-- .../src/freedreno/vulkan/tu_cmd_buffer.c | 384 +- mesa 3D driver/src/freedreno/vulkan/tu_cs.c | 6 +- mesa 3D driver/src/freedreno/vulkan/tu_cs.h | 5 +- .../src/freedreno/vulkan/tu_descriptor_set.c | 29 +- .../src/freedreno/vulkan/tu_device.c | 909 ++-- mesa 3D driver/src/freedreno/vulkan/tu_drm.c | 197 +- .../src/freedreno/vulkan/tu_formats.c | 480 +- .../src/freedreno/vulkan/tu_image.c | 366 +- mesa 3D driver/src/freedreno/vulkan/tu_kgsl.c | 26 +- mesa 3D driver/src/freedreno/vulkan/tu_pass.c | 35 +- .../src/freedreno/vulkan/tu_perfetto.cc | 296 ++ .../src/freedreno/vulkan/tu_perfetto.h | 116 + .../src/freedreno/vulkan/tu_perfetto_util.c | 48 + .../src/freedreno/vulkan/tu_pipeline.c | 166 +- .../src/freedreno/vulkan/tu_pipeline_cache.c | 12 +- .../src/freedreno/vulkan/tu_private.h | 368 +- .../src/freedreno/vulkan/tu_query.c | 4 +- .../src/freedreno/vulkan/tu_shader.c | 9 +- .../src/freedreno/vulkan/tu_tracepoints.py | 158 + mesa 3D driver/src/freedreno/vulkan/tu_util.c | 14 +- mesa 3D driver/src/freedreno/vulkan/tu_util.h | 14 + mesa 3D driver/src/freedreno/vulkan/tu_wsi.c | 205 +- .../src/freedreno/vulkan/tu_wsi_display.c | 239 - .../gallium/auxiliary/cso_cache/cso_context.c | 63 +- .../gallium/auxiliary/cso_cache/cso_context.h | 11 +- .../src/gallium/auxiliary/draw/draw_context.h | 3 +- .../gallium/auxiliary/draw/draw_pipe_clip.c | 19 +- .../gallium/auxiliary/draw/draw_pipe_offset.c | 2 +- .../auxiliary/draw/draw_pipe_pstipple.c | 9 +- .../src/gallium/auxiliary/draw/draw_pt.c | 5 +- .../src/gallium/auxiliary/draw/draw_tess.c | 2 +- .../auxiliary/driver_ddebug/dd_context.c | 13 +- .../auxiliary/driver_ddebug/dd_screen.c | 4 +- .../gallium/auxiliary/driver_noop/noop_pipe.c | 43 +- .../auxiliary/driver_noop/noop_state.c | 17 + .../auxiliary/driver_rbug/rbug_context.c | 3 +- .../auxiliary/driver_rbug/rbug_screen.c | 4 +- .../auxiliary/driver_trace/tr_context.c | 63 +- .../auxiliary/driver_trace/tr_context.h | 3 +- .../auxiliary/driver_trace/tr_dump_state.c | 17 +- .../auxiliary/driver_trace/tr_dump_state.h | 2 + .../auxiliary/driver_trace/tr_screen.c | 64 +- .../auxiliary/driver_trace/tr_texture.h | 1 + .../gallium/auxiliary/gallivm/lp_bld_arit.c | 67 +- .../gallium/auxiliary/gallivm/lp_bld_const.c | 6 +- .../gallium/auxiliary/gallivm/lp_bld_conv.c | 3 + .../gallium/auxiliary/gallivm/lp_bld_coro.c | 37 +- .../gallium/auxiliary/gallivm/lp_bld_coro.h | 4 + .../auxiliary/gallivm/lp_bld_format_soa.c | 18 +- .../gallium/auxiliary/gallivm/lp_bld_intr.c | 4 + .../gallium/auxiliary/gallivm/lp_bld_limits.h | 13 +- .../gallium/auxiliary/gallivm/lp_bld_nir.c | 71 +- .../gallium/auxiliary/gallivm/lp_bld_nir.h | 3 + .../auxiliary/gallivm/lp_bld_nir_soa.c | 334 +- .../gallium/auxiliary/gallivm/lp_bld_tgsi.h | 2 +- .../gallium/auxiliary/gallivm/lp_bld_type.c | 10 +- .../src/gallium/auxiliary/hud/hud_context.c | 9 +- .../src/gallium/auxiliary/indices/u_indices.c | 3 + .../gallium/auxiliary/indices/u_primconvert.c | 4 +- .../src/gallium/auxiliary/meson.build | 20 +- .../src/gallium/auxiliary/nir/nir_to_tgsi.c | 
466 +- .../gallium/auxiliary/nir/nir_to_tgsi_info.c | 10 +- .../src/gallium/auxiliary/nir/tgsi_to_nir.c | 12 +- .../auxiliary/pipe-loader/driinfo_gallium.h | 3 +- .../gallium/auxiliary/pipebuffer/pb_cache.c | 20 +- .../gallium/auxiliary/pipebuffer/pb_cache.h | 3 +- .../gallium/auxiliary/pipebuffer/pb_slab.c | 44 +- .../gallium/auxiliary/pipebuffer/pb_slab.h | 3 +- .../gallium/auxiliary/postprocess/pp_colors.c | 2 +- .../gallium/auxiliary/postprocess/pp_mlaa.c | 6 +- .../gallium/auxiliary/postprocess/pp_run.c | 13 +- .../src/gallium/auxiliary/tgsi/tgsi_build.c | 4 - .../src/gallium/auxiliary/tgsi/tgsi_dump.c | 16 - .../gallium/auxiliary/tgsi/tgsi_lowering.c | 1 - .../src/gallium/auxiliary/tgsi/tgsi_scan.c | 1 - .../src/gallium/auxiliary/tgsi/tgsi_scan.h | 1 - .../src/gallium/auxiliary/tgsi/tgsi_ureg.c | 45 +- .../src/gallium/auxiliary/tgsi/tgsi_ureg.h | 25 +- .../gallium/auxiliary/util/u_async_debug.h | 8 + .../src/gallium/auxiliary/util/u_blitter.c | 31 +- .../src/gallium/auxiliary/util/u_blitter.h | 2 +- .../src/gallium/auxiliary/util/u_compute.c | 4 +- .../src/gallium/auxiliary/util/u_driconf.c | 2 +- .../src/gallium/auxiliary/util/u_dump_state.c | 2 - .../src/gallium/auxiliary/util/u_helpers.c | 22 + .../src/gallium/auxiliary/util/u_helpers.h | 9 + .../src/gallium/auxiliary/util/u_inlines.h | 24 +- .../auxiliary/util/u_live_shader_cache.h | 8 + .../src/gallium/auxiliary/util/u_prim.c | 13 + .../src/gallium/auxiliary/util/u_screen.c | 3 + .../gallium/auxiliary/util/u_simple_shaders.c | 6 +- .../src/gallium/auxiliary/util/u_tests.c | 8 +- .../auxiliary/util/u_threaded_context.c | 400 +- .../auxiliary/util/u_threaded_context.h | 62 +- .../auxiliary/util/u_threaded_context_calls.h | 3 + .../gallium/auxiliary/util/u_trace_gallium.c | 96 + .../gallium/auxiliary/util/u_trace_gallium.h | 57 + .../gallium/auxiliary/util/u_tracepoints.py | 42 +- .../auxiliary/util/u_vertex_state_cache.c | 134 + .../auxiliary/util/u_vertex_state_cache.h | 67 + .../gallium/auxiliary/vl/vl_bicubic_filter.c | 2 +- .../gallium/auxiliary/vl/vl_compositor_cs.c | 4 +- .../gallium/auxiliary/vl/vl_compositor_gfx.c | 2 +- .../gallium/auxiliary/vl/vl_deint_filter.c | 3 +- .../src/gallium/auxiliary/vl/vl_idct.c | 4 +- .../gallium/auxiliary/vl/vl_matrix_filter.c | 2 +- .../src/gallium/auxiliary/vl/vl_mc.c | 2 +- .../gallium/auxiliary/vl/vl_median_filter.c | 2 +- .../gallium/auxiliary/vl/vl_mpeg12_decoder.c | 2 +- .../src/gallium/auxiliary/vl/vl_zscan.c | 2 +- .../src/gallium/drivers/asahi/agx_state.c | 11 +- .../src/gallium/drivers/crocus/crocus_batch.c | 19 +- .../src/gallium/drivers/crocus/crocus_blit.c | 29 - .../src/gallium/drivers/crocus/crocus_blorp.c | 10 + .../src/gallium/drivers/crocus/crocus_blt.c | 18 +- .../gallium/drivers/crocus/crocus_bufmgr.h | 1 - .../src/gallium/drivers/crocus/crocus_clear.c | 7 +- .../gallium/drivers/crocus/crocus_context.c | 2 - .../gallium/drivers/crocus/crocus_context.h | 9 +- .../drivers/crocus/crocus_disk_cache.c | 2 +- .../src/gallium/drivers/crocus/crocus_draw.c | 22 +- .../gallium/drivers/crocus/crocus_program.c | 35 +- .../drivers/crocus/crocus_program_cache.c | 3 +- .../src/gallium/drivers/crocus/crocus_query.c | 8 +- .../gallium/drivers/crocus/crocus_resource.c | 8 +- .../gallium/drivers/crocus/crocus_screen.c | 9 +- .../gallium/drivers/crocus/crocus_screen.h | 4 - .../src/gallium/drivers/crocus/crocus_state.c | 128 +- .../src/gallium/drivers/d3d12/d3d12_blit.cpp | 4 +- .../gallium/drivers/d3d12/d3d12_compiler.cpp | 4 +- .../gallium/drivers/d3d12/d3d12_context.cpp | 8 +- 
.../src/gallium/drivers/d3d12/d3d12_draw.cpp | 12 +- .../d3d12/d3d12_lower_int_cubemap_to_array.c | 2 +- .../gallium/drivers/d3d12/d3d12_nir_passes.c | 48 - .../gallium/drivers/d3d12/d3d12_nir_passes.h | 3 - .../gallium/drivers/d3d12/d3d12_surface.cpp | 2 +- .../drivers/etnaviv/etnaviv_compiler_nir.c | 11 +- .../gallium/drivers/etnaviv/etnaviv_context.c | 31 - .../gallium/drivers/etnaviv/etnaviv_context.h | 4 - .../gallium/drivers/etnaviv/etnaviv_screen.c | 35 +- .../gallium/drivers/etnaviv/etnaviv_texture.c | 20 +- .../drivers/etnaviv/etnaviv_texture_state.c | 2 +- .../src/gallium/drivers/etnaviv/meson.build | 1 + .../drivers/freedreno/a2xx/fd2_context.c | 26 +- .../gallium/drivers/freedreno/a2xx/fd2_draw.c | 4 +- .../drivers/freedreno/a2xx/fd2_screen.c | 27 + .../drivers/freedreno/a2xx/fd2_texture.c | 3 +- .../gallium/drivers/freedreno/a2xx/ir2_nir.c | 1 + .../drivers/freedreno/a3xx/fd3_context.c | 16 +- .../gallium/drivers/freedreno/a3xx/fd3_draw.c | 4 +- .../drivers/freedreno/a3xx/fd3_screen.c | 15 + .../drivers/freedreno/a4xx/fd4_context.c | 16 +- .../gallium/drivers/freedreno/a4xx/fd4_draw.c | 4 +- .../drivers/freedreno/a4xx/fd4_screen.c | 15 + .../drivers/freedreno/a4xx/fd4_texture.c | 3 +- .../drivers/freedreno/a5xx/fd5_context.c | 16 +- .../gallium/drivers/freedreno/a5xx/fd5_draw.c | 7 +- .../gallium/drivers/freedreno/a5xx/fd5_emit.c | 3 +- .../drivers/freedreno/a5xx/fd5_program.c | 31 +- .../drivers/freedreno/a5xx/fd5_screen.c | 15 + .../drivers/freedreno/a5xx/fd5_texture.c | 3 +- .../drivers/freedreno/a6xx/fd6_blitter.c | 58 +- .../drivers/freedreno/a6xx/fd6_compute.c | 24 +- .../drivers/freedreno/a6xx/fd6_const.c | 4 +- .../drivers/freedreno/a6xx/fd6_context.c | 28 +- .../drivers/freedreno/a6xx/fd6_context.h | 3 + .../gallium/drivers/freedreno/a6xx/fd6_draw.c | 15 +- .../gallium/drivers/freedreno/a6xx/fd6_emit.c | 45 +- .../gallium/drivers/freedreno/a6xx/fd6_emit.h | 1 + .../drivers/freedreno/a6xx/fd6_format.c | 371 +- .../drivers/freedreno/a6xx/fd6_format.h | 14 +- .../gallium/drivers/freedreno/a6xx/fd6_gmem.c | 52 +- .../drivers/freedreno/a6xx/fd6_image.c | 10 +- .../drivers/freedreno/a6xx/fd6_program.c | 131 +- .../drivers/freedreno/a6xx/fd6_rasterizer.c | 11 +- .../drivers/freedreno/a6xx/fd6_resource.c | 3 +- .../drivers/freedreno/a6xx/fd6_screen.c | 37 +- .../drivers/freedreno/a6xx/fd6_texture.c | 3 +- .../ci/piglit-freedreno-a530-fails.txt | 6 +- .../ci/piglit-freedreno-a630-fails.txt | 53 +- .../ci/piglit-freedreno-a630-flakes.txt | 18 +- .../drivers/freedreno/freedreno_autotune.c | 2 + .../drivers/freedreno/freedreno_batch.c | 4 +- .../drivers/freedreno/freedreno_batch.h | 4 +- .../drivers/freedreno/freedreno_batch_cache.c | 2 +- .../drivers/freedreno/freedreno_blitter.c | 2 +- .../drivers/freedreno/freedreno_context.c | 61 +- .../drivers/freedreno/freedreno_context.h | 31 +- .../drivers/freedreno/freedreno_draw.c | 14 +- .../drivers/freedreno/freedreno_gmem.c | 36 +- .../drivers/freedreno/freedreno_perfetto.cc | 14 + .../drivers/freedreno/freedreno_program.c | 15 + .../drivers/freedreno/freedreno_query.h | 7 + .../drivers/freedreno/freedreno_query_acc.c | 4 +- .../drivers/freedreno/freedreno_query_hw.c | 4 +- .../drivers/freedreno/freedreno_resource.c | 153 +- .../drivers/freedreno/freedreno_resource.h | 1 + .../drivers/freedreno/freedreno_screen.c | 19 +- .../drivers/freedreno/freedreno_screen.h | 11 + .../drivers/freedreno/freedreno_state.c | 35 +- .../drivers/freedreno/freedreno_texture.c | 16 +- .../drivers/freedreno/freedreno_texture.h | 1 + 
.../freedreno/freedreno_tracepoints.py | 53 +- .../drivers/freedreno/freedreno_util.h | 14 +- .../drivers/freedreno/ir3/ir3_cmdline.c | 6 + .../gallium/drivers/freedreno/ir3/ir3_const.h | 23 +- .../drivers/freedreno/ir3/ir3_gallium.c | 15 +- .../src/gallium/drivers/freedreno/meson.build | 3 +- .../drivers/i915/ci/deqp-i915-g33-fails.txt | 550 +-- .../gallium/drivers/i915/ci/deqp-i915g.toml | 2 + .../src/gallium/drivers/i915/ci/gitlab-ci.yml | 2 - .../drivers/i915/ci/piglit-i915-g33-fails.txt | 255 +- .../gallium/drivers/i915/ci/traces-i915.yml | 12 +- .../src/gallium/drivers/i915/i915_context.c | 7 +- .../src/gallium/drivers/i915/i915_context.h | 10 +- .../src/gallium/drivers/i915/i915_debug.c | 1 + .../src/gallium/drivers/i915/i915_debug.h | 1 + .../src/gallium/drivers/i915/i915_fpc.h | 4 - .../gallium/drivers/i915/i915_fpc_translate.c | 30 +- .../src/gallium/drivers/i915/i915_prim_vbuf.c | 61 +- .../drivers/i915/i915_resource_texture.c | 9 +- .../src/gallium/drivers/i915/i915_screen.c | 102 +- .../src/gallium/drivers/i915/i915_state.c | 23 +- .../gallium/drivers/i915/i915_state_derived.c | 71 +- .../src/gallium/drivers/i915/i915_surface.c | 2 +- .../drivers/iris/ci/deqp-iris-amly-fails.txt | 8 +- .../drivers/iris/ci/deqp-iris-amly.toml | 96 + .../drivers/iris/ci/deqp-iris-apl-fails.txt | 8 +- .../drivers/iris/ci/deqp-iris-apl.toml | 98 + .../drivers/iris/ci/deqp-iris-cml-fails.txt | 46 + .../drivers/iris/ci/deqp-iris-cml.toml | 99 + .../drivers/iris/ci/deqp-iris-glk-fails.txt | 21 + .../drivers/iris/ci/deqp-iris-glk-flakes.txt | 2 + .../drivers/iris/ci/deqp-iris-glk.toml | 99 + .../drivers/iris/ci/deqp-iris-kbl-fails.txt | 46 + .../drivers/iris/ci/deqp-iris-kbl.toml | 96 + .../drivers/iris/ci/deqp-iris-skips.txt | 14 + .../drivers/iris/ci/deqp-iris-whl-fails.txt | 57 + .../drivers/iris/ci/deqp-iris-whl.toml | 99 + .../src/gallium/drivers/iris/ci/gitlab-ci.yml | 207 +- .../iris/ci/piglit-iris-amly-fails.txt | 40 +- .../gallium/drivers/iris/ci/traces-iris.yml | 360 +- .../src/gallium/drivers/iris/iris_batch.c | 318 +- .../src/gallium/drivers/iris/iris_batch.h | 10 +- .../src/gallium/drivers/iris/iris_blit.c | 51 +- .../src/gallium/drivers/iris/iris_blorp.c | 28 +- .../src/gallium/drivers/iris/iris_bufmgr.c | 709 ++- .../src/gallium/drivers/iris/iris_bufmgr.h | 177 +- .../src/gallium/drivers/iris/iris_clear.c | 33 +- .../src/gallium/drivers/iris/iris_context.c | 6 +- .../src/gallium/drivers/iris/iris_context.h | 22 +- .../gallium/drivers/iris/iris_disk_cache.c | 2 +- .../src/gallium/drivers/iris/iris_draw.c | 39 +- .../src/gallium/drivers/iris/iris_fence.c | 45 +- .../src/gallium/drivers/iris/iris_fence.h | 13 +- .../gallium/drivers/iris/iris_fine_fence.c | 4 +- .../gallium/drivers/iris/iris_genx_protos.h | 3 + .../gallium/drivers/iris/iris_pipe_control.c | 9 +- .../src/gallium/drivers/iris/iris_program.c | 201 +- .../src/gallium/drivers/iris/iris_query.c | 6 +- .../src/gallium/drivers/iris/iris_resolve.c | 61 +- .../src/gallium/drivers/iris/iris_resource.c | 363 +- .../src/gallium/drivers/iris/iris_resource.h | 14 - .../src/gallium/drivers/iris/iris_screen.c | 53 +- .../src/gallium/drivers/iris/iris_screen.h | 7 +- .../src/gallium/drivers/iris/iris_state.c | 409 +- .../drivers/lima/ci/deqp-lima-fails.txt | 11 - .../src/gallium/drivers/lima/ci/gitlab-ci.yml | 2 +- .../gallium/drivers/lima/drm-shim/lima_noop.c | 120 + .../gallium/drivers/lima/drm-shim/meson.build | 29 + .../src/gallium/drivers/lima/ir/gp/codegen.c | 2 +- .../src/gallium/drivers/lima/ir/gp/codegen.h | 2 +- 
.../src/gallium/drivers/lima/ir/gp/disasm.c | 332 +- .../src/gallium/drivers/lima/ir/gp/nir.c | 4 +- .../src/gallium/drivers/lima/ir/gp/regalloc.c | 337 +- .../gallium/drivers/lima/ir/gp/scheduler.c | 49 - .../src/gallium/drivers/lima/ir/lima_ir.h | 1 + .../lima/ir/lima_nir_split_load_input.c | 130 +- .../drivers/lima/ir/lima_nir_split_loads.c | 149 + .../src/gallium/drivers/lima/ir/pp/codegen.c | 2 +- .../src/gallium/drivers/lima/ir/pp/codegen.h | 2 +- .../src/gallium/drivers/lima/ir/pp/disasm.c | 359 +- .../src/gallium/drivers/lima/lima_draw.c | 24 +- .../src/gallium/drivers/lima/lima_job.c | 4 +- .../src/gallium/drivers/lima/lima_parser.c | 44 +- .../src/gallium/drivers/lima/lima_parser.h | 1 + .../src/gallium/drivers/lima/lima_program.c | 1 + .../src/gallium/drivers/lima/lima_screen.c | 2 + .../src/gallium/drivers/lima/lima_state.c | 18 +- .../src/gallium/drivers/lima/lima_util.c | 7 + .../src/gallium/drivers/lima/lima_util.h | 1 + .../src/gallium/drivers/lima/meson.build | 5 + .../drivers/lima/standalone/lima_disasm.c | 4 +- .../llvmpipe/ci/deqp-llvmpipe-asan.toml | 14 + .../llvmpipe/ci/deqp-llvmpipe-fails.txt | 20 - .../drivers/llvmpipe/ci/deqp-llvmpipe.toml | 2 + .../gallium/drivers/llvmpipe/ci/gitlab-ci.yml | 12 +- .../drivers/llvmpipe/ci/llvmpipe-cl.txt | 21 +- .../drivers/llvmpipe/ci/llvmpipe-quick_gl.txt | 3 - .../llvmpipe/ci/llvmpipe-quick_shader.txt | 1 + .../drivers/llvmpipe/ci/traces-llvmpipe.yml | 84 +- .../gallium/drivers/llvmpipe/lp_bld_interp.c | 32 +- .../gallium/drivers/llvmpipe/lp_bld_interp.h | 3 +- .../src/gallium/drivers/llvmpipe/lp_context.h | 2 + .../gallium/drivers/llvmpipe/lp_cs_tpool.c | 25 +- .../gallium/drivers/llvmpipe/lp_cs_tpool.h | 2 + .../gallium/drivers/llvmpipe/lp_draw_arrays.c | 3 +- .../src/gallium/drivers/llvmpipe/lp_linear.c | 5 +- .../drivers/llvmpipe/lp_linear_fastpath.c | 10 +- .../src/gallium/drivers/llvmpipe/lp_rast.c | 7 +- .../src/gallium/drivers/llvmpipe/lp_screen.c | 14 +- .../src/gallium/drivers/llvmpipe/lp_setup.c | 67 +- .../drivers/llvmpipe/lp_setup_context.h | 4 + .../gallium/drivers/llvmpipe/lp_setup_line.c | 62 +- .../gallium/drivers/llvmpipe/lp_setup_rect.c | 6 +- .../gallium/drivers/llvmpipe/lp_setup_tri.c | 54 +- .../gallium/drivers/llvmpipe/lp_state_cs.c | 52 +- .../gallium/drivers/llvmpipe/lp_state_cs.h | 15 +- .../gallium/drivers/llvmpipe/lp_state_fs.c | 157 +- .../gallium/drivers/llvmpipe/lp_state_fs.h | 23 +- .../drivers/llvmpipe/lp_state_fs_linear.c | 10 +- .../drivers/llvmpipe/lp_state_sampler.c | 12 +- .../gallium/drivers/llvmpipe/lp_state_setup.c | 165 +- .../gallium/drivers/llvmpipe/lp_state_tess.c | 9 + .../src/gallium/drivers/llvmpipe/lp_texture.c | 156 +- .../src/gallium/drivers/llvmpipe/lp_texture.h | 16 +- .../nouveau/codegen/nv50_ir_build_util.cpp | 2 +- .../drivers/nouveau/codegen/nv50_ir_driver.h | 4 +- .../nouveau/codegen/nv50_ir_emit_gm107.cpp | 24 +- .../nouveau/codegen/nv50_ir_emit_nv50.cpp | 2 +- .../nouveau/codegen/nv50_ir_emit_nvc0.cpp | 4 +- .../nouveau/codegen/nv50_ir_from_nir.cpp | 5 +- .../nouveau/codegen/nv50_ir_from_tgsi.cpp | 2 +- .../nouveau/codegen/nv50_ir_peephole.cpp | 3 +- .../drivers/nouveau/codegen/nv50_ir_ra.cpp | 2 +- .../drivers/nouveau/nv30/nv30_context.h | 4 +- .../gallium/drivers/nouveau/nv30/nv30_draw.c | 2 +- .../drivers/nouveau/nv30/nv30_fragtex.c | 13 +- .../drivers/nouveau/nv30/nv40_verttex.c | 8 +- .../gallium/drivers/nouveau/nv50/nv50_state.c | 12 +- .../drivers/nouveau/nvc0/nvc0_context.h | 1 + .../gallium/drivers/nouveau/nvc0/nvc0_state.c | 26 +- 
.../gallium/drivers/nouveau/nvc0/nvc0_vbo.c | 4 +- .../src/gallium/drivers/panfrost/meson.build | 6 +- .../gallium/drivers/panfrost/pan_assemble.c | 4 +- .../gallium/drivers/panfrost/pan_cmdstream.c | 431 +- .../gallium/drivers/panfrost/pan_compute.c | 4 +- .../gallium/drivers/panfrost/pan_context.c | 80 +- .../gallium/drivers/panfrost/pan_context.h | 62 +- .../src/gallium/drivers/panfrost/pan_job.c | 258 +- .../src/gallium/drivers/panfrost/pan_job.h | 17 +- .../gallium/drivers/panfrost/pan_resource.c | 120 +- .../gallium/drivers/panfrost/pan_resource.h | 25 +- .../src/gallium/drivers/panfrost/pan_screen.c | 59 +- .../src/gallium/drivers/panfrost/pan_screen.h | 21 +- .../r300/compiler/r300_fragprog_swizzle.c | 2 +- .../r300/compiler/r300_fragprog_swizzle.h | 2 +- .../drivers/r300/compiler/r3xx_vertprog.c | 12 +- .../r300/compiler/r3xx_vertprog_dump.c | 10 +- .../drivers/r300/compiler/r500_fragprog.c | 2 +- .../drivers/r300/compiler/r500_fragprog.h | 2 +- .../drivers/r300/compiler/radeon_compiler.c | 18 +- .../drivers/r300/compiler/radeon_compiler.h | 2 +- .../drivers/r300/compiler/radeon_opcodes.c | 2 +- .../drivers/r300/compiler/radeon_opcodes.h | 2 +- .../r300/compiler/radeon_program_alu.c | 2 +- .../r300/compiler/radeon_program_tex.c | 2 +- .../drivers/r300/compiler/radeon_swizzle.h | 2 +- .../src/gallium/drivers/r300/r300_blit.c | 4 +- .../src/gallium/drivers/r300/r300_context.c | 4 +- .../src/gallium/drivers/r300/r300_emit.c | 2 +- .../src/gallium/drivers/r300/r300_reg.h | 18 +- .../src/gallium/drivers/r300/r300_render.c | 4 +- .../src/gallium/drivers/r300/r300_state.c | 28 +- .../gallium/drivers/r300/r300_state_derived.c | 10 +- .../src/gallium/drivers/r300/r300_texture.c | 2 +- .../gallium/drivers/r300/r300_tgsi_to_rc.c | 27 +- .../src/gallium/drivers/r300/r300_vs.c | 2 +- .../gallium/drivers/r600/evergreen_state.c | 14 +- .../src/gallium/drivers/r600/r600_blit.c | 2 +- .../gallium/drivers/r600/r600_buffer_common.c | 2 +- .../src/gallium/drivers/r600/r600_dump.c | 1 - .../gallium/drivers/r600/r600_hw_context.c | 2 +- .../src/gallium/drivers/r600/r600_pipe.h | 1 + .../gallium/drivers/r600/r600_pipe_common.c | 2 +- .../gallium/drivers/r600/r600_state_common.c | 12 +- .../src/gallium/drivers/r600/r600_texture.c | 4 +- .../r600/sfn/sfn_nir_vectorize_vs_inputs.c | 2 +- .../gallium/drivers/radeon/radeon_vcn_dec.c | 26 +- .../gallium/drivers/radeon/radeon_vcn_dec.h | 3 + .../gallium/drivers/radeon/radeon_vcn_enc.c | 2 +- .../gallium/drivers/radeon/radeon_winsys.h | 29 +- .../gallium/drivers/radeonsi/ci/gitlab-ci.yml | 6 +- .../radeonsi/ci/navi10-piglit-quick-fail.csv | 4 - .../ci/piglit-radeonsi-stoney-fails.txt | 3 +- .../drivers/radeonsi/ci/radeonsi-run-tests.py | 98 +- .../drivers/radeonsi/ci/raven-deqp-fail.csv | 0 .../drivers/radeonsi/ci/raven-glcts-fail.csv | 28 + .../radeonsi/ci/raven-piglit-quick-fail.csv | 271 ++ .../ci/sienna_cichlid-piglit-quick-fail.csv | 4 - .../drivers/radeonsi/ci/traces-radeonsi.yml | 16 +- .../drivers/radeonsi/driinfo_radeonsi.h | 1 + .../drivers/radeonsi/gfx10_shader_ngg.c | 230 +- .../src/gallium/drivers/radeonsi/meson.build | 4 +- .../src/gallium/drivers/radeonsi/si_blit.c | 44 +- .../src/gallium/drivers/radeonsi/si_buffer.c | 61 +- .../gallium/drivers/radeonsi/si_build_pm4.h | 186 +- .../src/gallium/drivers/radeonsi/si_clear.c | 3 +- .../src/gallium/drivers/radeonsi/si_compute.c | 147 +- .../drivers/radeonsi/si_compute_blit.c | 105 +- .../src/gallium/drivers/radeonsi/si_cp_dma.c | 80 +- .../drivers/radeonsi/si_cp_reg_shadowing.c | 10 +- 
.../src/gallium/drivers/radeonsi/si_debug.c | 37 +- .../drivers/radeonsi/si_debug_options.h | 3 +- .../gallium/drivers/radeonsi/si_descriptors.c | 225 +- .../src/gallium/drivers/radeonsi/si_fence.c | 64 +- .../src/gallium/drivers/radeonsi/si_get.c | 21 +- .../src/gallium/drivers/radeonsi/si_gfx_cs.c | 289 +- .../gallium/drivers/radeonsi/si_perfcounter.c | 65 +- .../src/gallium/drivers/radeonsi/si_pipe.c | 111 +- .../src/gallium/drivers/radeonsi/si_pipe.h | 321 +- .../src/gallium/drivers/radeonsi/si_pm4.c | 8 +- .../src/gallium/drivers/radeonsi/si_pm4.h | 2 +- .../src/gallium/drivers/radeonsi/si_query.c | 81 +- .../src/gallium/drivers/radeonsi/si_query.h | 3 - .../drivers/radeonsi/si_sdma_copy_image.c | 481 ++ .../src/gallium/drivers/radeonsi/si_shader.c | 428 +- .../src/gallium/drivers/radeonsi/si_shader.h | 209 +- .../drivers/radeonsi/si_shader_internal.h | 28 +- .../gallium/drivers/radeonsi/si_shader_llvm.c | 164 +- .../drivers/radeonsi/si_shader_llvm_gs.c | 160 +- .../drivers/radeonsi/si_shader_llvm_ps.c | 52 +- .../radeonsi/si_shader_llvm_resources.c | 38 +- .../drivers/radeonsi/si_shader_llvm_tess.c | 56 +- .../drivers/radeonsi/si_shader_llvm_vs.c | 196 +- .../gallium/drivers/radeonsi/si_shader_nir.c | 62 +- .../src/gallium/drivers/radeonsi/si_sqtt.c | 172 +- .../src/gallium/drivers/radeonsi/si_state.c | 558 ++- .../src/gallium/drivers/radeonsi/si_state.h | 72 +- .../drivers/radeonsi/si_state_draw.cpp | 1428 +++--- .../gallium/drivers/radeonsi/si_state_msaa.c | 34 +- .../drivers/radeonsi/si_state_shaders.cpp | 4109 +++++++++++++++++ .../drivers/radeonsi/si_state_streamout.c | 106 +- .../drivers/radeonsi/si_state_viewport.c | 73 +- .../src/gallium/drivers/radeonsi/si_texture.c | 50 +- .../softpipe/ci/deqp-softpipe-fails.txt | 4 - .../drivers/softpipe/ci/deqp-softpipe.toml | 2 + .../gallium/drivers/softpipe/ci/gitlab-ci.yml | 3 +- .../drivers/softpipe/ci/softpipe-quick.txt | 2 + .../gallium/drivers/softpipe/sp_draw_arrays.c | 2 +- .../src/gallium/drivers/softpipe/sp_setup.c | 100 +- .../src/gallium/drivers/softpipe/sp_state.h | 1 + .../drivers/softpipe/sp_state_derived.c | 2 +- .../drivers/softpipe/sp_state_sampler.c | 9 +- .../src/gallium/drivers/svga/svga_context.h | 1 + .../gallium/drivers/svga/svga_draw_elements.c | 4 +- .../src/gallium/drivers/svga/svga_mksstats.h | 6 +- .../src/gallium/drivers/svga/svga_pipe_draw.c | 10 +- .../gallium/drivers/svga/svga_pipe_sampler.c | 17 +- .../src/gallium/drivers/svga/svga_pipe_ts.c | 10 + .../src/gallium/drivers/svga/svga_surface.c | 29 +- .../gallium/drivers/svga/svga_swtnl_draw.c | 3 +- .../src/gallium/drivers/svga/svga_winsys.h | 9 +- .../src/gallium/drivers/swr/swr_context.h | 1 + .../src/gallium/drivers/swr/swr_draw.cpp | 6 +- .../src/gallium/drivers/swr/swr_state.cpp | 23 +- .../src/gallium/drivers/tegra/tegra_context.c | 3 +- .../src/gallium/drivers/v3d/v3d_blit.c | 4 +- .../src/gallium/drivers/v3d/v3d_context.c | 9 - .../src/gallium/drivers/v3d/v3d_context.h | 2 - .../src/gallium/drivers/v3d/v3d_job.c | 10 +- .../src/gallium/drivers/v3d/v3d_program.c | 13 +- .../src/gallium/drivers/v3d/v3d_resource.c | 2 +- .../src/gallium/drivers/v3d/v3d_screen.c | 23 +- .../src/gallium/drivers/v3d/v3d_screen.h | 1 + .../src/gallium/drivers/v3d/v3dx_draw.c | 13 +- .../gallium/drivers/v3d/v3dx_format_table.c | 1 + .../src/gallium/drivers/v3d/v3dx_state.c | 10 +- .../src/gallium/drivers/vc4/vc4_blit.c | 2 +- .../src/gallium/drivers/vc4/vc4_cl_dump.c | 2 +- .../src/gallium/drivers/vc4/vc4_context.c | 9 - .../src/gallium/drivers/vc4/vc4_context.h | 2 - 
.../src/gallium/drivers/vc4/vc4_draw.c | 18 - .../src/gallium/drivers/vc4/vc4_screen.c | 13 + .../src/gallium/drivers/vc4/vc4_screen.h | 1 + .../src/gallium/drivers/vc4/vc4_state.c | 8 +- .../drivers/virgl/ci/deqp-virgl-gl-fails.txt | 22 - .../drivers/virgl/ci/deqp-virgl-gl.toml | 2 + .../virgl/ci/deqp-virgl-gles-fails.txt | 18 - .../drivers/virgl/ci/deqp-virgl-gles.toml | 2 + .../gallium/drivers/virgl/ci/gitlab-ci.yml | 6 +- .../gallium/drivers/virgl/ci/traces-virgl.yml | 113 +- .../src/gallium/drivers/virgl/meson.build | 2 +- .../gallium/drivers/virgl/tests/meson.build | 1 + .../src/gallium/drivers/virgl/virgl_context.c | 54 +- .../src/gallium/drivers/virgl/virgl_context.h | 3 + .../src/gallium/drivers/virgl/virgl_encode.c | 2 +- .../src/gallium/drivers/virgl/virgl_screen.c | 31 +- .../src/gallium/drivers/virgl/virgl_screen.h | 3 + .../drivers/zink/ci/deqp-zink-lvp-fails.txt | 38 - .../drivers/zink/ci/deqp-zink-lvp.toml | 2 + .../src/gallium/drivers/zink/ci/gitlab-ci.yml | 8 +- .../drivers/zink/ci/piglit-zink-lvp-fails.txt | 12 +- .../zink/ci/piglit-zink-lvp-flakes.txt | 4 +- .../drivers/zink/ci/piglit-zink-lvp-skips.txt | 4 + .../src/gallium/drivers/zink/meson.build | 7 +- .../zink/nir_lower_dynamic_bo_access.c | 2 +- .../drivers/zink/nir_to_spirv/nir_to_spirv.c | 650 ++- .../drivers/zink/nir_to_spirv/spirv_builder.c | 9 +- .../drivers/zink/nir_to_spirv/spirv_builder.h | 3 + .../src/gallium/drivers/zink/zink_batch.c | 236 +- .../src/gallium/drivers/zink/zink_batch.h | 25 +- .../src/gallium/drivers/zink/zink_blit.c | 16 +- .../src/gallium/drivers/zink/zink_bo.c | 10 +- .../src/gallium/drivers/zink/zink_bo.h | 17 +- .../src/gallium/drivers/zink/zink_clear.c | 38 +- .../src/gallium/drivers/zink/zink_compiler.c | 537 ++- .../src/gallium/drivers/zink/zink_compiler.h | 10 +- .../src/gallium/drivers/zink/zink_context.c | 1873 +++++--- .../src/gallium/drivers/zink/zink_context.h | 141 +- .../gallium/drivers/zink/zink_descriptors.c | 838 ++-- .../gallium/drivers/zink/zink_descriptors.h | 46 +- .../drivers/zink/zink_descriptors_lazy.c | 333 +- .../gallium/drivers/zink/zink_device_info.py | 64 +- .../src/gallium/drivers/zink/zink_draw.cpp | 555 ++- .../src/gallium/drivers/zink/zink_fence.c | 11 +- .../src/gallium/drivers/zink/zink_format.c | 72 + .../src/gallium/drivers/zink/zink_format.h | 6 + .../gallium/drivers/zink/zink_framebuffer.c | 265 +- .../gallium/drivers/zink/zink_framebuffer.h | 30 +- .../src/gallium/drivers/zink/zink_pipeline.c | 131 +- .../src/gallium/drivers/zink/zink_pipeline.h | 49 +- .../src/gallium/drivers/zink/zink_program.c | 612 ++- .../src/gallium/drivers/zink/zink_program.h | 134 +- .../src/gallium/drivers/zink/zink_query.c | 248 +- .../src/gallium/drivers/zink/zink_query.h | 8 +- .../gallium/drivers/zink/zink_render_pass.c | 191 +- .../gallium/drivers/zink/zink_render_pass.h | 32 +- .../src/gallium/drivers/zink/zink_resource.c | 296 +- .../src/gallium/drivers/zink/zink_resource.h | 51 +- .../src/gallium/drivers/zink/zink_screen.c | 287 +- .../src/gallium/drivers/zink/zink_screen.h | 26 +- .../gallium/drivers/zink/zink_shader_keys.h | 41 +- .../src/gallium/drivers/zink/zink_state.c | 193 +- .../src/gallium/drivers/zink/zink_state.h | 25 +- .../src/gallium/drivers/zink/zink_surface.c | 145 +- .../src/gallium/drivers/zink/zink_surface.h | 48 +- .../gallium/frontends/clover/api/event.cpp | 4 +- .../gallium/frontends/clover/api/kernel.cpp | 18 +- .../gallium/frontends/clover/api/memory.cpp | 96 +- .../gallium/frontends/clover/api/platform.cpp | 2 +- 
.../gallium/frontends/clover/api/program.cpp | 22 +- .../gallium/frontends/clover/api/transfer.cpp | 20 +- .../gallium/frontends/clover/core/binary.cpp | 243 + .../gallium/frontends/clover/core/binary.hpp | 169 + .../frontends/clover/core/compiler.hpp | 12 +- .../gallium/frontends/clover/core/device.cpp | 6 +- .../gallium/frontends/clover/core/device.hpp | 4 +- .../gallium/frontends/clover/core/format.cpp | 162 +- .../gallium/frontends/clover/core/format.hpp | 3 +- .../gallium/frontends/clover/core/kernel.cpp | 166 +- .../gallium/frontends/clover/core/kernel.hpp | 24 +- .../gallium/frontends/clover/core/memory.cpp | 70 +- .../gallium/frontends/clover/core/memory.hpp | 65 +- .../gallium/frontends/clover/core/printf.cpp | 8 +- .../gallium/frontends/clover/core/printf.hpp | 6 +- .../gallium/frontends/clover/core/program.cpp | 32 +- .../gallium/frontends/clover/core/program.hpp | 12 +- .../frontends/clover/core/resource.cpp | 7 +- .../gallium/frontends/clover/llvm/codegen.hpp | 14 +- .../frontends/clover/llvm/codegen/bitcode.cpp | 16 +- .../frontends/clover/llvm/codegen/common.cpp | 80 +- .../frontends/clover/llvm/codegen/native.cpp | 7 +- .../gallium/frontends/clover/llvm/compat.hpp | 6 + .../frontends/clover/llvm/invocation.cpp | 28 +- .../frontends/clover/llvm/invocation.hpp | 8 +- .../src/gallium/frontends/clover/meson.build | 5 +- .../frontends/clover/nir/invocation.cpp | 126 +- .../frontends/clover/nir/invocation.hpp | 6 +- .../frontends/clover/spirv/invocation.cpp | 124 +- .../frontends/clover/spirv/invocation.hpp | 10 +- .../gallium/frontends/clover/util/adaptor.hpp | 5 +- .../gallium/frontends/clover/util/compat.hpp | 43 + .../src/gallium/frontends/d3d10umd/D3DKMT.cpp | 4 +- .../src/gallium/frontends/d3d10umd/Device.cpp | 8 +- .../frontends/d3d10umd/DriverIncludes.h | 13 +- .../gallium/frontends/d3d10umd/DxgiFns.cpp | 374 ++ .../src/gallium/frontends/d3d10umd/DxgiFns.h | 46 + .../gallium/frontends/d3d10umd/Rasterizer.cpp | 1 + .../gallium/frontends/d3d10umd/Resource.cpp | 131 +- .../src/gallium/frontends/d3d10umd/Shader.cpp | 2 +- .../gallium/frontends/d3d10umd/ShaderParse.c | 2 +- .../gallium/frontends/d3d10umd/ShaderParse.h | 3 - .../src/gallium/frontends/d3d10umd/State.h | 1 + .../gallium/frontends/d3d10umd/meson.build | 2 +- .../src/gallium/frontends/dri/dri2.c | 86 +- .../src/gallium/frontends/dri/dri_helpers.c | 16 + .../src/gallium/frontends/dri/dri_helpers.h | 6 + .../frontends/dri/dri_query_renderer.c | 5 + .../src/gallium/frontends/dri/dri_screen.c | 24 +- .../src/gallium/frontends/dri/dri_screen.h | 2 + .../src/gallium/frontends/dri/drisw.c | 9 + .../lavapipe/ci/deqp-lvp-asan-fails.txt | 2 + .../lavapipe/ci/deqp-lvp-asan-skips.txt | 9 + .../frontends/lavapipe/ci/deqp-lvp-fails.txt | 43 +- .../frontends/lavapipe/ci/deqp-lvp-skips.txt | 16 +- .../frontends/lavapipe/ci/gitlab-ci.yml | 40 +- .../frontends/lavapipe/lvp_cmd_buffer.c | 2256 ++------- .../frontends/lavapipe/lvp_descriptor_set.c | 125 +-
 1120 files changed, 60293 insertions(+), 27052 deletions(-)
 create mode 100644 mesa 3D driver/CODEOWNERS
 create mode 100644 mesa 3D driver/docs/relnotes/21.1.8.rst
 create mode 100644 mesa 3D driver/docs/relnotes/21.2.2.rst
 create mode 100644 mesa 3D driver/docs/relnotes/21.2.3.rst
 create mode 100644 mesa 3D driver/docs/relnotes/21.2.4.rst
 create mode 100644 mesa 3D driver/include/android_stub/cutils/compiler.h
 create mode 100644 mesa 3D driver/include/android_stub/cutils/trace.h
 create mode 100644 mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-flakes.txt
 create mode 100644 mesa 3D driver/src/amd/ci/deqp-radv-renoir-aco-flakes.txt
 create mode 100644 mesa 3D driver/src/amd/ci/deqp-radv-skips.txt
 create mode 100644 mesa 3D driver/src/amd/ci/deqp-radv-vangogh-aco-fails.txt
 create mode 100644 mesa 3D driver/src/amd/ci/deqp-radv-vega10-aco-flakes.txt
 create mode 100644 mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.h
 create mode 100644 mesa 3D driver/src/amd/vulkan/radv_llvm_helper.h
 create mode 100644 mesa 3D driver/src/amd/vulkan/radv_pipeline_rt.c
 create mode 100644 mesa 3D driver/src/compiler/clc/clc.c
 create mode 100644 mesa 3D driver/src/compiler/clc/clc.h
 create mode 100644 mesa 3D driver/src/compiler/clc/clc_helpers.cpp
 create mode 100644 mesa 3D driver/src/compiler/clc/clc_helpers.h
 create mode 100644 mesa 3D driver/src/compiler/clc/meson.build
 create mode 100644 mesa 3D driver/src/compiler/isaspec/README.rst
 create mode 100644 mesa 3D driver/src/compiler/isaspec/decode.c
 create mode 100644 mesa 3D driver/src/compiler/isaspec/decode.h
 create mode 100644 mesa 3D driver/src/compiler/isaspec/decode.py
 create mode 100644 mesa 3D driver/src/compiler/isaspec/encode.py
 create mode 100644 mesa 3D driver/src/compiler/isaspec/isa.py
 create mode 100644 mesa 3D driver/src/compiler/isaspec/meson.build
 create mode 100644 mesa 3D driver/src/compiler/nir/nir_lower_sysvals_to_varyings.c
 create mode 100644 mesa 3D driver/src/egl/drivers/wgl/egl_wgl.c
 create mode 100644 mesa 3D driver/src/egl/drivers/wgl/egl_wgl.h
 create mode 100644 mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-fails.txt
 create mode 100644 mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-flakes.txt
 create mode 100644 mesa 3D driver/src/etnaviv/ci/gitlab-ci.yml
 create mode 100644 mesa 3D driver/src/freedreno/computerator/a4xx.c
 create mode 100644 mesa 3D driver/src/freedreno/computerator/examples/pvtmem.asm
 create mode 100644 mesa 3D driver/src/freedreno/fdl/fd6_format_table.c
 create mode 100644 mesa 3D driver/src/freedreno/fdl/fd6_format_table.h
 create mode 100644 mesa 3D driver/src/freedreno/fdl/fd6_view.c
 create mode 100644 mesa 3D driver/src/freedreno/ir3/ir3_lower_spill.c
 create mode 100644 mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_64b.c
 create mode 100644 mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_wide_load_store.c
 create mode 100644 mesa 3D driver/src/freedreno/ir3/ir3_remove_unreachable.c
 create mode 100644 mesa 3D driver/src/freedreno/vulkan/tu_perfetto.cc
 create mode 100644 mesa 3D driver/src/freedreno/vulkan/tu_perfetto.h
 create mode 100644 mesa 3D driver/src/freedreno/vulkan/tu_perfetto_util.c
 create mode 100644 mesa 3D driver/src/freedreno/vulkan/tu_tracepoints.py
 create mode 100644 mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.c
 create mode 100644 mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.h
 create mode 100644 mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.c
 create mode 100644 mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.h
 create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly.toml
 create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl.toml
 create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml-fails.txt
 create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml.toml
 create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-flakes.txt
 create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk.toml
 create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl-fails.txt
mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl.toml create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-skips.txt create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl-fails.txt create mode 100644 mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl.toml create mode 100644 mesa 3D driver/src/gallium/drivers/lima/drm-shim/lima_noop.c create mode 100644 mesa 3D driver/src/gallium/drivers/lima/drm-shim/meson.build create mode 100644 mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_loads.c create mode 100644 mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-asan.toml create mode 100644 mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-deqp-fail.csv create mode 100644 mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-glcts-fail.csv create mode 100644 mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-piglit-quick-fail.csv create mode 100644 mesa 3D driver/src/gallium/drivers/radeonsi/si_sdma_copy_image.c create mode 100644 mesa 3D driver/src/gallium/drivers/radeonsi/si_state_shaders.cpp create mode 100644 mesa 3D driver/src/gallium/frontends/clover/core/binary.cpp create mode 100644 mesa 3D driver/src/gallium/frontends/clover/core/binary.hpp create mode 100644 mesa 3D driver/src/gallium/frontends/clover/util/compat.hpp create mode 100644 mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.cpp create mode 100644 mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.h create mode 100644 mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-fails.txt create mode 100644 mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-skips.txt diff --git a/mesa 3D driver/.gitlab-ci.yml b/mesa 3D driver/.gitlab-ci.yml index 413f75adda..528fa21127 100644 --- a/mesa 3D driver/.gitlab-ci.yml +++ b/mesa 3D driver/.gitlab-ci.yml @@ -12,6 +12,9 @@ variables: JOB_ARTIFACTS_BASE: ${PIPELINE_ARTIFACTS_BASE}/${CI_JOB_ID} # reference images stored for traces PIGLIT_REPLAY_REFERENCE_IMAGES_BASE: "${MINIO_HOST}/mesa-tracie-results/$FDO_UPSTREAM_REPO" + # Individual CI farm status, set to "offline" to disable jobs + # running on a particular CI farm (ie. 
for outages, etc): + FD_FARM: "online" include: - project: 'freedesktop/ci-templates' @@ -27,22 +30,22 @@ include: - local: '.gitlab-ci/test-source-dep.yml' - local: 'src/amd/ci/gitlab-ci.yml' - local: 'src/broadcom/ci/gitlab-ci.yml' + - local: 'src/etnaviv/ci/gitlab-ci.yml' - local: 'src/freedreno/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/i915/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/iris/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/lima/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/llvmpipe/ci/gitlab-ci.yml' - - local: 'src/gallium/drivers/panfrost/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/radeonsi/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/softpipe/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/virgl/ci/gitlab-ci.yml' - local: 'src/gallium/drivers/zink/ci/gitlab-ci.yml' - local: 'src/gallium/frontends/lavapipe/ci/gitlab-ci.yml' + - local: 'src/panfrost/ci/gitlab-ci.yml' stages: - sanity - container - - container-2 - git-archive - build-x86_64 - build-misc @@ -51,6 +54,7 @@ stages: - arm - broadcom - freedreno + - etnaviv - software-renderer - layered-backends - deploy @@ -202,8 +206,10 @@ test-docs-mr: FDO_DISTRIBUTION_TAG: "${MESA_IMAGE_TAG}--${MESA_BASE_TAG}--${MESA_TEMPLATES_COMMIT}" .set-image: + extends: + - .incorporate-templates-commit variables: - MESA_IMAGE: "$CI_REGISTRY_IMAGE/${MESA_IMAGE_PATH}:${MESA_IMAGE_TAG}--${MESA_TEMPLATES_COMMIT}" + MESA_IMAGE: "$CI_REGISTRY_IMAGE/${MESA_IMAGE_PATH}:${FDO_DISTRIBUTION_TAG}" image: "$MESA_IMAGE" .set-image-base-tag: @@ -211,7 +217,7 @@ test-docs-mr: - .set-image - .incorporate-base-tag+templates-commit variables: - MESA_IMAGE: "$CI_REGISTRY_IMAGE/${MESA_IMAGE_PATH}:${MESA_IMAGE_TAG}--${MESA_BASE_TAG}--${MESA_TEMPLATES_COMMIT}" + MESA_IMAGE: "$CI_REGISTRY_IMAGE/${MESA_IMAGE_PATH}:${FDO_DISTRIBUTION_TAG}" # Build the CI docker images. 
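For review, the image-tagging scheme above restated as a minimal Python sketch (the variable names mirror the YAML; the helper functions themselves are assumed for illustration and are not part of this patch)::

    def fdo_distribution_tag(image_tag, templates_commit, base_tag=None):
        # FDO_DISTRIBUTION_TAG fuses every input that should force an image
        # rebuild: the image tag, the base-image tag (when present) and the
        # ci-templates commit.
        parts = [image_tag] + ([base_tag] if base_tag else []) + [templates_commit]
        return "--".join(parts)

    def mesa_image(ci_registry_image, mesa_image_path, distribution_tag):
        # .set-image and .set-image-base-tag now both derive MESA_IMAGE from
        # FDO_DISTRIBUTION_TAG instead of re-assembling the same components
        # by hand, so a test job can only ever reference a tag that the
        # corresponding container job actually pushed.
        return f"{ci_registry_image}/{mesa_image_path}:{distribution_tag}"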
@@ -277,7 +283,6 @@ test-docs-mr: - .incorporate-base-tag+templates-commit # Don't want the .container rules - .ci-run-policy - stage: container-2 # Debian 11 based x86 build image base debian/x86_build-base: @@ -387,7 +392,7 @@ debian/android_build: debian/x86_test-base: extends: debian/x86_build-base variables: - MESA_IMAGE_TAG: &debian-x86_test-base "2021-07-26-python" + MESA_IMAGE_TAG: &debian-x86_test-base "2021-09-28-deqp-runner" .use-debian/x86_test-base: extends: @@ -404,14 +409,14 @@ debian/x86_test-gl: extends: .use-debian/x86_test-base variables: FDO_DISTRIBUTION_EXEC: 'env KERNEL_URL=${KERNEL_URL} FDO_CI_CONCURRENT=${FDO_CI_CONCURRENT} bash .gitlab-ci/container/${CI_JOB_NAME}.sh' - KERNEL_URL: &kernel-rootfs-url "https://gitlab.freedesktop.org/gfx-ci/linux/-/archive/v5.13-rc5-for-mesa-ci-27df41f1e0cf/linux-v5.13-rc5-for-mesa-ci-27df41f1e0cf.tar.bz2" - MESA_IMAGE_TAG: &debian-x86_test-gl "2021-08-04-latest-virglrenderer" + KERNEL_URL: &kernel-rootfs-url "https://gitlab.freedesktop.org/gfx-ci/linux/-/archive/v5.13-rc5-for-mesa-ci-2bb5d9ffd79c/linux-v5.13-rc5-for-mesa-ci-2bb5d9ffd79c.tar.bz2" + MESA_IMAGE_TAG: &debian-x86_test-gl "2021-2021-10-07-piglit" # Debian 11 based x86 test image for VK debian/x86_test-vk: extends: .use-debian/x86_test-base variables: - MESA_IMAGE_TAG: &debian-x86_test-vk "2021-07-30-piglit-2" + MESA_IMAGE_TAG: &debian-x86_test-vk "2021-2021-10-07-piglit" # Debian 11 based ARM build image debian/arm_build: @@ -456,11 +461,11 @@ fedora/x86_build: .kernel+rootfs: extends: - .ci-run-policy - stage: container-2 + stage: container variables: GIT_STRATEGY: fetch KERNEL_URL: *kernel-rootfs-url - MESA_ROOTFS_TAG: &kernel-rootfs "2021-08-07-enable-lima" + MESA_ROOTFS_TAG: &kernel-rootfs "2021-10-07-piglit" DISTRIBUTION_TAG: &distribution-tag-arm "${MESA_ROOTFS_TAG}--${MESA_ARTIFACTS_TAG}--${MESA_TEMPLATES_COMMIT}" script: - .gitlab-ci/container/lava_build.sh @@ -507,7 +512,6 @@ debian/arm_test: - .container # Don't want the .container rules - .ci-run-policy - stage: build-misc needs: - kernel+rootfs_arm64 - kernel+rootfs_armhf @@ -515,7 +519,7 @@ debian/arm_test: FDO_DISTRIBUTION_EXEC: 'env ARTIFACTS_PREFIX=https://${MINIO_HOST}/mesa-lava ARTIFACTS_SUFFIX=${MESA_ROOTFS_TAG}--${MESA_ARM_BUILD_TAG}--${MESA_TEMPLATES_COMMIT} CI_PROJECT_PATH=${CI_PROJECT_PATH} FDO_CI_CONCURRENT=${FDO_CI_CONCURRENT} FDO_UPSTREAM_REPO=${FDO_UPSTREAM_REPO} bash .gitlab-ci/container/${CI_JOB_NAME}.sh' FDO_DISTRIBUTION_TAG: "${MESA_IMAGE_TAG}--${MESA_ROOTFS_TAG}--${MESA_ARM_BUILD_TAG}--${MESA_TEMPLATES_COMMIT}" MESA_ARM_BUILD_TAG: *debian-arm_build - MESA_IMAGE_TAG: &debian-arm_test "2021-07-26-python" + MESA_IMAGE_TAG: &debian-arm_test "2021-09-17-deqp" MESA_ROOTFS_TAG: *kernel-rootfs .use-debian/arm_test: @@ -750,7 +754,6 @@ debian-gallium: script: - .gitlab-ci/meson/build.sh - .gitlab-ci/run-shader-db.sh - - src/freedreno/.gitlab-ci/run-fdtools.sh # Test a release build with -Werror so new warnings don't sneak in. 
debian-release: @@ -774,6 +777,7 @@ debian-release: -D gallium-opencl=disabled -D llvm=false GALLIUM_DRIVERS: "i915,iris,nouveau,kmsro,freedreno,r300,svga,swrast,v3d,vc4,virgl,etnaviv,panfrost,lima,zink,d3d12,crocus" + VULKAN_DRIVERS: "amd" BUILDTYPE: "release" EXTRA_OPTION: > -D osmesa=true @@ -931,7 +935,7 @@ debian-arm64: - .meson-arm - .ci-deqp-artifacts variables: - VULKAN_DRIVERS: "freedreno,broadcom" + VULKAN_DRIVERS: "freedreno,broadcom,panfrost" EXTRA_OPTION: > -D llvm=disabled -D valgrind=false @@ -1333,7 +1337,7 @@ debian-mingw32-x86_64: .baremetal-arm64-asan-test: variables: - TEST_LD_PRELOAD: libasan.so.6 + DEQP_RUNNER_OPTIONS: "--env LD_PRELOAD=libasan.so.6" MINIO_ARTIFACT_NAME: mesa-arm64-asan needs: - debian/arm_test @@ -1343,4 +1347,4 @@ debian-mingw32-x86_64: .baremetal-deqp-test: variables: HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" - DEQP_PARALLEL: 0 # Default to number of CPUs + FDO_CI_CONCURRENT: 0 # Default to number of CPUs diff --git a/mesa 3D driver/CODEOWNERS b/mesa 3D driver/CODEOWNERS new file mode 100644 index 0000000000..2f7b067589 --- /dev/null +++ b/mesa 3D driver/CODEOWNERS @@ -0,0 +1,138 @@ +# This file contains the GitLab handle of the maintainers/reviewers for +# a given file: +# https://docs.gitlab.com/ce/user/project/code_owners.html +# +# Consider these as the list of people who want to be involved in MRs +# touching these files/folders, and whom you can ask your questions and +# tag in issues. +# +# As of GitLab 14.3, all features surrounding this file are premium-only, +# which means this file is only read by humans for now. +# +# Paths starting with a `/` are relative to the git root, otherwise they +# can match any substring of the file's path. +# If multiple lines match, only the last one applies; there is no +# accumulation. 
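For review, a rough sketch of the matching semantics described in the header comment above (illustrative only: the function and rule names are invented, and GitLab's real matcher also understands gitignore-style globs)::

    def find_owners(path, rules):
        """Resolve owners for path under the last-match-wins rule.

        rules is the ordered list of (pattern, owners) pairs as they appear
        in this file. Patterns starting with '/' are anchored at the git
        root; any other pattern may match a substring of the path.
        """
        owners = []
        for pattern, rule_owners in rules:
            if pattern.startswith('/'):
                matched = ('/' + path).startswith(pattern)
            else:
                matched = pattern in path
            if matched:
                owners = rule_owners  # a later match replaces, never accumulates
        return owners

For example, resolving ``src/egl/main/eglglvnd.c`` should yield ``@kbrenneman``: the GLVND rule appears after the broader ``/src/egl/`` rule and therefore wins.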
+ + +################## +# INFRASTRUCTURE # +################## + +# Build system - Meson +meson.build @dbaker @eric +/meson_options.txt @dbaker @eric +/docs/meson.rst @dbaker @eric + +# Build system - Android +/android/ @roman.stratiienko + +# Compatibility headers +/include/c99* @evelikov +/include/c11* @eric + +# Documentation +/docs/ @eric @evelikov + + +########## +# COMMON # +########## + +# NIR +/src/compiler/nir/ @jekstrand + +# Vulkan +/src/vulkan/ @eric @jekstrand +/include/vulkan/ @eric @jekstrand + + +############# +# PLATFORMS # +############# + +# EGL +/src/egl/ @eric @evelikov +/include/EGL/ @eric @evelikov + +# EGL - Android support +/src/egl/drivers/dri2/platform_android.c @robh @tfiga + +# EGL - Device support +/src/egl/drivers/dri2/platform_device.c @evelikov + +# EGL - Wayland support +/src/egl/wayland/ @daniels @eric +/src/egl/drivers/dri2/platform_wayland.c @daniels @eric + +# Gallium targets +/src/gallium/targets/ @evelikov + +# GLX +/src/glx/ @ajax +/include/GL/glx* @ajax + +# GLVND +/src/egl/main/eglglvnd.c @kbrenneman +/src/egl/main/egldispatchstubs.* @kbrenneman +/src/egl/generate/ @kbrenneman +/src/glx/*glvnd* @kbrenneman + +# Haiku +/include/HaikuGL/ @kallisti5 +/src/egl/drivers/haiku/ @kallisti5 +/src/gallium/frontends/hgl/ @kallisti5 +/src/gallium/targets/haiku-softpipe/ @kallisti5 +/src/gallium/winsys/sw/hgl/ @kallisti5 +/src/hgl/ @kallisti5 + +# Loader - DRI/classic +/src/loader/ @evelikov + +# Loader - Gallium +/src/gallium/auxiliary/pipe-loader/ @evelikov +/src/gallium/auxiliary/target-helpers/ @evelikov + +# Vulkan WSI - Display +/src/vulkan/wsi/wsi_common_display.* @keithp +/src/*/vulkan/*_wsi_display.c @keithp + + +########### +# Drivers # +########### + +# Asahi +/src/asahi/ @alyssa +/src/gallium/drivers/asahi/ @alyssa + +# Freedreno +/src/gallium/drivers/freedreno/ @robclark + +# Intel +/include/drm-uapi/i915_drm.h @kwg @llandwerlin @jekstrand @idr +/include/pci_ids/i*_pci_ids.h @kwg @llandwerlin @jekstrand @idr +/src/intel/ @kwg @llandwerlin @jekstrand @idr +/src/gallium/winsys/iris/ @kwg @llandwerlin @jekstrand @idr +/src/gallium/drivers/iris/ @kwg @llandwerlin @jekstrand @idr +/src/gallium/drivers/i915/ @anholt +/src/mesa/drivers/dri/i965/ @kwg @llandwerlin @jekstrand @idr +/doxygen/i965.doxy @kwg @llandwerlin @jekstrand @idr + +# Microsoft +/src/microsoft/ @jenatali +/src/gallium/drivers/d3d12/ @jenatali + +# Panfrost +/src/panfrost/ @alyssa +/src/panfrost/vulkan/ @bbrezillon +/src/gallium/drivers/panfrost/ @alyssa + +# SWR +/src/gallium/drivers/swr/ @jzielins @krzysztof.raszkowski +/docs/gallium/drivers/openswr.rst @jzielins @krzysztof.raszkowski +/docs/gallium/drivers/openswr/ @jzielins @krzysztof.raszkowski + +# VMware +/src/gallium/drivers/svga/ @brianp @charmainel +/src/gallium/winsys/svga/ @thomash @drawat diff --git a/mesa 3D driver/VERSION b/mesa 3D driver/VERSION index 31660c53cb..0336c7e3dd 100644 --- a/mesa 3D driver/VERSION +++ b/mesa 3D driver/VERSION @@ -1 +1 @@ -21.3.0-devel +22.0.0-devel diff --git a/mesa 3D driver/bin/khronos-update.py b/mesa 3D driver/bin/khronos-update.py index d7713c526c..70ec54b383 100644 --- a/mesa 3D driver/bin/khronos-update.py +++ b/mesa 3D driver/bin/khronos-update.py @@ -47,7 +47,7 @@ SOURCES = [ 'api': 'khr', 'inc_folder': 'KHR', 'sources': [ - Source('include/KHR/khrplatform.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/master/api/KHR/khrplatform.h'), + Source('include/KHR/khrplatform.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/main/api/KHR/khrplatform.h'), ], }, @@ -55,10 +55,10 @@ 
SOURCES = [ 'api': 'egl', 'inc_folder': 'EGL', 'sources': [ - Source('src/egl/generate/egl.xml', 'https://github.com/KhronosGroup/EGL-Registry/raw/master/api/egl.xml'), - Source('include/EGL/egl.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/master/api/EGL/egl.h'), - Source('include/EGL/eglplatform.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/master/api/EGL/eglplatform.h'), - Source('include/EGL/eglext.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/master/api/EGL/eglext.h'), + Source('src/egl/generate/egl.xml', 'https://github.com/KhronosGroup/EGL-Registry/raw/main/api/egl.xml'), + Source('include/EGL/egl.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/main/api/EGL/egl.h'), + Source('include/EGL/eglplatform.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/main/api/EGL/eglplatform.h'), + Source('include/EGL/eglext.h', 'https://github.com/KhronosGroup/EGL-Registry/raw/main/api/EGL/eglext.h'), Source('include/EGL/eglextchromium.h', 'https://chromium.googlesource.com/chromium/src/+/refs/heads/master/ui/gl/EGL/eglextchromium.h?format=TEXT'), Source('include/EGL/eglext_angle.h', 'https://chromium.googlesource.com/angle/angle/+/refs/heads/master/include/EGL/eglext_angle.h?format=TEXT'), Source('include/EGL/eglmesaext.h', None), @@ -69,11 +69,11 @@ SOURCES = [ 'api': 'gl', 'inc_folder': 'GL', 'sources': [ - Source('src/mapi/glapi/registry/gl.xml', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/xml/gl.xml'), - Source('include/GL/glcorearb.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GL/glcorearb.h'), - Source('include/GL/glext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GL/glext.h'), - Source('include/GL/glxext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GL/glxext.h'), - Source('include/GL/wglext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GL/wglext.h'), + Source('src/mapi/glapi/registry/gl.xml', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/xml/gl.xml'), + Source('include/GL/glcorearb.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GL/glcorearb.h'), + Source('include/GL/glext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GL/glext.h'), + Source('include/GL/glxext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GL/glxext.h'), + Source('include/GL/wglext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GL/wglext.h'), Source('include/GL/gl.h', None), # FIXME: I don't know what the canonical source is Source('include/GL/glx.h', None), # FIXME: I don't know what the canonical source is Source('include/GL/internal/', None), @@ -86,10 +86,10 @@ SOURCES = [ 'api': 'gles1', 'inc_folder': 'GLES', 'sources': [ - Source('include/GLES/gl.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES/gl.h'), - Source('include/GLES/glplatform.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES/glplatform.h'), - Source('include/GLES/glext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES/glext.h'), - Source('include/GLES/egl.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES/egl.h'), + Source('include/GLES/gl.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES/gl.h'), + Source('include/GLES/glplatform.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES/glplatform.h'), + Source('include/GLES/glext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES/glext.h'), + 
Source('include/GLES/egl.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES/egl.h'), ], }, @@ -97,9 +97,9 @@ SOURCES = [ 'api': 'gles2', 'inc_folder': 'GLES2', 'sources': [ - Source('include/GLES2/gl2.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES2/gl2.h'), - Source('include/GLES2/gl2platform.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES2/gl2platform.h'), - Source('include/GLES2/gl2ext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES2/gl2ext.h'), + Source('include/GLES2/gl2.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES2/gl2.h'), + Source('include/GLES2/gl2platform.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES2/gl2platform.h'), + Source('include/GLES2/gl2ext.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES2/gl2ext.h'), ], }, @@ -107,10 +107,10 @@ SOURCES = [ 'api': 'gles3', 'inc_folder': 'GLES3', 'sources': [ - Source('include/GLES3/gl3.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES3/gl3.h'), - Source('include/GLES3/gl31.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES3/gl31.h'), - Source('include/GLES3/gl32.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES3/gl32.h'), - Source('include/GLES3/gl3platform.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/master/api/GLES3/gl3platform.h'), + Source('include/GLES3/gl3.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES3/gl3.h'), + Source('include/GLES3/gl31.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES3/gl31.h'), + Source('include/GLES3/gl32.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES3/gl32.h'), + Source('include/GLES3/gl3platform.h', 'https://github.com/KhronosGroup/OpenGL-Registry/raw/main/api/GLES3/gl3platform.h'), Source('include/GLES3/gl3ext.h', None), # FIXME: I don't know what the canonical source is ], }, @@ -155,25 +155,27 @@ SOURCES = [ 'api': 'vulkan', 'inc_folder': 'vulkan', 'sources': [ - Source('src/vulkan/registry/vk.xml', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/registry/vk.xml'), - Source('include/vulkan/vulkan.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan.h'), - Source('include/vulkan/vulkan_core.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_core.h'), - Source('include/vulkan/vulkan_beta.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_beta.h'), - Source('include/vulkan/vk_icd.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vk_icd.h'), - Source('include/vulkan/vk_layer.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vk_layer.h'), - Source('include/vulkan/vk_platform.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vk_platform.h'), - Source('include/vulkan/vulkan_android.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_android.h'), - Source('include/vulkan/vulkan_fuchsia.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_fuchsia.h'), - Source('include/vulkan/vulkan_ggp.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_ggp.h'), - Source('include/vulkan/vulkan_ios.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_ios.h'), - Source('include/vulkan/vulkan_macos.h', 
'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_macos.h'), - Source('include/vulkan/vulkan_metal.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_metal.h'), - Source('include/vulkan/vulkan_vi.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_vi.h'), - Source('include/vulkan/vulkan_wayland.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_wayland.h'), - Source('include/vulkan/vulkan_win32.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_win32.h'), - Source('include/vulkan/vulkan_xcb.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_xcb.h'), - Source('include/vulkan/vulkan_xlib.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_xlib.h'), - Source('include/vulkan/vulkan_xlib_xrandr.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/master/include/vulkan/vulkan_xlib_xrandr.h'), + Source('src/vulkan/registry/vk.xml', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/registry/vk.xml'), + Source('include/vulkan/vulkan.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan.h'), + Source('include/vulkan/vulkan_core.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_core.h'), + Source('include/vulkan/vulkan_beta.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_beta.h'), + Source('include/vulkan/vk_icd.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vk_icd.h'), + Source('include/vulkan/vk_layer.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vk_layer.h'), + Source('include/vulkan/vk_platform.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vk_platform.h'), + Source('include/vulkan/vulkan_android.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_android.h'), + Source('include/vulkan/vulkan_directfb.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_directfb.h'), + Source('include/vulkan/vulkan_fuchsia.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_fuchsia.h'), + Source('include/vulkan/vulkan_ggp.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_ggp.h'), + Source('include/vulkan/vulkan_ios.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_ios.h'), + Source('include/vulkan/vulkan_macos.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_macos.h'), + Source('include/vulkan/vulkan_metal.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_metal.h'), + Source('include/vulkan/vulkan_screen.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_screen.h'), + Source('include/vulkan/vulkan_vi.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_vi.h'), + Source('include/vulkan/vulkan_wayland.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_wayland.h'), + Source('include/vulkan/vulkan_win32.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_win32.h'), + Source('include/vulkan/vulkan_xcb.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_xcb.h'), + Source('include/vulkan/vulkan_xlib.h', 
'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_xlib.h'), + Source('include/vulkan/vulkan_xlib_xrandr.h', 'https://github.com/KhronosGroup/Vulkan-Headers/raw/main/include/vulkan/vulkan_xlib_xrandr.h'), Source('include/vulkan/vk_android_native_buffer.h', 'https://android.googlesource.com/platform/frameworks/native/+/master/vulkan/include/vulkan/vk_android_native_buffer.h?format=TEXT'), Source('include/vulkan/.editorconfig', None), ], diff --git a/mesa 3D driver/bin/symbols-check.py b/mesa 3D driver/bin/symbols-check.py index 5ebeb861d1..dd36470d76 100644 --- a/mesa 3D driver/bin/symbols-check.py +++ b/mesa 3D driver/bin/symbols-check.py @@ -73,7 +73,7 @@ def get_symbols_dumpbin(dumpbin, lib): continue symbol_name = fields[3] # De-mangle symbols - if symbol_name[0] == '_': + if symbol_name[0] == '_' and '@' in symbol_name: symbol_name = symbol_name[1:].split('@')[0] symbols.append(symbol_name) return symbols diff --git a/mesa 3D driver/bin/update-android-headers.sh b/mesa 3D driver/bin/update-android-headers.sh index eddb1cf9c9..565f32e1a3 100644 --- a/mesa 3D driver/bin/update-android-headers.sh +++ b/mesa 3D driver/bin/update-android-headers.sh @@ -8,9 +8,11 @@ if [ ! -e .git ]; then fi if [ ! -d platform-hardware-libhardware ]; then + git clone --depth 1 https://android.googlesource.com/platform/frameworks/native platform-frameworks-native git clone --depth 1 https://android.googlesource.com/platform/hardware/libhardware platform-hardware-libhardware git clone --depth 1 https://android.googlesource.com/platform/system/core platform-system-core - git clone --depth 1 https://android.googlesource.com/platform/frameworks/native platform-frameworks-native + git clone --depth 1 https://android.googlesource.com/platform/system/logging platform-system-logging + git clone --depth 1 https://android.googlesource.com/platform/system/unwinding platform-system-unwinding fi dest=include/android_stub @@ -25,14 +27,14 @@ mkdir ${dest} # These directories contains mostly only the files we need, so copy wholesale -cp -av platform-frameworks-native/libs/nativewindow/include/vndk \ - platform-system-core/libsync/include/sync \ - platform-system-core/libsync/include/ndk \ - platform-system-core/libbacktrace/include/backtrace \ - platform-system-core/libsystem/include/system \ - platform-system-core/liblog/include/log \ - platform-frameworks-native/libs/nativewindow/include/apex \ +cp -av \ + platform-frameworks-native/libs/nativewindow/include/vndk \ platform-frameworks-native/libs/nativebase/include/nativebase \ + platform-system-core/libsync/include/ndk \ + platform-system-core/libsync/include/sync \ + platform-system-core/libsystem/include/system \ + platform-system-logging/liblog/include/log \ + platform-system-unwinding/libbacktrace/include/backtrace \ ${dest} @@ -43,15 +45,16 @@ cp -av platform-hardware-libhardware/include/hardware/{hardware,gralloc,gralloc1 cp -av platform-frameworks-native/vulkan/include/hardware/hwvulkan.h ${dest}/hardware mkdir ${dest}/cutils -cp -av platform-system-core/libcutils/include/cutils/{log,native_handle,properties}.h ${dest}/cutils +cp -av platform-system-core/libcutils/include/cutils/{compiler,log,native_handle,properties,trace}.h ${dest}/cutils # include/android has files from a few different projects mkdir ${dest}/android -cp -av platform-frameworks-native/libs/nativewindow/include/android/* \ +cp -av \ + platform-frameworks-native/libs/nativewindow/include/android/* \ platform-frameworks-native/libs/arect/include/android/* \ - 
platform-system-core/liblog/include/android/* \ platform-system-core/libsync/include/android/* \ + platform-system-logging/liblog/include/android/* \ ${dest}/android diff --git a/mesa 3D driver/docs/drivers/freedreno/isaspec.rst b/mesa 3D driver/docs/drivers/freedreno/isaspec.rst index 5ecd7d9be2..07f679e73e 100644 --- a/mesa 3D driver/docs/drivers/freedreno/isaspec.rst +++ b/mesa 3D driver/docs/drivers/freedreno/isaspec.rst @@ -22,7 +22,7 @@ cuts taken to get things up and running (which are mostly not inherent to the xml schema, and should not be too difficult to remove from the py and decode/disasm utility): -* Maximum "bitset" size is 64b +* Maximum "field" size is 64b * Fixed instruction size Often times, especially when new functionality is added in later gens @@ -184,6 +184,9 @@ decoding. The display template consists of references to fields (which may be derived fields) specified as ``{FIELDNAME}`` and other characters which are just echoed through to the resulting decoded bitset. +It is possible to define a line column alignment value per field to influence +the visual output. It needs to be specified as ``{FIELDNAME:align=xx}``. + The ``<override>`` element will be described in the next section, but it provides for both different decoded instruction syntax/mnemonics (when simply providing a different display template string) as well as instruction diff --git a/mesa 3D driver/docs/drivers/llvmpipe.rst b/mesa 3D driver/docs/drivers/llvmpipe.rst index 91d780ada5..4af2194a5a 100644 --- a/mesa 3D driver/docs/drivers/llvmpipe.rst +++ b/mesa 3D driver/docs/drivers/llvmpipe.rst @@ -106,9 +106,10 @@ Windows On Windows, building will create ``build/windows-x86-debug/gallium/targets/libgl-gdi/opengl32.dll`` which -is a drop-in alternative for system's ``opengl32.dll``. To use it put it -in the same directory as your application. It can also be used by -replacing the native ICD driver, but it's quite an advanced usage, so if +is a drop-in alternative for the system's ``opengl32.dll``, which will use +the Mesa ICD, ``build/windows-x86-debug/gallium/targets/wgl/libgallium_wgl.dll``. +To use it, put both DLLs in the same directory as your application. It can also +be used by replacing the native ICD driver, but it's quite an advanced usage, so if you need to ask, don't even try it. There is however an easy way to replace the OpenGL software renderer @@ -116,7 +117,7 @@ that comes with Microsoft Windows 7 (or later) with llvmpipe (that is, on systems without any OpenGL drivers): - copy - ``build/windows-x86-debug/gallium/targets/libgl-gdi/opengl32.dll`` to + ``build/windows-x86-debug/gallium/targets/wgl/libgallium_wgl.dll`` to ``C:\Windows\SysWOW64\mesadrv.dll`` - load this registry settings: diff --git a/mesa 3D driver/docs/drivers/panfrost.rst b/mesa 3D driver/docs/drivers/panfrost.rst index 994dae5d88..eda2dd27ec 100644 --- a/mesa 3D driver/docs/drivers/panfrost.rst +++ b/mesa 3D driver/docs/drivers/panfrost.rst @@ -1,9 +1,10 @@ Panfrost ======== -The Panfrost driver stack includes a **non-conformant** OpenGL ES -implementation for Arm Mali GPUs based on the Midgard and Bifrost -microarchitectures. The following GPUs are currently supported: +The Panfrost driver stack includes an OpenGL ES implementation for Arm Mali +GPUs based on the Midgard and Bifrost microarchitectures. It is **conformant** +on Mali G52 but **non-conformant** on other GPUs.
The following hardware is +currently supported: ========= ============ ============ ======= Product Architecture OpenGL ES OpenGL @@ -39,17 +40,16 @@ it's easy to add support, see the commit ``cff7de4bb597e9`` as an example. LLVM is *not* required by Panfrost's compilers. LLVM support in Mesa can safely be disabled for most OpenGL ES users with Panfrost. -Build with meson like ``meson . build/ -Ddri-drivers= -Dvulkan-drivers= +Build like ``meson . build/ -Ddri-drivers= -Dvulkan-drivers= -Dgallium-drivers=panfrost -Dllvm=disabled`` for a build directory ``build``. -Building for Android via the legacy ``Android.mk`` system is not officially -supported but reportedly works. Your mileage may vary. - For general information on building Mesa, read :doc:`the install documentation <../install>`. Chat ---- -Panfrost developers and users hang out on IRC at ``#panfrost`` on OFTC. +Panfrost developers and users hang out on IRC at ``#panfrost`` on OFTC. Note +that registering and authenticating with `NickServ` is required to prevent +spam. `Join the chat. `_ diff --git a/mesa 3D driver/docs/drivers/zink.rst b/mesa 3D driver/docs/drivers/zink.rst index 70aa750956..3942d7c164 100644 --- a/mesa 3D driver/docs/drivers/zink.rst +++ b/mesa 3D driver/docs/drivers/zink.rst @@ -227,6 +227,23 @@ are required to be supported * `VK_KHR_draw_indirect_count`_ +Performance +----------- + +If you notice poor performance and high CPU usage while running an application, +changing the descriptor manager may improve performance: + +.. envvar:: ZINK_DESCRIPTORS ("auto") + +``auto`` + Automatically detect best mode. This is the default. +``lazy`` + Disable caching and attempt to use the least amount of CPU. +``nofallback`` + Always use caching to try reducing GPU churn. +``notemplates`` + The same as ``auto``, but disables the use of ``VK_KHR_descriptor_update_template``. + Debugging --------- diff --git a/mesa 3D driver/docs/envvars.rst b/mesa 3D driver/docs/envvars.rst index df9c0434ab..92e568715e 100644 --- a/mesa 3D driver/docs/envvars.rst +++ b/mesa 3D driver/docs/envvars.rst @@ -232,12 +232,13 @@ the :doc:`Xlib software driver page ` for details. :envvar:`MESA_GLX_ALPHA_BITS` specifies default number of bits for alpha channel. -i945/i965 driver environment variables (non-Gallium) +Intel driver environment variables ---------------------------------------------------- -:envvar:`INTEL_NO_HW` - if set to 1, prevents batches from being submitted to the hardware. - This is useful for debugging hangs, etc. +:envvar:`INTEL_BLACKHOLE_DEFAULT` + if set to 1, true or yes, then the OpenGL implementation will + default ``GL_BLACKHOLE_RENDER_INTEL`` to true, thus disabling any + rendering. :envvar:`INTEL_DEBUG` a comma-separated list of named flags, which do various things: @@ -334,8 +335,65 @@ i945/i965 driver environment variables (non-Gallium) ``vs`` dump shader assembly for vertex shaders -:envvar:`INTEL_SCALAR_VS` (or ``TCS``, ``TES``, ``GS``) - force scalar/vec4 mode for a shader stage (Gen8-9 only) +:envvar:`INTEL_MEASURE` + Collects GPU timestamps over common intervals, and generates a CSV report + to show how long rendering took. The overhead of collection is limited to + the flushing that is required at the interval boundaries for accurate + timestamps. By default, timing data is sent to ``stderr``.
To direct output + to a file: + + ``INTEL_MEASURE=file=/tmp/measure.csv {workload}`` + + To begin capturing timestamps at a particular frame: + + ``INTEL_MEASURE=file=/tmp/measure.csv,start=15 {workload}`` + + To capture only 23 frames: + + ``INTEL_MEASURE=count=23 {workload}`` + + To capture frames 15-37, stopping before frame 38: + + ``INTEL_MEASURE=start=15,count=23 {workload}`` + + Designate an asynchronous control file with: + + ``INTEL_MEASURE=control=path/to/control.fifo {workload}`` + + As the workload runs, enable capture for 5 frames with: + + ``$ echo 5 > path/to/control.fifo`` + + Enable unbounded capture: + + ``$ echo -1 > path/to/control.fifo`` + + and disable with: + + ``$ echo 0 > path/to/control.fifo`` + + Select the boundaries of each snapshot with: + + ``INTEL_MEASURE=draw`` + Collects timings for every render (DEFAULT) + + ``INTEL_MEASURE=rt`` + Collects timings when the render target changes + + ``INTEL_MEASURE=batch`` + Collects timings when batches are submitted + + ``INTEL_MEASURE=frame`` + Collects timings at frame boundaries + + With ``INTEL_MEASURE=interval=5``, the duration of 5 events will be + combined into a single record in the output. When possible, a single + start and end event will be submitted to the GPU to minimize + stalling. Combined events will not span batches, except in + the case of ``INTEL_MEASURE=frame``. +:envvar:`INTEL_NO_HW` + if set to 1, true or yes, prevents batches from being submitted to the + hardware. This is useful for debugging hangs, etc. :envvar:`INTEL_PRECISE_TRIG` if set to 1, true or yes, then the driver prefers accuracy over performance in trig functions. @@ -353,10 +411,6 @@ i945/i965 driver environment variables (non-Gallium) The success of assembly override would be signified by "Successfully overrode shader with sha1 " in stderr replacing the original assembly. -:envvar:`INTEL_BLACKHOLE_DEFAULT` - if set to 1, true or yes, then the OpenGL implementation will - default ``GL_BLACKHOLE_RENDER_INTEL`` to true, thus disabling any - rendering. Radeon driver environment variables (radeon, r200, and r300g) @@ -365,6 +419,13 @@ Radeon driver environment variables (radeon, r200, and r300g) :envvar:`RADEON_NO_TCL` if set, disable hardware-accelerated Transform/Clip/Lighting. +DRI environment variables +------------------------- + +:envvar:`DRI_NO_MSAA` + disable MSAA for GLX/EGL MSAA visuals + + EGL environment variables ------------------------- @@ -557,8 +618,6 @@ RADV driver environment variables force all allocated buffers to be referenced in submissions ``checkir`` validate the LLVM IR before LLVM compiles the shader - ``errors`` - display more info about errors ``forcecompress`` Enables DCC,FMASK,CMASK,HTILE in situations where the driver supports it but normally does not deem it beneficial. @@ -574,6 +633,8 @@ RADV driver environment variables class of application bugs appearing as flickering. ``metashaders`` dump internal meta shaders + ``noatocdithering`` + disable dithering for alpha to coverage ``nobinning`` disable primitive binning ``nocache`` @@ -596,6 +657,8 @@ RADV driver environment variables disable memory shaders cache ``nongg`` disable NGG for GFX10+ + ``nonggc`` + disable NGG culling on GPUs where it's enabled by default (GFX10.3+ only). 
``nooutoforder`` disable out-of-order rasterization ``notccompatcmask`` @@ -607,6 +670,8 @@ RADV driver environment variables disable VRS for flat shading (only on GFX10.3+) ``preoptir`` dump LLVM IR before any optimizations + ``prologs`` + dump vertex shader prologs ``shaders`` dump shaders ``shaderstats`` @@ -639,6 +704,9 @@ RADV driver environment variables enable wave32 for compute shaders (GFX10+) ``dccmsaa`` enable DCC for MSAA images + ``force_emulate_rt`` + forces ray-tracing to be emulated in software, + even if there is hardware support. ``gewave32`` enable wave32 for vertex/tess/geometry shaders (GFX10+) ``localbos`` @@ -648,7 +716,7 @@ RADV driver environment variables ``pswave32`` enable wave32 for pixel shaders (GFX10+) ``nggc`` - enable NGG culling on GFX10+ GPUs. + enable NGG culling on GPUs where it's not enabled by default (GFX10.1 only). ``rt`` enable rt extensions whose implementation is still experimental. ``sam`` @@ -690,8 +758,6 @@ radeonsi driver environment variables Disable DCC. ``nodccclear`` Disable DCC fast clear. - ``nodccfb`` - Disable separate DCC on the main framebuffer ``nodccmsaa`` Disable DCC for MSAA ``nodpbb`` @@ -768,12 +834,6 @@ radeonsi driver environment variables Always use NGG culling even when it can hurt. ``nonggc`` Disable NGG culling. - ``alwayspd`` - Always enable the primitive discard compute shader. - ``pd`` - Enable the primitive discard compute shader for large draw calls. - ``nopd`` - Disable the primitive discard compute shader. ``switch_on_eop`` Program WD/IA to switch on end-of-packet. ``nooutoforder`` diff --git a/mesa 3D driver/docs/features.txt b/mesa 3D driver/docs/features.txt index dc4c330c6b..be0e946e1b 100644 --- a/mesa 3D driver/docs/features.txt +++ b/mesa 3D driver/docs/features.txt @@ -41,7 +41,7 @@ GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv glBindFragDataLocation, glGetFragDataLocation DONE GL_NV_conditional_render (Conditional rendering) DONE () GL_ARB_map_buffer_range (Map buffer subranges) DONE (v3d, vc4, lima) - GL_ARB_color_buffer_float (Clamping controls) DONE (v3d) + GL_ARB_color_buffer_float (Clamping controls) DONE (v3d, lima) GL_ARB_texture_float (Float textures, renderbuffers) DONE (v3d) GL_EXT_packed_float DONE (v3d) GL_EXT_texture_shared_exponent DONE (v3d) @@ -74,7 +74,7 @@ GL 3.1, GLSL 1.40 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llv Forward compatible context support/deprecations DONE GL_ARB_draw_instanced (Instanced drawing) DONE (v3d) GL_ARB_copy_buffer (Buffer copying) DONE (v3d, vc4, lima) - GL_NV_primitive_restart (Primitive restart) DONE () + GL_NV_primitive_restart (Primitive restart) DONE (v3d) 16 vertex texture image units DONE () GL_ARB_texture_buffer_object (Texture buffer objs) DONE () GL_ARB_texture_rectangle (Rectangular textures) DONE (v3d, vc4, lima) @@ -206,14 +206,14 @@ GL 4.4, GLSL 4.40 -- all DONE: i965/gen8+, nvc0, r600, radeonsi, llvmpipe, zink - input/output block locations DONE GL_ARB_multi_bind DONE (all drivers) GL_ARB_query_buffer_object DONE (i965/hsw+, virgl) - GL_ARB_texture_mirror_clamp_to_edge DONE (i965, nv50, softpipe, swr, virgl, panfrost) + GL_ARB_texture_mirror_clamp_to_edge DONE (i965, nv50, softpipe, swr, virgl, v3d, panfrost) GL_ARB_texture_stencil8 DONE (freedreno, i965/hsw+, nv50, softpipe, swr, virgl, v3d, panfrost, d3d12) GL_ARB_vertex_type_10f_11f_11f_rev DONE (freedreno, i965, nv50, softpipe, swr, virgl, panfrost, d3d12) GL 4.5, GLSL 4.50 -- all DONE: nvc0, r600, radeonsi, llvmpipe, zink 
GL_ARB_ES3_1_compatibility DONE (i965/hsw+, softpipe, virgl) - GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, swr, virgl) + GL_ARB_clip_control DONE (freedreno, i965, nv50, softpipe, swr, virgl, lima) GL_ARB_conditional_render_inverted DONE (freedreno, i965, nv50, softpipe, swr, virgl, panfrost) GL_ARB_cull_distance DONE (freedreno/a6xx, i965, nv50, softpipe, swr, virgl) GL_ARB_derivative_control DONE (i965, nv50, softpipe, virgl) @@ -268,40 +268,40 @@ GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, r600, radeonsi, virgl, v3d, s glGetBooleani_v - restrict to GLES enums gl_HelperInvocation support DONE (i965, r600, panfrost) -GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+, radeonsi, virgl, llvmpipe +GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+, radeonsi, virgl, llvmpipe, zink GL_EXT_color_buffer_float DONE (all drivers) GL_KHR_blend_equation_advanced DONE (freedreno/a6xx, i965, nvc0, panfrost) GL_KHR_debug DONE (all drivers) - GL_KHR_robustness DONE (freedreno, i965, nvc0, r600, zink) - GL_KHR_texture_compression_astc_ldr DONE (freedreno, i965/gen9+, r600, v3d, vc4, panfrost, softpipe, swr, zink, lima) + GL_KHR_robustness DONE (freedreno, i965, nvc0, r600) + GL_KHR_texture_compression_astc_ldr DONE (freedreno, i965/gen9+, r600, v3d, vc4, panfrost, softpipe, swr, lima) GL_OES_copy_image DONE (all drivers) GL_OES_draw_buffers_indexed DONE (all drivers that support GL_ARB_draw_buffers_blend) GL_OES_draw_elements_base_vertex DONE (all drivers) - GL_OES_geometry_shader DONE (freedreno/a6xx, i965/hsw+, nvc0, r600, softpipe, v3d, zink) + GL_OES_geometry_shader DONE (freedreno/a6xx, i965/hsw+, nvc0, r600, softpipe, v3d) GL_OES_gpu_shader5 DONE (freedreno/a6xx, all drivers that support GL_ARB_gpu_shader5) - GL_OES_primitive_bounding_box DONE (freedreno/a5xx+, i965/gen7+, nvc0, r600, softpipe, v3d, zink) - GL_OES_sample_shading DONE (freedreno/a6xx, i965, nvc0, r600, zink, panfrost, zink) - GL_OES_sample_variables DONE (freedreno/a6xx, i965, nvc0, r600, zink, panfrost/bifrost, zink) + GL_OES_primitive_bounding_box DONE (freedreno/a5xx+, i965/gen7+, nvc0, r600, softpipe, v3d) + GL_OES_sample_shading DONE (freedreno/a6xx, i965, nvc0, r600, panfrost) + GL_OES_sample_variables DONE (freedreno/a6xx, i965, nvc0, r600, panfrost/bifrost) GL_OES_shader_image_atomic DONE (all drivers that support GL_ARB_shader_image_load_store) GL_OES_shader_io_blocks DONE (All drivers that support GLES 3.1) - GL_OES_shader_multisample_interpolation DONE (freedreno/a6xx, i965, nvc0, r600, zink) + GL_OES_shader_multisample_interpolation DONE (freedreno/a6xx, i965, nvc0, r600) GL_OES_tessellation_shader DONE (freedreno/a6xx, all drivers that support GL_ARB_tessellation_shader) GL_OES_texture_border_clamp DONE (all drivers) - GL_OES_texture_buffer DONE (freedreno, i965, nvc0, r600, softpipe, panfrost, zink) - GL_OES_texture_cube_map_array DONE (freedreno/a4xx+, i965/hsw+, nvc0, r600, softpipe, zink) + GL_OES_texture_buffer DONE (freedreno, i965, nvc0, r600, softpipe, panfrost) + GL_OES_texture_cube_map_array DONE (freedreno/a4xx+, i965/hsw+, nvc0, r600, softpipe) GL_OES_texture_stencil8 DONE (all drivers that support GL_ARB_texture_stencil8) GL_OES_texture_storage_multisample_2d_array DONE (all drivers that support GL_ARB_texture_multisample) Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES version: - GL_ARB_bindless_texture DONE (nvc0, radeonsi) + GL_ARB_bindless_texture DONE (nvc0, radeonsi, zink) GL_ARB_cl_event not started GL_ARB_compute_variable_group_size DONE (i965/gen7+, nvc0, 
radeonsi, zink) GL_ARB_ES3_2_compatibility DONE (i965/gen8+, radeonsi, virgl, zink) GL_ARB_fragment_shader_interlock DONE (i965, zink) GL_ARB_gpu_shader_int64 DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe, zink) - GL_ARB_parallel_shader_compile DONE (all drivers) + GL_ARB_parallel_shader_compile DONE (freedreno, iris, radeonsi) GL_ARB_post_depth_coverage DONE (i965, nvc0, radeonsi, llvmpipe, zink) GL_ARB_robustness_isolation not started GL_ARB_sample_locations DONE (nvc0, zink) @@ -316,12 +316,12 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve GL_ARB_sparse_texture2 not started GL_ARB_sparse_texture_clamp not started GL_ARB_texture_filter_minmax DONE (nvc0/gm200+, zink) - GL_EXT_color_buffer_half_float DONE (gallium drivers supporting required formats) + GL_EXT_color_buffer_half_float DONE (freedreno, i965, iris, llvmpipe, nv50, nvc0, radeonsi, zink) GL_EXT_depth_bounds_test DONE (i965/gen12+, nv50, nvc0, radeonsi, softpipe, swr, zink) - GL_EXT_memory_object DONE (radeonsi, i965/gen7+) - GL_EXT_memory_object_fd DONE (radeonsi, i965/gen7+) + GL_EXT_memory_object DONE (radeonsi, i965/gen7+, llvmpipe) + GL_EXT_memory_object_fd DONE (radeonsi, i965/gen7+, llvmpipe) GL_EXT_memory_object_win32 not started - GL_EXT_multisampled_render_to_texture DONE (freedreno/a6xx, panfrost) + GL_EXT_multisampled_render_to_texture DONE (freedreno/a6xx, panfrost, zink) GL_EXT_render_snorm DONE (i965, r600, radeonsi, softpipe, zink) GL_EXT_semaphore DONE (radeonsi, i965/gen7+) GL_EXT_semaphore_fd DONE (radeonsi, i965/gen7+) @@ -448,23 +448,23 @@ Vulkan 1.2 -- all DONE: anv, vn VK_KHR_8bit_storage DONE (anv/gen8+, lvp, radv, vn) VK_KHR_buffer_device_address DONE (anv/gen8+, lvp, radv, vn) VK_KHR_create_renderpass2 DONE (anv, lvp, radv, tu, vn) - VK_KHR_depth_stencil_resolve DONE (anv, radv, tu, vn) + VK_KHR_depth_stencil_resolve DONE (anv, lvp, radv, tu, vn) VK_KHR_draw_indirect_count DONE (anv, lvp, radv, tu, vn) VK_KHR_driver_properties DONE (anv, lvp, radv, vn) VK_KHR_image_format_list DONE (anv, lvp, radv, tu, v3dv, vn) - VK_KHR_imageless_framebuffer DONE (anv, lvp, radv, vn) + VK_KHR_imageless_framebuffer DONE (anv, lvp, radv, tu, vn) VK_KHR_sampler_mirror_clamp_to_edge DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_separate_depth_stencil_layouts DONE (anv, lvp, radv, vn) VK_KHR_shader_atomic_int64 DONE (anv/gen9+, lvp, radv, vn) - VK_KHR_shader_float16_int8 DONE (anv/gen8+, radv, tu, vn) - VK_KHR_shader_float_controls DONE (anv/gen8+, radv, tu, vn) - VK_KHR_shader_subgroup_extended_types DONE (anv/gen8+, radv, vn) - VK_KHR_spirv_1_4 DONE (anv, radv, tu, vn) + VK_KHR_shader_float16_int8 DONE (anv/gen8+, lvp, radv, tu, vn) + VK_KHR_shader_float_controls DONE (anv/gen8+, lvp, radv, tu, vn) + VK_KHR_shader_subgroup_extended_types DONE (anv/gen8+, lvp, radv, tu, vn) + VK_KHR_spirv_1_4 DONE (anv, lvp, radv, tu, vn) VK_KHR_timeline_semaphore DONE (anv, lvp, radv, tu, vn) - VK_KHR_uniform_buffer_standard_layout DONE (anv, lvp, radv, v3dv, vn) + VK_KHR_uniform_buffer_standard_layout DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_vulkan_memory_model DONE (anv, radv, tu, vn) VK_EXT_descriptor_indexing DONE (anv/gen9+, radv, tu, vn) - VK_EXT_host_query_reset DONE (anv, lvp, radv, tu, vn) + VK_EXT_host_query_reset DONE (anv, lvp, radv, tu, v3dv, vn) VK_EXT_sampler_filter_minmax DONE (anv/gen9+, lvp, radv, tu, vn) VK_EXT_scalar_block_layout DONE (anv, lvp, radv/gfx7+, tu, vn) VK_EXT_separate_stencil_usage DONE (anv, lvp, tu, vn) @@ -477,28 +477,29 @@ Khronos extensions that are not part 
of any Vulkan version: VK_KHR_deferred_host_operations DONE (anv, radv) VK_KHR_display DONE (anv, lvp, radv, tu, v3dv) VK_KHR_display_swapchain not started - VK_KHR_external_fence_fd DONE (anv, radv, tu, v3dv) + VK_KHR_external_fence_fd DONE (anv, radv, tu, v3dv, vn) VK_KHR_external_fence_win32 not started - VK_KHR_external_memory_fd DONE (anv, radv, tu, v3dv) + VK_KHR_external_memory_fd DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_external_memory_win32 not started - VK_KHR_external_semaphore_fd DONE (anv, radv, tu, v3dv) + VK_KHR_external_semaphore_fd DONE (anv, radv, tu, v3dv, vn) VK_KHR_external_semaphore_win32 not started VK_KHR_fragment_shading_rate not started VK_KHR_get_display_properties2 DONE (anv, lvp, radv, tu, v3dv) VK_KHR_get_surface_capabilities2 DONE (anv, lvp, radv, tu, v3dv, vn) - VK_KHR_incremental_present DONE (anv, lvp, radv, tu, v3dv) + VK_KHR_incremental_present DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_performance_query DONE (anv/gen8+, tu) VK_KHR_pipeline_executable_properties DONE (anv, radv, tu) VK_KHR_push_descriptor DONE (anv, lvp, radv, tu) VK_KHR_shader_clock DONE (anv, radv) + VK_KHR_shader_integer_dot_product DONE (radv) VK_KHR_shader_non_semantic_info DONE (anv, radv) VK_KHR_shader_subgroup_uniform_control_flow DONE (anv, radv) VK_KHR_shader_terminate_invocation DONE (anv, radv, tu) VK_KHR_shared_presentable_image not started VK_KHR_surface DONE (anv, lvp, radv, tu, v3dv, vn) - VK_KHR_surface_protected_capabilities DONE (anv, lvp, radv, vn) - VK_KHR_swapchain DONE (anv, lvp, radv, tu, v3dv) - VK_KHR_swapchain_mutable_format DONE (anv, radv) + VK_KHR_surface_protected_capabilities DONE (anv, lvp, radv, v3dv, vn) + VK_KHR_swapchain DONE (anv, lvp, radv, tu, v3dv, vn) + VK_KHR_swapchain_mutable_format DONE (anv, radv, v3dv, vn) VK_KHR_wayland_surface DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_workgroup_memory_explicit_layout DONE (anv, radv) VK_KHR_win32_keyed_mutex not started @@ -506,40 +507,40 @@ Khronos extensions that are not part of any Vulkan version: VK_KHR_xcb_surface DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_xlib_surface DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_zero_initialize_workgroup_memory DONE (anv, radv) - VK_EXT_4444_formats DONE (anv, radv, tu) + VK_EXT_4444_formats DONE (anv, lvp, radv, tu) VK_EXT_calibrated_timestamps DONE (anv, lvp, radv) VK_EXT_color_write_enable DONE (anv, lvp, v3dv) VK_EXT_conditional_rendering DONE (anv, lvp, radv, tu) VK_EXT_conservative_rasterization DONE (anv/gen9+, radv) VK_EXT_custom_border_color DONE (anv, lvp, radv, tu, v3dv) VK_EXT_debug_marker DONE (radv) - VK_EXT_depth_clip_enable DONE (anv, radv, tu) + VK_EXT_depth_clip_enable DONE (anv, lvp, radv, tu) VK_EXT_depth_range_unrestricted DONE (radv) VK_EXT_discard_rectangles DONE (radv) VK_EXT_display_control DONE (anv, radv, tu) VK_EXT_extended_dynamic_state DONE (anv, lvp, radv, tu) VK_EXT_extended_dynamic_state2 DONE (anv, lvp, radv) - VK_EXT_external_memory_dma_buf DONE (anv, radv, tu, v3dv) + VK_EXT_external_memory_dma_buf DONE (anv, radv, tu, v3dv, vn) VK_EXT_external_memory_host DONE (anv, lvp, radv) VK_EXT_filter_cubic DONE (tu/a650) VK_EXT_fragment_shader_interlock DONE (anv/gen9+) VK_EXT_global_priority DONE (anv, radv) - VK_EXT_image_drm_format_modifier DONE (anv, radv/gfx9+, tu) + VK_EXT_image_drm_format_modifier DONE (anv, radv/gfx9+, tu, vn) VK_EXT_image_robustness DONE (anv, radv) VK_EXT_index_type_uint8 DONE (anv, lvp, radv/gfx8+, v3dv, tu) VK_EXT_inline_uniform_block DONE (anv, radv) - VK_EXT_line_rasterization DONE (anv, lvp, radv) + 
VK_EXT_line_rasterization DONE (anv, lvp, radv, tu) VK_EXT_memory_budget DONE (anv, radv, tu) VK_EXT_memory_priority DONE (radv) VK_EXT_multi_draw DONE (anv, lvp, radv) VK_EXT_pci_bus_info DONE (anv, radv) VK_EXT_physical_device_drm DONE (anv, radv, v3dv) VK_EXT_pipeline_creation_cache_control DONE (anv, radv, v3dv) - VK_EXT_pipeline_creation_feedback DONE (anv, radv) + VK_EXT_pipeline_creation_feedback DONE (anv, radv, v3dv) VK_EXT_post_depth_coverage DONE (anv/gfx10+, lvp, radv) VK_EXT_private_data DONE (anv, lvp, radv, tu, v3dv) VK_EXT_provoking_vertex DONE (anv, lvp, radv, tu, v3dv) - VK_EXT_queue_family_foreign DONE (anv, radv) + VK_EXT_queue_family_foreign DONE (anv, radv, vn) VK_EXT_robustness2 DONE (anv, radv, tu) VK_EXT_sample_locations DONE (anv, radv/gfx9-, tu/a650) VK_EXT_shader_atomic_float DONE (anv, radv) @@ -552,11 +553,11 @@ Khronos extensions that are not part of any Vulkan version: VK_EXT_subgroup_size_control DONE (anv, radv) VK_EXT_texel_buffer_alignment DONE (anv, radv) VK_EXT_transform_feedback DONE (anv, lvp, radv, tu, vn) - VK_EXT_vertex_attribute_divisor DONE (anv, radv, lvp, tu) - VK_EXT_vertex_input_dynamic_state DONE (lvp) + VK_EXT_vertex_attribute_divisor DONE (anv, radv, lvp, tu, v3dv) + VK_EXT_vertex_input_dynamic_state DONE (lvp, radv) VK_EXT_ycbcr_image_arrays DONE (anv, radv) - VK_ANDROID_external_memory_android_hardware_buffer DONE (anv, radv) - VK_ANDROID_native_buffer DONE (anv, radv) + VK_ANDROID_external_memory_android_hardware_buffer DONE (anv, radv, vn) + VK_ANDROID_native_buffer DONE (anv, radv, vn) VK_GOOGLE_decorate_string DONE (anv, lvp, radv) VK_GOOGLE_hlsl_functionality1 DONE (anv, lvp, radv) VK_GOOGLE_user_type DONE (anv, radv) diff --git a/mesa 3D driver/docs/gallium/context.rst b/mesa 3D driver/docs/gallium/context.rst index e9332edfd7..6b35a981d9 100644 --- a/mesa 3D driver/docs/gallium/context.rst +++ b/mesa 3D driver/docs/gallium/context.rst @@ -118,6 +118,8 @@ objects. They all follow simple, one-method binding calls, e.g. levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``. * ``default_inner_level`` is the default value for the inner tessellation levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``. +* ``set_patch_vertices`` sets the number of vertices per input patch + for tessellation. * ``set_debug_callback`` sets the callback to be used for reporting various debug messages, eventually reported via KHR_debug and diff --git a/mesa 3D driver/docs/gallium/cso/rasterizer.rst b/mesa 3D driver/docs/gallium/cso/rasterizer.rst index e75f5886d8..48986c07f4 100644 --- a/mesa 3D driver/docs/gallium/cso/rasterizer.rst +++ b/mesa 3D driver/docs/gallium/cso/rasterizer.rst @@ -326,10 +326,15 @@ clip_halfz When true clip space in the z axis goes from [0..1] (D3D). When false [-1, 1] (GL) -depth_clip - When false, the near and far depth clipping planes of the view volume are - disabled and the depth value will be clamped at the per-pixel level, after - polygon offset has been applied and before depth testing. +depth_clip_near + When false, the near depth clipping plane of the view volume is disabled. +depth_clip_far + When false, the far depth clipping plane of the view volume is disabled. +depth_clamp + Whether the depth value will be clamped to the interval defined by the + near and far depth range at the per-pixel level, after polygon offset has + been applied and before depth testing. Note that a clamp to [0,1] according + to GL rules should always happen even if this is disabled. 
clip_plane_enable For each k in [0, PIPE_MAX_CLIP_PLANES), if bit k of this field is set, diff --git a/mesa 3D driver/docs/gallium/screen.rst b/mesa 3D driver/docs/gallium/screen.rst index 879714fd40..b8f3c2fcf5 100644 --- a/mesa 3D driver/docs/gallium/screen.rst +++ b/mesa 3D driver/docs/gallium/screen.rst @@ -80,6 +80,9 @@ The integer capabilities: disabling depth clipping (through pipe_rasterizer_state) separately for the near and far plane. If not, depth_clip_near and depth_clip_far will be equal. + ``PIPE_CAP_DEPTH_CLAMP_ENABLE``: Whether the driver is capable of + enabling depth clamping (through pipe_rasterizer_state) separately from depth + clipping. If not, depth_clamp will be the inverse of depth_clip_far. * ``PIPE_CAP_SHADER_STENCIL_EXPORT``: Whether a stencil reference value can be written from a fragment shader. * ``PIPE_CAP_TGSI_INSTANCEID``: Whether TGSI_SEMANTIC_INSTANCEID is supported @@ -618,6 +621,8 @@ The integer capabilities: * ``PIPE_CAP_EMULATE_NONFIXED_PRIMITIVE_RESTART``: Driver requests all draws using a non-fixed restart index to be rewritten to use a fixed restart index. * ``PIPE_CAP_SUPPORTED_PRIM_MODES``: A bitmask of the ``pipe_prim_type`` enum values that the driver can natively support. * ``PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART``: A bitmask of the ``pipe_prim_type`` enum values that the driver can natively support for primitive restart. Only useful if ``PIPE_CAP_PRIMITIVE_RESTART`` is also exported. +* ``PIPE_CAP_PREFER_BACK_BUFFER_REUSE``: Only applies to DRI_PRIME. If 1, the driver prefers that DRI3 tries to use the same back buffer each frame. If 0, this means DRI3 will at least use 2 back buffers and ping-pong between them to allow the tiled->linear copy to run in parallel. +* ``PIPE_CAP_DRAW_VERTEX_STATE``: Driver supports `pipe_screen::create_vertex_state/vertex_state_destroy` and `pipe_context::draw_vertex_state`. Only used by display lists and designed to serve vbo_save. .. _pipe_capf: diff --git a/mesa 3D driver/docs/gallium/tgsi.rst b/mesa 3D driver/docs/gallium/tgsi.rst index 404067862c..197ef6f1ad 100644 --- a/mesa 3D driver/docs/gallium/tgsi.rst +++ b/mesa 3D driver/docs/gallium/tgsi.rst @@ -3561,11 +3561,6 @@ interpolation should be done at, one of ``TGSI_INTERPOLATE_LOC_*``. Note that when per-sample shading is enabled, the implementation may choose to interpolate at the sample irrespective of the Location field. -The CylindricalWrap bitfield specifies which register components -should be subject to cylindrical wrapping when interpolating by the -rasteriser. If TGSI_CYLINDRICAL_WRAP_X is set to 1, the X component -should be interpolated according to cylindrical wrapping rules. - Declaration Sampler View ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/mesa 3D driver/docs/isl/tiling.rst b/mesa 3D driver/docs/isl/tiling.rst index ded1fad171..44a44b461f 100644 --- a/mesa 3D driver/docs/isl/tiling.rst +++ b/mesa 3D driver/docs/isl/tiling.rst @@ -246,6 +246,36 @@ ISL, we represent a W-tile as a tiling with a logical dimension of 64el x 64el but a physical size of 128B x 32rows. This cleanly takes care of the pitch issue above and seems to nicely model the hardware. +Tile4 +----- + +The tile4 format, introduced on Xe-HP, is somewhat similar to Y but with more +internal shuffling. 
Each tile4 tile is an 8x8 grid of cache lines arranged +as follows: + +===== ===== ===== ===== ===== ===== ===== ===== +===== ===== ===== ===== ===== ===== ===== ===== +0x000 0x040 0x080 0x0c0 0x200 0x240 0x280 0x2c0 +0x100 0x140 0x180 0x1c0 0x300 0x340 0x380 0x3c0 +0x400 0x440 0x480 0x4c0 0x600 0x640 0x680 0x6c0 +0x500 0x540 0x580 0x5c0 0x700 0x740 0x780 0x7c0 +0x800 0x840 0x880 0x8c0 0xa00 0xa40 0xa80 0xac0 +0x900 0x940 0x980 0x9c0 0xb00 0xb40 0xb80 0xbc0 +0xc00 0xc40 0xc80 0xcc0 0xe00 0xe40 0xe80 0xec0 +0xd00 0xd40 0xd80 0xdc0 0xf00 0xf40 0xf80 0xfc0 +===== ===== ===== ===== ===== ===== ===== ===== + +Each 64B cache line within the tile is laid out the same way as for a Y-tile, +as 4 rows of 16B each: + +==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== +==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== +0x00 0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0a 0x0b 0x0c 0x0d 0x0e 0x0f +0x10 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 0x19 0x1a 0x1b 0x1c 0x1d 0x1e 0x1f +0x20 0x21 0x22 0x23 0x24 0x25 0x26 0x27 0x28 0x29 0x2a 0x2b 0x2c 0x2d 0x2e 0x2f +0x30 0x31 0x32 0x33 0x34 0x35 0x36 0x37 0x38 0x39 0x3a 0x3b 0x3c 0x3d 0x3e 0x3f +==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== + Tiling as a bit pattern ----------------------- @@ -281,6 +311,7 @@ the tile are given by the table below: :cpp:enumerator:`isl_tiling::ISL_TILING_X` :math:`v_2` :math:`v_1` :math:`v_0` :math:`u_8` :math:`u_7` :math:`u_6` :math:`u_5` :math:`u_4` :math:`u_3` :math:`u_2` :math:`u_1` :math:`u_0` :cpp:enumerator:`isl_tiling::ISL_TILING_Y0` :math:`u_6` :math:`u_5` :math:`u_4` :math:`v_4` :math:`v_3` :math:`v_2` :math:`v_1` :math:`v_0` :math:`u_3` :math:`u_2` :math:`u_1` :math:`u_0` :cpp:enumerator:`isl_tiling::ISL_TILING_W` :math:`u_5` :math:`u_4` :math:`u_3` :math:`v_5` :math:`v_4` :math:`v_3` :math:`v_2` :math:`u_2` :math:`v_1` :math:`u_1` :math:`v_0` :math:`u_0` +:cpp:enumerator:`isl_tiling::ISL_TILING_4` :math:`v_4` :math:`v_3` :math:`u_6` :math:`v_2` :math:`u_5` :math:`u_4` :math:`v_1` :math:`v_0` :math:`u_3` :math:`u_2` :math:`u_1` :math:`u_0` =========================================== =========== =========== =========== =========== =========== =========== =========== =========== =========== =========== =========== =========== Constructing the mapping this way makes a lot of sense when you think about diff --git a/mesa 3D driver/docs/release-calendar.csv b/mesa 3D driver/docs/release-calendar.csv index d9bfd245bc..018c3e5e74 100644 --- a/mesa 3D driver/docs/release-calendar.csv +++ b/mesa 3D driver/docs/release-calendar.csv @@ -1,10 +1,6 @@ -21.1,2021-08-25,21.1.8,Eric Engestrom, -21.2,2021-08-18,21.2.1,Dylan Baker -,2021-09-01,21.2.2,Dylan Baker -,2021-09-15,21.2.3,Dylan Baker, -,2021-09-29,21.2.4,Dylan Baker, -,2021-10-13,21.2.5,Dylan Baker, -,2021-10-27,21.2.6,Dylan Baker, -,2021-11-10,21.2.7,Dylan Baker, -,2021-11-24,21.2.8,Dylan Baker, -,2021-12-08,21.2.9,Dylan Baker,This is the last planned release of the 21.2.x series. +21.2,2021-10-27,21.2.5,Dylan Baker, +,2021-11-10,21.2.6,Dylan Baker, +,2021-11-24,21.2.7,Dylan Baker, +,2021-12-08,21.2.8,Dylan Baker,Last planned 21.2.x release.
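As a worked example of the ISL_TILING_4 bit pattern documented above (an editor's sketch, not part of the patch), the byte offset of element (u, v) within a single tile4 tile can be computed by interleaving the coordinate bits exactly as that table row lists them::

   #include <stdint.h>

   /* Offset bits, MSB to LSB: v4 v3 u6 v2 u5 u4 v1 v0 u3 u2 u1 u0.
    * u is the byte column within the tile (0..127), v the row (0..31). */
   static uint32_t
   tile4_offset(uint32_t u, uint32_t v)
   {
      return  (u & 0xf)                  /* u3..u0 -> bits 3:0   */
            | ((v & 0x3) << 4)           /* v1..v0 -> bits 5:4   */
            | (((u >> 4) & 0x3) << 6)    /* u5..u4 -> bits 7:6   */
            | (((v >> 2) & 0x1) << 8)    /* v2     -> bit  8     */
            | (((u >> 6) & 0x1) << 9)    /* u6     -> bit  9     */
            | (((v >> 3) & 0x3) << 10);  /* v4..v3 -> bits 11:10 */
   }

For instance, tile4_offset(64, 0) yields 0x200, matching the fifth cache line in the top row of the 8x8 grid above.
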
+21.3,2021-10-27,21.3.0-rc3,Eric Engestrom, +,2021-11-03,21.3.0-rc4,Eric Engestrom, diff --git a/mesa 3D driver/docs/relnotes.rst b/mesa 3D driver/docs/relnotes.rst index 3bbb338953..22163bfc13 100644 --- a/mesa 3D driver/docs/relnotes.rst +++ b/mesa 3D driver/docs/relnotes.rst @@ -3,6 +3,10 @@ Release Notes The release notes summarize what's new or changed in each Mesa release. +- :doc:`21.2.4 release notes <relnotes/21.2.4>` +- :doc:`21.2.3 release notes <relnotes/21.2.3>` +- :doc:`21.2.2 release notes <relnotes/21.2.2>` +- :doc:`21.1.8 release notes <relnotes/21.1.8>` - :doc:`21.1.7 release notes <relnotes/21.1.7>` - :doc:`21.2.0 release notes <relnotes/21.2.0>` - :doc:`21.1.6 release notes <relnotes/21.1.6>` @@ -342,6 +346,10 @@ release notes, or in the `old docs`_. :maxdepth: 1 :hidden: + relnotes/21.2.4 + relnotes/21.2.3 + relnotes/21.2.2 + relnotes/21.1.8 relnotes/21.1.7 relnotes/21.2.0 relnotes/21.1.6 diff --git a/mesa 3D driver/docs/relnotes/21.1.8.rst b/mesa 3D driver/docs/relnotes/21.1.8.rst new file mode 100644 index 0000000000..5b9322e9ac --- /dev/null +++ b/mesa 3D driver/docs/relnotes/21.1.8.rst @@ -0,0 +1,105 @@ +Mesa 21.1.8 Release Notes / 2021-09-08 +====================================== + +Mesa 21.1.8 is a bug fix release which fixes bugs found since the 21.1.7 release. + +Mesa 21.1.8 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is **only** available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. + +Mesa 21.1.8 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used.
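The apiVersion check described in the paragraph above amounts to a few lines of Vulkan client code (an editor's illustration, not part of the release notes)::

   #include <stdio.h>
   #include <vulkan/vulkan.h>

   static void
   print_driver_api_version(VkPhysicalDevice pdev)
   {
      VkPhysicalDeviceProperties props;
      vkGetPhysicalDeviceProperties(pdev, &props);
      /* Some drivers report a version lower than 1.2 here. */
      printf("Vulkan %u.%u.%u\n",
             VK_VERSION_MAJOR(props.apiVersion),
             VK_VERSION_MINOR(props.apiVersion),
             VK_VERSION_PATCH(props.apiVersion));
   }
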
+ +SHA256 checksum +--------------- + +:: + + 5cd32f5d089dca75300578a3d771a656eaed652090573a2655fe4e7022d56bfc mesa-21.1.8.tar.xz + + +New features +------------ + +- None + + +Bug fixes +--------- + +- llvmpipe doesn't compile a valid shader with an useless switch +- GetFragDataLocation(prog, "gl_FragColor") generates INVALID_OPERATION, but specs don't say it should +- Possible miscompilation of a comparison with unsigned zero +- dEQP-VK.wsi.android.swapchain.create#image_swapchain_create_info crash on Android R + + +Changes +------- + +Alyssa Rosenzweig (1): + +- drm-shim: Support kernels with >4k pages + +Boris Brezillon (1): + +- panfrost: Fix pan_blitter_emit_bifrost_blend() + +Eric Engestrom (3): + +- .pick_status.json: Update to 8bb9e9e76fa1f062c8da9536e9ee209b2dc268f7 +- Revert "python: Explicitly add the 'L' suffix on Python 3" +- isl: drop left-over comment + +Erik Faye-Lund (2): + +- gallium/nir/tgsi: fixup indentation +- gallium/nir/tgsi: initialize file_max for inputs + +Ilia Mirkin (1): + +- mesa: don't return errors for gl_* GetFragData* queries + +Jason Ekstrand (1): + +- anv: Set CONTEXT_PARAM_RECOVERABLE to false + +Lionel Landwerlin (1): + +- anv/android: handle image bindings from gralloc buffers + +Mao, Marc (1): + +- iris: declare padding for iris_vue_prog_key + +Marcin Ślusarz (2): + +- nir/builder: invalidate metadata per function +- glsl/opt_algebraic: disable invalid optimization + +Mike Blumenkrantz (1): + +- nir/lower_vectorize_tess_levels: set num_components for vectorized loads + +Roman Stratiienko (1): + +- lima: Implement lima_resource_get_param() callback + +Simon Ser (4): + +- etnaviv: add stride, offset and modifier to resource_get_param +- panfrost: implement resource_get_param +- vc4: implement resource_get_param +- v3d: implement resource_get_param + +Timothy Arceri (1): + +- glsl: fix variable scope for instructions inside case statements + +Vinson Lee (2): + +- meson: Remove duplicate xvmc in build summary. +- nir: Initialize evaluate_cube_face_index_amd dst.x. diff --git a/mesa 3D driver/docs/relnotes/21.2.2.rst b/mesa 3D driver/docs/relnotes/21.2.2.rst new file mode 100644 index 0000000000..f3f539cbe1 --- /dev/null +++ b/mesa 3D driver/docs/relnotes/21.2.2.rst @@ -0,0 +1,293 @@ +Mesa 21.2.2 Release Notes / 2021-09-21 +====================================== + +Mesa 21.2.2 is a bug fix release which fixes bugs found since the 21.2.1 release. + +Mesa 21.2.2 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is **only** available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. + +Mesa 21.2.2 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. 
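The corresponding OpenGL queries named in the paragraphs above look like this in client code (again an editor's sketch; a current GL context is assumed, and GL_MAJOR_VERSION/GL_MINOR_VERSION come from glext.h with older headers)::

   #include <stdio.h>
   #include <GL/gl.h>
   #include <GL/glext.h>   /* GL_MAJOR_VERSION / GL_MINOR_VERSION */

   static void
   print_gl_version(void)
   {
      GLint major = 0, minor = 0;
      glGetIntegerv(GL_MAJOR_VERSION, &major);
      glGetIntegerv(GL_MINOR_VERSION, &minor);
      printf("GL_VERSION = %s (%d.%d)\n",
             (const char *)glGetString(GL_VERSION), major, minor);
   }
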
+ +SHA256 checksum +--------------- + +:: + + c4aaf1bf974217ed825e1c536de6ab72a4e266d44bcf69fc4ec499039f99e5c4 mesa-21.2.2.tar.xz + + +New features +------------ + +- None + + +Bug fixes +--------- + +- RADV: consistent crash in Splitgate +- [RADV] The game "Aliens: Fireteam Elite" start crashing after commit 2e56e2342094e8ec90afa5265b1c43503f662939 +- llvmpipe doesn't compile a shader with an inner scope in a for loop +- llvmpipe doesn't compile the increment of a for a loop +- Mesa 21.2.1 implementation error: unexpected state[0] in make_state_flags() +- freedreno: regression in org.skia.skqp.SkQPRunner#gles_localmatriximagefilter +- [Radeonsi] VA-API Encoding no longer works on AMD PITCAIRN +- turnip: Geometry flickering in Genshin Impact after 83e9a7fbcf53b90d0de66985dbbf91986fc7b05d +- OSMesa problem resizing +- Memory leak: si_get_shader_binary_size is missing a call to ac_rtld_close +- dEQP-GLES3.stress.draw.unaligned_data.random.4 segfault +- gl_DrawID is incorrect for glMultiDrawElementsBaseVertex/glMultiDrawElementsIndirect +- i915: GPU hang when doing FB fetch and gl_FragDepth write in one shader +- ../mesa-9999/src/amd/compiler/aco_instruction_selection.cpp:10009:30: error: 'exchange' is not a member of 'std' +- radv: disable DCC for displayable images with storage on navi12/14 +- RADV: Menu static/artifacts in Doom Eternal +- Crash happens when testing GL_PIXEL_PACK_BUFFER +- panfrost G31 - Cathedral crash- opengl 2.1 game (I guess) +- panfrost / armv7 - crash with mesa newer than 21.0.3 +- freedreno C++14 build error +- llvmpipe doesn't compile a valid shader with an useless switch +- GetFragDataLocation(prog, "gl_FragColor") generates INVALID_OPERATION, but specs don't say it should + + +Changes +------- + +Adrian Bunk (1): + +- util/format: NEON is not available with the soft-float ABI + +Alyssa Rosenzweig (24): + +- panfrost: Handle non-dithered clear colours +- panfrost: Disable shader-assisted indirect draws +- pan/bi: Don't set td in blend shaders +- pan/bi: Correct the sr_count on +ST_TILE +- pan/bi: Extract load_sample_id to a helper +- pan/bi: Set the sample ID for blend shader LD_TILE +- pan/bi: Use CLPER_V6 on Mali G31 +- panfrost: Remove unneeded quirks from T760 +- panfrost: Use blendable check for tib read check +- pan/mdg: Insert moves before writeout when needed +- panfrost: Zero initialize blend_shaders +- panfrost: Fix NULL dereference in allowlist code +- panfrost: Protect the variants array with a lock +- panfrost: Don't use ralloc for resources +- panfrost: Move bo->label assignment into the lock +- panfrost: Switch resources from an array to a set +- panfrost: Cache number of users of a resource +- panfrost: Maintain a bitmap of active batches +- panfrost: Add foreach_batch iterator +- panfrost: Prefer batch->resources to rsrc->users +- panfrost: Remove rsrc->track.users +- panfrost: Remove writer = NULL assignments +- panfrost: Replace writers pointer with hash table +- panfrost: Raise maximum texture size + +Bas Nieuwenhuizen (2): + +- util/fossilize_db: Don't corrupt keys during entry read. +- nir: Avoid visiting instructions multiple times in nir_instr_free_and_dce. 
+ +Boris Brezillon (2): + +- panfrost: Add explicit padding to pan_blend_shader_key +- panfrost: v7 does not support RGB32_UNORM textures + +Connor Abbott (4): + +- ir3/ra: Fix available bitset for live-through collect srcs +- ir3/ra: Handle huge merge sets +- ir3/lower_pcopy: Use right flags for src const/immed +- ir3/lower_pcopy: Set entry->done in the swap loop + +Corentin Noël (1): + +- glx: Prevent crashes when an extension isn't found + +Daniel Schürmann (1): + +- aco: fix p_insert lowering with 16bit sources + +Danylo Piliaiev (1): + +- turnip: re-emit vertex params after they are invalidated + +Dave Airlie (5): + +- vulkan/wsi/sw: wait for image fence before submitting to queue +- crocus: copy views before adjusting +- crocus: add missing line smooth bits. +- crocus: add missing fs dirty on reduced prim change. +- crocus/gen7: add missing IVB/GT2 geom shader workaround. + +Dylan Baker (11): + +- docs: add SHA256 sum for mesa 21.2.1 +- .pick_status.json: Update to 35c3f5f08b7b11f3896412fb5778f127be329615 +- .pick_status.json: Update to 8e5e70bb3de7f75ab1b039e2cec2975ba59e4af7 +- .pick_status.json: Update to 572ed2249465acd4c5f8a229d504a48cbddf95a5 +- .pick_status.json: Update to 71e748ad2443c373bb090fa1da2626da367b1d20 +- .pick_status.json: Update to 9bc61108d73db4e614dda2a27750ff80165eedbb +- .pick_status.json: Update to a6a89aaa2f2943532d99d9bc7b80106a1740f237 +- .pick_status.json: Update to f4b61e90617f19ca1b8a3cfe046bac5801081057 +- .pick_status.json: Update to 076c8f041a63c74c31d9f541684860628a8b9979 +- .pick_status.json: Update to b58d6eaf1174aab296c4230e3895c65cba4bd9e3 +- .pick_status.json: Update to 7244aa19806cec5265e1e219cac1a99b0d3c62c6 + +Ed Martin (1): + +- winsys/radeonsi: Set vce_encode = true when VCE found + +Emma Anholt (2): + +- llvmpipe: Free CS shader images on context destroy. +- llvmpipe: Fix leak of CS local memory with 0 threads. + +Erik Faye-Lund (4): + +- gallivm: fix texture-mapping with 16-bit result +- gallium/nir/tgsi: fixup indentation +- gallium/nir/tgsi: initialize file_max for inputs +- lavapipe: fix reported subpixel precision for lines + +Filip Gawin (2): + +- nir: fix shadowed variable in nir_lower_bit_size.c +- nir: fix ifind_msb_rev by using appropriate type + +Ian Romanick (3): + +- util: Add and use functions to calculate min and max int for a size +- nir/lower_bit_size: Support add_sat and sub_sat +- nir/lower_gs_intrinsics: Return progress if append_set_vertex_and_primitive_count makes progress + +Icecream95 (1): + +- pan/bi: Extend bi_add_nop_for_atest for tilebuffer loads + +Ilia Mirkin (3): + +- mesa: don't return errors for gl_* GetFragData* queries +- glsl: fix explicit-location ifc matching in presence of array types +- freedreno: use OUT_WFI for emit_marker + +Jason Ekstrand (1): + +- anv: Set CONTEXT_PARAM_RECOVERABLE to false + +Jordan Justen (1): + +- intel/isl: Enable MOCS 61 for external surfaces on TGL + +Juan A. 
Suarez Romero (1): + +- broadcom/compiler: force a last thrsw for spilling + +Lionel Landwerlin (2): + +- nir: prevent peephole from generating invalid NIR +- intel/fs: fix framebuffer reads + +Mao, Marc (1): + +- iris: declare padding for iris_vue_prog_key + +Marcin Ślusarz (2): + +- glsl: propagate errors from \*=, /=, +=, -= operators +- glsl: break out early if compound assignment's operand errored out + +Marek Olšák (6): + +- mesa: remove unused indices parameter from validate functions +- mesa: fix gl_DrawID with indirect multi draws using user indirect buffer +- mesa: skip draw calls with unaligned indices +- radeonsi: fix a memory leak in si_get_shader_binary_size +- radeonsi: disable DCC stores on Navi12-14 for displayable DCC to fix corruption +- radeonsi: strengthen the VGT_FLUSH condition in begin_new_gfx_cs + +Mike Blumenkrantz (8): + +- nir/lower_vectorize_tess_levels: set num_components for vectorized loads +- zink: fix pipeline caching +- radv: use pool stride when copying single query results +- zink: free local shader nirs on program free +- zink: destroy shader modules on program free to avoid leaking +- tgsi_to_nir: force int type for LAYER output +- util/primconvert: force restart rewrites if original primtype wasn't supported +- zink: fix ZINK_MAX_DESCRIPTORS_PER_TYPE to stop exploding the stack + +Nanley Chery (1): + +- intel/blorp: Fix Gfx7 stencil surface state valign + +Neha Bhende (1): + +- svga/drm: use pb_usage_flags instead of pipe_map_flags in vmw_svga_winsys_buffer_map + +Quantum (1): + +- main: allow all external textures for BindImageTexture + +Rhys Perry (4): + +- aco: include utility in isel +- aco: don't constant propagate to DPP instructions +- aco/spill: add temporary operands of exec phis to next_use_distances_end +- nir: fix serialization of loop/if control + +Samuel Pitoiset (5): + +- radv: fix fast clearing depth images with mips on GFX10+ +- radv: fix copying depth+stencil images on compute +- radv: disable DCC image stores on Navi12-14 for displayable DCC corruption +- radv: fix determining the maximum number of waves that can use scratch +- radv/llvm: fix using Wave32 + +Simon Ser (4): + +- etnaviv: add stride, offset and modifier to resource_get_param +- panfrost: implement resource_get_param +- vc4: implement resource_get_param +- v3d: implement resource_get_param + +Timothy Arceri (6): + +- glsl: fix variable scope for instructions inside case statements +- nir: move nir_block_ends_in_break() to nir.h +- mesa: fix mesa_problem() call in _mesa_program_state_flags() +- glsl: fix variable scope for loop-expression +- glsl: handle scope correctly when inlining loop expression +- glsl: fix variable scope for do-while loops + +Timur Kristóf (7): + +- aco: Fix to_uniform_bool_instr when operands are not suitable. +- aco: Emit zero for the derivatives of uniforms. +- aco: Unset 16 and 24-bit flags from operands in apply_extract. +- nir: Fix local_invocation_index upper bound for non-compute-like stages. +- aco: Fix invalid usage of std::fill with std::array. +- aco: Use Builder reference in emit_copies_block. +- aco: Skip code paths to emit copies when there are no copies. + +Vinson Lee (1): + +- freedreno: Require C++17. 
+ +Yevhenii Kharchenko (1): + +- iris: fix layer calculation for TEXTURE_3D ReadPixels() on mip-level>0 + +liuyujun (1): + +- gallium: fix surface->destroy use-after-free + +mattvchandler (1): + +- gallium/osmesa: fix buffer resizing diff --git a/mesa 3D driver/docs/relnotes/21.2.3.rst b/mesa 3D driver/docs/relnotes/21.2.3.rst new file mode 100644 index 0000000000..8e5e25782b --- /dev/null +++ b/mesa 3D driver/docs/relnotes/21.2.3.rst @@ -0,0 +1,139 @@ +Mesa 21.2.3 Release Notes / 2021-09-29 +====================================== + +Mesa 21.2.3 is a bug fix release which fixes bugs found since the 21.2.2 release. + +Mesa 21.2.3 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is **only** available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. + +Mesa 21.2.3 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. + +SHA256 checksum +--------------- + +:: + + 7245284a159d2484770e1835a673e79e4322a9ddf43b17859668244946db7174 mesa-21.2.3.tar.xz + + +New features +------------ + +- None + + +Bug fixes +--------- + +- Significant performance drop on Radeon HD 8400 +- [nir][radv] Out of range shift when compiling Resident Evil Village shaders +- [nir][radv] Out of range shift when compiling Resident Evil Village shaders +- GL_EXT_disjoint_timer_query glGetInteger64v GL_TIMESTAMP failing with GL_INVALID_ENUM +- Possible miscompilation of an integer division with vulkan + + +Changes +------- + +Boris Brezillon (3): + +- panfrost: RGB10_A2_SNORM is not a valid texture format on v6+ +- panfrost: RGB332_UNORM is not a valid texture format on v6+ +- pan/blit: Fix a NULL dereference in the preload path + +Charmaine Lee (1): + +- svga: fix render target views leak + +Dylan Baker (15): + +- docs/relnotes/21.2.2: Add SHA256 sum +- .pick_status.json: Update to d3511e8af21ac11b8e7f5305942624d1ae29a73a +- .pick_status.json: Mark a79ac1bee14d6600cde2788bf136aa59b69a786f as backported +- .pick_status.json: Mark e0533ebf16edcb8b9f0687d3155417e6c1c53d35 as backported +- .pick_status.json: Mark f241bd3749cec55ca5fac9cb24f17553ab31c0e1 as backported +- .pick_status.json: Mark 268158a758551a46feb120af3f3cff5fb9292310 as backported +- .pick_status.json: Mark 6373dd814a74d84becbbbfc42673df147adb6e9b as denominated +- .pick_status.json: Mark eb7d2ef070a8819c2859c10559496836275848e2 as denominated +- .pick_status.json: Mark a810e58051b4a065b0aade86c45bf7ed254fc726 as denominated +- .pick_status.json: Update to 400da4900e2d72ee807cc3eedac9ace1dfd5dfba +- .pick_status.json: Update to dc354b8fda928861b7dfff3f8f53159e0053f9f5 +- .pick_status.json: Update to b653164973bbd3053d3b9ed37c4362af96346900 +- .pick_status.json: Update to ecc6d78b0541d66765d434dd4158066d6c664f8e +- .pick_status.json: Update to fbbe00c0b7f7aa5aca42a82358332eb2de56b9af +- lavapipe/ci: Add additional failing test + +Ella-0 (1): + +- v3d: add R10G10B10X2_UNORM to format table + +Emma Anholt (1): + +- mesa: Fix missing CopyTexImage formats for OES_required_internalformat. 
+ +Italo Nicola (1): + +- panfrost: fix null deref when no color buffer is attached + +Jordan Justen (2): + +- iris: Disable I915_FORMAT_MOD_Y_TILED_GEN12* on adl-p/display 13 +- intel/dev: Add display_ver and set adl-p to 13 + +Lionel Landwerlin (1): + +- nir: fix opt_memcpy src/dst mixup + +Marcin Ślusarz (1): + +- intel/compiler: INT DIV function does not support source modifiers + +Marek Olšák (2): + +- radeonsi: fix a depth texturing performance regression on gfx6-7 +- radeonsi: fix clearing index_size for NGG fast launch + +Marek Vasut (1): + +- freedreno: Handle timeout == PIPE_TIMEOUT_INFINITE and rollover + +Neha Bhende (1): + +- auxiliary/indices: convert primitive type PIPE_PRIM_PATCHES + +Qiang Yu (1): + +- radeonsi: fix ps SI_PARAM_LINE_STIPPLE_TEX arg + +Rhys Perry (3): + +- aco: don't coalesce constant copies into non-power-of-two sizes +- aco/tests: add idep_amdgfxregs_h +- radv: don't require a GS copy shader to use the cache with NGG VS+GS + +Rob Clark (2): + +- freedreno: Use correct key for binning pass shader +- freedreno/drm: Don't return shared/control bo's to cache + +Tapani Pälli (1): + +- mesa: fix timestamp enum with EXT_disjoint_timer_query + +Timur Kristóf (5): + +- aco/optimize_postRA: Use iterators instead of operator[] of std::array. +- ac/nir: Fix match_mask to work correctly for VS outputs. +- nir: Exclude non-generic patch variables from get_variable_io_mask. +- ac/nir/nggc: Refactor save_reusable_variables. +- ac/nir/nggc: Don't reuse uniform values from divergent control flow. + +Zachary Michaels (1): + +- X11: Ensure that VK_SUBOPTIMAL_KHR propagates to user code diff --git a/mesa 3D driver/docs/relnotes/21.2.4.rst b/mesa 3D driver/docs/relnotes/21.2.4.rst new file mode 100644 index 0000000000..bd80c40640 --- /dev/null +++ b/mesa 3D driver/docs/relnotes/21.2.4.rst @@ -0,0 +1,147 @@ +Mesa 21.2.4 Release Notes / 2021-10-14 +====================================== + +Mesa 21.2.4 is a bug fix release which fixes bugs found since the 21.2.3 release. + +Mesa 21.2.4 implements the OpenGL 4.6 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.6. OpenGL +4.6 is **only** available if requested at context creation. +Compatibility contexts may report a lower version depending on each driver. + +Mesa 21.2.4 implements the Vulkan 1.2 API, but the version reported by +the apiVersion property of the VkPhysicalDeviceProperties struct +depends on the particular driver being used. + +SHA256 checksum +--------------- + +:: + + fe6ede82d1ac02339da3c2ec1820a379641902fd351a52cc01153f76eff85b44 mesa-21.2.4.tar.xz + + +New features +------------ + +- None + + +Bug fixes +--------- + +- RADV: Rendering issues in Resident Evil 2 with NGGC +- crocus: Incorrect stride when used through prime +- anv: descriptorBindingUniformBufferUpdateAfterBind feature is not supported + + +Changes +------- + +Alyssa Rosenzweig (3): + +- panfrost: Move special_varying to compiler definitions +- panfrost: Fix off-by-one in varying count assert +- panfrost: Don't set CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER + +Bas Nieuwenhuizen (2): + +- amd/common: Add fallback for misreported clocks for RGP. +- radv: Handle copying zero queries. 
+ +Chia-I Wu (1): + +- radv: plug leaks in radv_device_init_accel_struct_build_state + +Connor Abbott (1): + +- ir3: Use source in ir3_output_conv_src_type() + +Dave Airlie (7): + +- crocus/query: don't loop on ready status after gpu hang. +- device_select: close dri3 fd after using it. +- crocus: Honor scanout requirement from DRI +- crocus/gen5: reemit shaders on gen5 after new program cache bo. +- crocus/gen5: add dirty flags for urb fences. +- crocus/gen6: don't reemit the svbi when debugging +- gallivm/format: clamp SINT conversion rather than truncate. + +Dylan Baker (7): + +- docs" Add SHA256 sum for mesa 21.2.3 +- .pick_status.json: Update to fb8f532ea1bbd9c959e0f59c652347e435a71f91 +- .pick_status.json: Update to d2543658ef6fe0ad59af217a09a931d3b6174a43 +- .pick_status.json: Update to 729991e09cd28550001ae63710ab929d95b115bc +- .pick_status.json: Update to 3a18963b0876af2aa0d60dd9917e69d409ce4d6e +- .pick_status.json: Update to ced950e42f4a95ef410e63c2d26a2119e0c3c40b +- .pick_status.json: Update to 783f8f728ce8e77885adbc7b2c12c39c3e3e5198 + +Emma Anholt (2): + +- gallium/dri: Make YUV formats we're going to emulate external-only. +- i915g: Check for the scanout-layout conditions before setting level info. + +Filip Gawin (1): + +- r300: implement forgotten tgsi's cases of textures + +Gert Wollny (1): + +- mesa: signal driver when buffer is bound to different texture format + +Icecream95 (1): + +- pan/mdg: Use the correct swizzle for condition moves + +Jason Ekstrand (1): + +- vulkan/shader_module: Fix the lifetime of temporary shader modules + +Kai Wasserbäch (2): + +- gallivm: add new wrapper around Module::setOverrideStackAlignment() +- gallivm: fix FTBFS on i386 with LLVM >= 13, StackAlignmentOverride is gone + +Lionel Landwerlin (3): + +- anv: enable UBO indexing +- anv: add missing transition handling bits +- spirv: deal with null pointers + +Marek Olšák (2): + +- radeonsi: add back a workaround for DCC MSAA on gfx9 due to conformance issues +- ac/surface: don't overwrite DCC settings for imported buffers + +Pavel Asyutchenko (1): + +- llvmpipe: fix crash when doing FB fetch + gl_FragDepth write in one shader + +Qiang Yu (1): + +- loader/dri3: fix swap out of order when changing swap interval + +Rob Clark (1): + +- freedreno/drm: Move pipe unref after fence removal + +Samuel Pitoiset (2): + +- radv: fix adjusting the frag coord when RADV_FORCE_VRS is enabled +- aco: fix load_barycentric_at_{offset,sample} + +Timur Kristóf (3): + +- aco: Fix small primitive precision. +- aco: Fix determining whether any culling is enabled. +- ac/nir/cull: Accept NaN and +/- Inf in face culling. + +Vasily Khoruzhick (1): + +- lima: split_load_input: don't split unaligned vec2 + +Vinson Lee (1): + +- pps: Avoid duplicate elements in with_datasources array. diff --git a/mesa 3D driver/docs/relnotes/new_features.txt b/mesa 3D driver/docs/relnotes/new_features.txt index ac5d90fc37..e69de29bb2 100644 --- a/mesa 3D driver/docs/relnotes/new_features.txt +++ b/mesa 3D driver/docs/relnotes/new_features.txt @@ -1,8 +0,0 @@ -VK_EXT_color_write_enable on lavapipe -GL_ARB_texture_filter_anisotropic in llvmpipe -Anisotropic texture filtering in lavapipe -VK_EXT_shader_atomic_float2 on Intel and RADV. 
-VK_KHR_timeline_semaphore on lavapipe -VK_EXT_external_memory_host on lavapipe -GL_AMD_pinned_memory on llvmpipe -GL 4.5 compatibility on llvmpipe diff --git a/mesa 3D driver/include/EGL/eglext.h b/mesa 3D driver/include/EGL/eglext.h index a7ca1f30fd..94dd038c9e 100644 --- a/mesa 3D driver/include/EGL/eglext.h +++ b/mesa 3D driver/include/EGL/eglext.h @@ -14,12 +14,12 @@ extern "C" { ** used to make the header, and the header can be found at ** http://www.khronos.org/registry/egl ** -** Khronos $Git commit SHA1: e8baa0bf39 $ on $Git commit date: 2021-04-26 17:56:26 -0600 $ +** Khronos $Git commit SHA1: dc0b58dca5 $ on $Git commit date: 2021-06-25 01:58:50 +0200 $ */ #include <EGL/eglplatform.h> -#define EGL_EGLEXT_VERSION 20210604 +#define EGL_EGLEXT_VERSION 20210629 /* Generated C header for: * API: egl @@ -651,6 +651,11 @@ EGLAPI EGLBoolean EGLAPIENTRY eglCompositorSwapPolicyEXT (EGLint external_win_id #endif #endif /* EGL_EXT_compositor */ +#ifndef EGL_EXT_config_select_group +#define EGL_EXT_config_select_group 1 +#define EGL_CONFIG_SELECT_GROUP_EXT 0x34C0 +#endif /* EGL_EXT_config_select_group */ + #ifndef EGL_EXT_create_context_robustness #define EGL_EXT_create_context_robustness 1 #define EGL_CONTEXT_OPENGL_ROBUST_ACCESS_EXT 0x30BF @@ -702,6 +707,10 @@ EGLAPI EGLBoolean EGLAPIENTRY eglQueryDisplayAttribEXT (EGLDisplay dpy, EGLint a #define EGL_DEVICE_UUID_EXT 0x335C #define EGL_DRIVER_UUID_EXT 0x335D #define EGL_DRIVER_NAME_EXT 0x335E +typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYDEVICEBINARYEXTPROC) (EGLDeviceEXT device, EGLint name, EGLint max_size, void *value, EGLint *size); +#ifdef EGL_EGLEXT_PROTOTYPES +EGLAPI EGLBoolean EGLAPIENTRY eglQueryDeviceBinaryEXT (EGLDeviceEXT device, EGLint name, EGLint max_size, void *value, EGLint *size); +#endif #endif /* EGL_EXT_device_persistent_id */ #ifndef EGL_EXT_device_query @@ -895,6 +904,11 @@ EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformPixmapSurfaceEXT (EGLDisplay dpy, #define EGL_PLATFORM_XCB_SCREEN_EXT 0x31DE #endif /* EGL_EXT_platform_xcb */ +#ifndef EGL_EXT_present_opaque +#define EGL_EXT_present_opaque 1 +#define EGL_PRESENT_OPAQUE_EXT 0x31DF +#endif /* EGL_EXT_present_opaque */ + #ifndef EGL_EXT_protected_content #define EGL_EXT_protected_content 1 #define EGL_PROTECTED_CONTENT_EXT 0x32C0 diff --git a/mesa 3D driver/include/GL/gl.h b/mesa 3D driver/include/GL/gl.h index 191560ef64..e2f5408a5d 100644 --- a/mesa 3D driver/include/GL/gl.h +++ b/mesa 3D driver/include/GL/gl.h @@ -38,9 +38,7 @@ #if defined(__WIN32__) && !defined(__CYGWIN__) # if (defined(_MSC_VER) || defined(__MINGW32__)) && defined(BUILD_GL32) /* tag specify we're building mesa as a DLL */ # define GLAPI __declspec(dllexport) -# elif (defined(_MSC_VER) || defined(__MINGW32__)) && defined(_DLL) /* tag specifying we're building for DLL runtime support */ -# define GLAPI __declspec(dllimport) -# else /* for use with static link lib build of Win32 edition only */ +# else # define GLAPI extern # endif # if defined(__MINGW32__) && defined(GL_NO_STDCALL) || defined(UNDER_CE) /* The generated DLLs by MingW with STDCALL are not compatible with the ones done by Microsoft's compilers */ diff --git a/mesa 3D driver/include/GL/internal/dri_interface.h b/mesa 3D driver/include/GL/internal/dri_interface.h index 6d98f2506b..5e06912ffb 100644 --- a/mesa 3D driver/include/GL/internal/dri_interface.h +++ b/mesa 3D driver/include/GL/internal/dri_interface.h @@ -48,8 +48,6 @@ typedef unsigned int drm_drawable_t; typedef struct drm_clip_rect drm_clip_rect_t; #endif -#include <GL/gl.h> - #include <stdint.h> /** @@ -153,32
+151,6 @@ struct __DRIswapControlExtensionRec { unsigned int (*getSwapInterval)(__DRIdrawable *drawable); }; -/** - * Used by drivers that implement the GLX_MESA_swap_frame_usage extension. - */ -#define __DRI_FRAME_TRACKING "DRI_FrameTracking" -#define __DRI_FRAME_TRACKING_VERSION 1 -struct __DRIframeTrackingExtensionRec { - __DRIextension base; - - /** - * Enable or disable frame usage tracking. - * - * \since Internal API version 20030317. - */ - int (*frameTracking)(__DRIdrawable *drawable, GLboolean enable); - - /** - * Retrieve frame usage information. - * - * \since Internal API version 20030317. - */ - int (*queryFrameTracking)(__DRIdrawable *drawable, - int64_t * sbc, int64_t * missedFrames, - float * lastMissedUsage, float * usage); -}; - - /** * Used by drivers that implement the GLX_SGI_video_sync extension. */ @@ -205,24 +177,6 @@ struct __DRImediaStreamCounterExtensionRec { int64_t *msc); }; - -#define __DRI_TEX_OFFSET "DRI_TexOffset" -#define __DRI_TEX_OFFSET_VERSION 1 -struct __DRItexOffsetExtensionRec { - __DRIextension base; - - /** - * Method to override base texture image with a driver specific 'offset'. - * The depth passed in allows e.g. to ignore the alpha channel of texture - * images where the non-alpha components don't occupy a whole texel. - * - * For GLX_EXT_texture_from_pixmap with AIGLX. - */ - void (*setTexOffset)(__DRIcontext *pDRICtx, GLint texname, - unsigned long long offset, GLint depth, GLuint pitch); -}; - - /* Valid values for format in the setTexBuffer2 function below. These * values match the GLX tokens for compatibility reasons, but we * define them here since the DRI interface can't depend on GLX. */ @@ -243,7 +197,7 @@ struct __DRItexBufferExtensionRec { * setTexBuffer2 in version 2 of this interface */ void (*setTexBuffer)(__DRIcontext *pDRICtx, - GLint target, + int target, __DRIdrawable *pDraw); /** @@ -255,8 +209,8 @@ struct __DRItexBufferExtensionRec { * \since 2 */ void (*setTexBuffer2)(__DRIcontext *pDRICtx, - GLint target, - GLint format, + int target, + int format, __DRIdrawable *pDraw); /** * Method to release texture buffer in case some special platform @@ -267,7 +221,7 @@ struct __DRItexBufferExtensionRec { * \since 3 */ void (*releaseTexBuffer)(__DRIcontext *pDRICtx, - GLint target, + int target, __DRIdrawable *pDraw); }; @@ -410,8 +364,8 @@ struct __DRI2fenceExtensionRec { * \param flags a combination of __DRI2_FENCE_FLAG_xxx flags * \param timeout the timeout in ns or __DRI2_FENCE_TIMEOUT_INFINITE */ - GLboolean (*client_wait_sync)(__DRIcontext *ctx, void *fence, - unsigned flags, uint64_t timeout); + unsigned char (*client_wait_sync)(__DRIcontext *ctx, void *fence, + unsigned flags, uint64_t timeout); /** * This function enqueues a wait command into the command stream of @@ -549,28 +503,6 @@ typedef struct __DRIdamageExtensionRec __DRIdamageExtension; typedef struct __DRIloaderExtensionRec __DRIloaderExtension; typedef struct __DRIswrastLoaderExtensionRec __DRIswrastLoaderExtension; - -/** - * Callback to getDrawableInfo protocol - */ -#define __DRI_GET_DRAWABLE_INFO "DRI_GetDrawableInfo" -#define __DRI_GET_DRAWABLE_INFO_VERSION 1 -struct __DRIgetDrawableInfoExtensionRec { - __DRIextension base; - - /** - * This function is used to get information about the position, size, and - * clip rects of a drawable. 
- */ - GLboolean (* getDrawableInfo) ( __DRIdrawable *drawable, - unsigned int * index, unsigned int * stamp, - int * x, int * y, int * width, int * height, - int * numClipRects, drm_clip_rect_t ** pClipRects, - int * backX, int * backY, - int * numBackClipRects, drm_clip_rect_t ** pBackClipRects, - void *loaderPrivate); -}; - /** * Callback to get system time for media stream counter extensions. */ @@ -591,7 +523,7 @@ struct __DRIsystemTimeExtensionRec { * the rate of the "media stream counter". In practical terms, this is * the frame refresh rate of the display. */ - GLboolean (*getMSCRate)(__DRIdrawable *draw, + unsigned char (*getMSCRate)(__DRIdrawable *draw, int32_t * numerator, int32_t * denominator, void *loaderPrivate); }; @@ -622,7 +554,7 @@ struct __DRIdamageExtensionRec { void (*reportDamage)(__DRIdrawable *draw, int x, int y, drm_clip_rect_t *rects, int num_rects, - GLboolean front_buffer, + unsigned char front_buffer, void *loaderPrivate); }; @@ -721,9 +653,9 @@ struct __DRIswrastLoaderExtensionRec { * * \since 6 */ - GLboolean (*getImageShm2)(__DRIdrawable *readable, - int x, int y, int width, int height, - int shmid, void *loaderPrivate); + unsigned char (*getImageShm2)(__DRIdrawable *readable, + int x, int y, int width, int height, + int shmid, void *loaderPrivate); }; /** @@ -1305,7 +1237,7 @@ struct __DRIdri2ExtensionRec { * extensions. */ #define __DRI_IMAGE "DRI_IMAGE" -#define __DRI_IMAGE_VERSION 19 +#define __DRI_IMAGE_VERSION 20 /** * These formats correspond to the similarly named MESA_FORMAT_* @@ -1476,9 +1408,10 @@ enum __DRIChromaSiting { #define __BLIT_FLAG_FINISH 0x0002 /** - * Flags for createImageFromDmaBufs3 + * Flags for createImageFromDmaBufs3 and createImageFromFds2 */ #define __DRI_IMAGE_PROTECTED_CONTENT_FLAG 0x00000001 +#define __DRI_IMAGE_PRIME_LINEAR_BUFFER 0x00000002 /** * queryDmaBufFormatModifierAttribs attributes @@ -1509,7 +1442,7 @@ struct __DRIimageExtensionRec { unsigned int use, void *loaderPrivate); - GLboolean (*queryImage)(__DRIimage *image, int attrib, int *value); + unsigned char (*queryImage)(__DRIimage *image, int attrib, int *value); /** * The new __DRIimage will share the content with the old one, see dup(2). @@ -1521,7 +1454,7 @@ struct __DRIimageExtensionRec { * * \since 2 */ - GLboolean (*validateUsage)(__DRIimage *image, unsigned int use); + unsigned char (*validateUsage)(__DRIimage *image, unsigned int use); /** * Unlike createImageFromName __DRI_IMAGE_FORMAT is not used but instead @@ -1702,8 +1635,8 @@ struct __DRIimageExtensionRec { * * \since 15 */ - GLboolean (*queryDmaBufFormats)(__DRIscreen *screen, int max, - int *formats, int *count); + unsigned char (*queryDmaBufFormats)(__DRIscreen *screen, int max, + int *formats, int *count); /* * dmabuf format modifier query for a given format to support @@ -1724,10 +1657,10 @@ struct __DRIimageExtensionRec { * * \since 15 */ - GLboolean (*queryDmaBufModifiers)(__DRIscreen *screen, int fourcc, - int max, uint64_t *modifiers, - unsigned int *external_only, - int *count); + unsigned char (*queryDmaBufModifiers)(__DRIscreen *screen, int fourcc, + int max, uint64_t *modifiers, + unsigned int *external_only, + int *count); /** * dmabuf format modifier attribute query for a given format and modifier. 
@@ -1743,9 +1676,11 @@ struct __DRIimageExtensionRec { * * \since 16 */ - GLboolean (*queryDmaBufFormatModifierAttribs)(__DRIscreen *screen, - uint32_t fourcc, uint64_t modifier, - int attrib, uint64_t *value); + unsigned char (*queryDmaBufFormatModifierAttribs)(__DRIscreen *screen, + uint32_t fourcc, + uint64_t modifier, + int attrib, + uint64_t *value); /** * Create a DRI image from the given renderbuffer. @@ -1804,6 +1739,20 @@ struct __DRIimageExtensionRec { const unsigned int modifier_count, unsigned int use, void *loaderPrivate); + + /** + * Like createImageFromFds, but with an added flag parameter. + * + * See __DRI_IMAGE_*_FLAG for valid definitions of flags. + * + * \since 20 + */ + __DRIimage *(*createImageFromFds2)(__DRIscreen *screen, + int width, int height, int fourcc, + int *fds, int num_fds, + uint32_t flags, + int *strides, int *offsets, + void *loaderPrivate); }; @@ -1817,14 +1766,36 @@ struct __DRIimageExtensionRec { * with new lookup functions. */ #define __DRI_IMAGE_LOOKUP "DRI_IMAGE_LOOKUP" -#define __DRI_IMAGE_LOOKUP_VERSION 1 +#define __DRI_IMAGE_LOOKUP_VERSION 2 typedef struct __DRIimageLookupExtensionRec __DRIimageLookupExtension; struct __DRIimageLookupExtensionRec { __DRIextension base; + /** + * Lookup EGLImage without validated. Equivalent to call + * validateEGLImage() then lookupEGLImageValidated(). + * + * \since 1 + */ __DRIimage *(*lookupEGLImage)(__DRIscreen *screen, void *image, void *loaderPrivate); + + /** + * Check if EGLImage is associated with the EGL display before lookup with + * lookupEGLImageValidated(). It will hold EGLDisplay.Mutex, so is separated + * out from lookupEGLImage() to avoid deadlock. + * + * \since 2 + */ + unsigned char (*validateEGLImage)(void *image, void *loaderPrivate); + + /** + * Lookup EGLImage after validateEGLImage(). No lock in this function. + * + * \since 2 + */ + __DRIimage *(*lookupEGLImageValidated)(void *image, void *loaderPrivate); }; /** @@ -1969,6 +1940,7 @@ typedef struct __DRIDriverVtableExtensionRec { #define __DRI2_RENDERER_HAS_CONTEXT_PRIORITY_HIGH (1 << 2) #define __DRI2_RENDERER_HAS_PROTECTED_CONTENT 0x000e +#define __DRI2_RENDERER_PREFER_BACK_BUFFER_REUSE 0x000f typedef struct __DRI2rendererQueryExtensionRec __DRI2rendererQueryExtension; struct __DRI2rendererQueryExtensionRec { @@ -2178,7 +2150,7 @@ struct __DRIbackgroundCallableExtensionRec { * the context was created. This can be used by the loader to identify * which context any callbacks are associated with. */ - GLboolean (*isThreadSafe)(void *loaderPrivate); + unsigned char (*isThreadSafe)(void *loaderPrivate); }; /** diff --git a/mesa 3D driver/include/android_stub/android/hardware_buffer.h b/mesa 3D driver/include/android_stub/android/hardware_buffer.h index ae5e47ba97..dcb05b5536 100644 --- a/mesa 3D driver/include/android_stub/android/hardware_buffer.h +++ b/mesa 3D driver/include/android_stub/android/hardware_buffer.h @@ -332,8 +332,6 @@ typedef struct AHardwareBuffer_Planes { */ typedef struct AHardwareBuffer AHardwareBuffer; -#if __ANDROID_API__ >= 26 - /** * Allocates a buffer that matches the passed AHardwareBuffer_Desc. * @@ -501,10 +499,6 @@ int AHardwareBuffer_sendHandleToUnixSocket(const AHardwareBuffer* buffer, int so */ int AHardwareBuffer_recvHandleFromUnixSocket(int socketFd, AHardwareBuffer** outBuffer) __INTRODUCED_IN(26); -#endif // __ANDROID_API__ >= 26 - -#if __ANDROID_API__ >= 29 - /** * Test whether the given format and usage flag combination is * allocatable. 
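For context, the AHardwareBuffer entry points whose __ANDROID_API__ guards the hunks above and below remove are used roughly like this (an editor's sketch of the NDK API, not part of the stub headers)::

   #include <android/hardware_buffer.h>

   static AHardwareBuffer *
   alloc_rgba8(uint32_t width, uint32_t height)
   {
      AHardwareBuffer_Desc desc = {
         .width = width,
         .height = height,
         .layers = 1,
         .format = AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM,
         .usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE |
                  AHARDWAREBUFFER_USAGE_CPU_WRITE_RARELY,
      };
      AHardwareBuffer *buf = NULL;

      /* Check the format/usage combination first, then allocate. */
      if (!AHardwareBuffer_isSupported(&desc) ||
          AHardwareBuffer_allocate(&desc, &buf) != 0)
         return NULL;
      return buf;
   }
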
@@ -540,7 +534,6 @@ int AHardwareBuffer_isSupported(const AHardwareBuffer_Desc* desc) __INTRODUCED_I int AHardwareBuffer_lockAndGetInfo(AHardwareBuffer* buffer, uint64_t usage, int32_t fence, const ARect* rect, void** outVirtualAddress, int32_t* outBytesPerPixel, int32_t* outBytesPerStride) __INTRODUCED_IN(29); -#endif // __ANDROID_API__ >= 29 __END_DECLS diff --git a/mesa 3D driver/include/android_stub/android/log.h b/mesa 3D driver/include/android_stub/android/log.h index 8a0ebf22fe..5dc365a4dd 100644 --- a/mesa 3D driver/include/android_stub/android/log.h +++ b/mesa 3D driver/include/android_stub/android/log.h @@ -217,7 +217,6 @@ typedef void (*__android_logger_function)(const struct __android_log_message* lo */ typedef void (*__android_aborter_function)(const char* abort_message); -#if !defined(__ANDROID__) || __ANDROID_API__ >= 30 /** * Writes the log message specified by log_message. log_message includes additional file name and * line number information that a logger may use. log_message is versioned for backwards @@ -371,7 +370,6 @@ int32_t __android_log_get_minimum_priority(void) __INTRODUCED_IN(30); * Available since API level 30. */ void __android_log_set_default_tag(const char* tag) __INTRODUCED_IN(30); -#endif #ifdef __cplusplus } diff --git a/mesa 3D driver/include/android_stub/android/native_window.h b/mesa 3D driver/include/android_stub/android/native_window.h index 36aad2eced..a3a45e3705 100644 --- a/mesa 3D driver/include/android_stub/android/native_window.h +++ b/mesa 3D driver/include/android_stub/android/native_window.h @@ -185,8 +185,6 @@ int32_t ANativeWindow_lock(ANativeWindow* window, ANativeWindow_Buffer* outBuffe */ int32_t ANativeWindow_unlockAndPost(ANativeWindow* window); -#if __ANDROID_API__ >= 26 - /** * Set a transform that will be applied to future buffers posted to the window. * @@ -197,10 +195,6 @@ int32_t ANativeWindow_unlockAndPost(ANativeWindow* window); */ int32_t ANativeWindow_setBuffersTransform(ANativeWindow* window, int32_t transform) __INTRODUCED_IN(26); -#endif // __ANDROID_API__ >= 26 - -#if __ANDROID_API__ >= 28 - /** * All buffers queued after this call will be associated with the dataSpace * parameter specified. @@ -229,10 +223,6 @@ int32_t ANativeWindow_setBuffersDataSpace(ANativeWindow* window, int32_t dataSpa */ int32_t ANativeWindow_getBuffersDataSpace(ANativeWindow* window) __INTRODUCED_IN(28); -#endif // __ANDROID_API__ >= 28 - -#if __ANDROID_API__ >= 30 - /** Compatibility value for ANativeWindow_setFrameRate. */ enum ANativeWindow_FrameRateCompatibility { /** @@ -301,8 +291,6 @@ int32_t ANativeWindow_setFrameRate(ANativeWindow* window, float frameRate, int8_ */ void ANativeWindow_tryAllocateBuffers(ANativeWindow* window); -#endif // __ANDROID_API__ >= 30 - #ifdef __cplusplus }; #endif diff --git a/mesa 3D driver/include/android_stub/cutils/compiler.h b/mesa 3D driver/include/android_stub/cutils/compiler.h new file mode 100644 index 0000000000..70f884a1e7 --- /dev/null +++ b/mesa 3D driver/include/android_stub/cutils/compiler.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_CUTILS_COMPILER_H +#define ANDROID_CUTILS_COMPILER_H + +/* + * helps the compiler's optimizer predicting branches + */ + +#ifdef __cplusplus +# define CC_LIKELY( exp ) (__builtin_expect( !!(exp), true )) +# define CC_UNLIKELY( exp ) (__builtin_expect( !!(exp), false )) +#else +# define CC_LIKELY( exp ) (__builtin_expect( !!(exp), 1 )) +# define CC_UNLIKELY( exp ) (__builtin_expect( !!(exp), 0 )) +#endif + +/** + * exports marked symbols + * + * if used on a C++ class declaration, this macro must be inserted + * after the "class" keyword. For instance: + * + * template <typename TYPE> + * class ANDROID_API Singleton { } + */ + +#define ANDROID_API __attribute__((visibility("default"))) + +#endif // ANDROID_CUTILS_COMPILER_H diff --git a/mesa 3D driver/include/android_stub/cutils/trace.h b/mesa 3D driver/include/android_stub/cutils/trace.h new file mode 100644 index 0000000000..24c6ae6290 --- /dev/null +++ b/mesa 3D driver/include/android_stub/cutils/trace.h @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _LIBS_CUTILS_TRACE_H +#define _LIBS_CUTILS_TRACE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__BEGIN_DECLS + +/** + * The ATRACE_TAG macro can be defined before including this header to trace + * using one of the tags defined below. It must be defined to one of the + * following ATRACE_TAG_* macros. The trace tag is used to filter tracing in + * userland to avoid some of the runtime cost of tracing when it is not desired. + * + * Defining ATRACE_TAG to be ATRACE_TAG_ALWAYS will result in the tracing always + * being enabled - this should ONLY be done for debug code, as userland tracing + * has a performance cost even when the trace is not being recorded. Defining + * ATRACE_TAG to be ATRACE_TAG_NEVER or leaving ATRACE_TAG undefined will result + * in the tracing always being disabled. + * + * ATRACE_TAG_HAL should be bitwise ORed with the relevant tags for tracing + * within a hardware module. For example a camera hardware module would set: + * #define ATRACE_TAG (ATRACE_TAG_CAMERA | ATRACE_TAG_HAL) + * + * Keep these in sync with frameworks/base/core/java/android/os/Trace.java. + */ +#define ATRACE_TAG_NEVER 0 // This tag is never enabled. +#define ATRACE_TAG_ALWAYS (1<<0) // This tag is always enabled.
+#define ATRACE_TAG_GRAPHICS (1<<1) +#define ATRACE_TAG_INPUT (1<<2) +#define ATRACE_TAG_VIEW (1<<3) +#define ATRACE_TAG_WEBVIEW (1<<4) +#define ATRACE_TAG_WINDOW_MANAGER (1<<5) +#define ATRACE_TAG_ACTIVITY_MANAGER (1<<6) +#define ATRACE_TAG_SYNC_MANAGER (1<<7) +#define ATRACE_TAG_AUDIO (1<<8) +#define ATRACE_TAG_VIDEO (1<<9) +#define ATRACE_TAG_CAMERA (1<<10) +#define ATRACE_TAG_HAL (1<<11) +#define ATRACE_TAG_APP (1<<12) +#define ATRACE_TAG_RESOURCES (1<<13) +#define ATRACE_TAG_DALVIK (1<<14) +#define ATRACE_TAG_RS (1<<15) +#define ATRACE_TAG_BIONIC (1<<16) +#define ATRACE_TAG_POWER (1<<17) +#define ATRACE_TAG_PACKAGE_MANAGER (1<<18) +#define ATRACE_TAG_SYSTEM_SERVER (1<<19) +#define ATRACE_TAG_DATABASE (1<<20) +#define ATRACE_TAG_NETWORK (1<<21) +#define ATRACE_TAG_ADB (1<<22) +#define ATRACE_TAG_VIBRATOR (1<<23) +#define ATRACE_TAG_AIDL (1<<24) +#define ATRACE_TAG_NNAPI (1<<25) +#define ATRACE_TAG_RRO (1<<26) +#define ATRACE_TAG_LAST ATRACE_TAG_RRO + +// Reserved for initialization. +#define ATRACE_TAG_NOT_READY (1ULL<<63) + +#define ATRACE_TAG_VALID_MASK ((ATRACE_TAG_LAST - 1) | ATRACE_TAG_LAST) + +#ifndef ATRACE_TAG +#define ATRACE_TAG ATRACE_TAG_NEVER +#elif ATRACE_TAG > ATRACE_TAG_VALID_MASK +#error ATRACE_TAG must be defined to be one of the tags defined in cutils/trace.h +#endif + +/** + * Opens the trace file for writing and reads the property for initial tags. + * The atrace.tags.enableflags property sets the tags to trace. + * This function should not be explicitly called, the first call to any normal + * trace function will cause it to be run safely. + */ +void atrace_setup(); + +/** + * If tracing is ready, set atrace_enabled_tags to the system property + * debug.atrace.tags.enableflags. Can be used as a sysprop change callback. + */ +void atrace_update_tags(); + +/** + * Set whether tracing is enabled for the current process. This is used to + * prevent tracing within the Zygote process. + */ +void atrace_set_tracing_enabled(bool enabled); + +/** + * This is always set to false. This forces code that uses an old version + * of this header to always call into atrace_setup, in which we call + * atrace_init unconditionally. + */ +extern atomic_bool atrace_is_ready; + +/** + * Set of ATRACE_TAG flags to trace for, initialized to ATRACE_TAG_NOT_READY. + * A value of zero indicates setup has failed. + * Any other nonzero value indicates setup has succeeded, and tracing is on. + */ +extern uint64_t atrace_enabled_tags; + +/** + * Handle to the kernel's trace buffer, initialized to -1. + * Any other value indicates setup has succeeded, and is a valid fd for tracing. + */ +extern int atrace_marker_fd; + +/** + * atrace_init readies the process for tracing by opening the trace_marker file. + * Calling any trace function causes this to be run, so calling it is optional. + * This can be explicitly run to avoid setup delay on first trace function. + */ +#define ATRACE_INIT() atrace_init() +#define ATRACE_GET_ENABLED_TAGS() atrace_get_enabled_tags() + +void atrace_init(); +uint64_t atrace_get_enabled_tags(); + +/** + * Test if a given tag is currently enabled. + * Returns nonzero if the tag is enabled, otherwise zero. + * It can be used as a guard condition around more expensive trace calculations. + */ +#define ATRACE_ENABLED() atrace_is_tag_enabled(ATRACE_TAG) +static inline uint64_t atrace_is_tag_enabled(uint64_t tag) +{ + return atrace_get_enabled_tags() & tag; +} + +/** + * Trace the beginning of a context. name is used to identify the context. 
+ * This is often used to time function execution. + */ +#define ATRACE_BEGIN(name) atrace_begin(ATRACE_TAG, name) +static inline void atrace_begin(uint64_t tag, const char* name) +{ + if (CC_UNLIKELY(atrace_is_tag_enabled(tag))) { + void atrace_begin_body(const char*); + atrace_begin_body(name); + } +} + +/** + * Trace the end of a context. + * This should match up (and occur after) a corresponding ATRACE_BEGIN. + */ +#define ATRACE_END() atrace_end(ATRACE_TAG) +static inline void atrace_end(uint64_t tag) +{ + if (CC_UNLIKELY(atrace_is_tag_enabled(tag))) { + void atrace_end_body(); + atrace_end_body(); + } +} + +/** + * Trace the beginning of an asynchronous event. Unlike ATRACE_BEGIN/ATRACE_END + * contexts, asynchronous events do not need to be nested. The name describes + * the event, and the cookie provides a unique identifier for distinguishing + * simultaneous events. The name and cookie used to begin an event must be + * used to end it. + */ +#define ATRACE_ASYNC_BEGIN(name, cookie) \ + atrace_async_begin(ATRACE_TAG, name, cookie) +static inline void atrace_async_begin(uint64_t tag, const char* name, + int32_t cookie) +{ + if (CC_UNLIKELY(atrace_is_tag_enabled(tag))) { + void atrace_async_begin_body(const char*, int32_t); + atrace_async_begin_body(name, cookie); + } +} + +/** + * Trace the end of an asynchronous event. + * This should have a corresponding ATRACE_ASYNC_BEGIN. + */ +#define ATRACE_ASYNC_END(name, cookie) atrace_async_end(ATRACE_TAG, name, cookie) +static inline void atrace_async_end(uint64_t tag, const char* name, int32_t cookie) +{ + if (CC_UNLIKELY(atrace_is_tag_enabled(tag))) { + void atrace_async_end_body(const char*, int32_t); + atrace_async_end_body(name, cookie); + } +} + +/** + * Traces an integer counter value. name is used to identify the counter. + * This can be used to track how a value changes over time. + */ +#define ATRACE_INT(name, value) atrace_int(ATRACE_TAG, name, value) +static inline void atrace_int(uint64_t tag, const char* name, int32_t value) +{ + if (CC_UNLIKELY(atrace_is_tag_enabled(tag))) { + void atrace_int_body(const char*, int32_t); + atrace_int_body(name, value); + } +} + +/** + * Traces a 64-bit integer counter value. name is used to identify the + * counter. This can be used to track how a value changes over time. + */ +#define ATRACE_INT64(name, value) atrace_int64(ATRACE_TAG, name, value) +static inline void atrace_int64(uint64_t tag, const char* name, int64_t value) +{ + if (CC_UNLIKELY(atrace_is_tag_enabled(tag))) { + void atrace_int64_body(const char*, int64_t); + atrace_int64_body(name, value); + } +} + +__END_DECLS + +#endif // _LIBS_CUTILS_TRACE_H diff --git a/mesa 3D driver/include/android_stub/log/log_main.h b/mesa 3D driver/include/android_stub/log/log_main.h index 1bd1c8aec1..799a8e2de6 100644 --- a/mesa 3D driver/include/android_stub/log/log_main.h +++ b/mesa 3D driver/include/android_stub/log/log_main.h @@ -364,13 +364,11 @@ int __android_log_is_loggable(int prio, const char* tag, int default_prio); int __android_log_is_loggable_len(int prio, const char* tag, size_t len, int default_prio); #if LOG_NDEBUG /* Production */ -#define android_testLog(prio, tag) \ - (__android_log_is_loggable_len(prio, tag, ((tag) && *(tag)) ? strlen(tag) : 0, \ - ANDROID_LOG_DEBUG) != 0) +#define android_testLog(prio, tag) \ + (__android_log_is_loggable_len(prio, tag, (tag) ? strlen(tag) : 0, ANDROID_LOG_DEBUG) != 0) #else -#define android_testLog(prio, tag) \ - (__android_log_is_loggable_len(prio, tag, ((tag) && *(tag)) ? 
strlen(tag) : 0, \ - ANDROID_LOG_VERBOSE) != 0) +#define android_testLog(prio, tag) \ + (__android_log_is_loggable_len(prio, tag, (tag) ? strlen(tag) : 0, ANDROID_LOG_VERBOSE) != 0) #endif #if defined(__clang__) diff --git a/mesa 3D driver/include/android_stub/ndk/sync.h b/mesa 3D driver/include/android_stub/ndk/sync.h index 2a59e35bbc..38ccb686c1 100644 --- a/mesa 3D driver/include/android_stub/ndk/sync.h +++ b/mesa 3D driver/include/android_stub/ndk/sync.h @@ -33,8 +33,6 @@ __BEGIN_DECLS -#if __ANDROID_API__ >= 26 - /* Fences indicate the status of an asynchronous task. They are initially * in unsignaled state (0), and make a one-time transition to either signaled * (1) or error (< 0) state. A sync file is a collection of one or more fences; @@ -101,8 +99,6 @@ static inline struct sync_fence_info* sync_get_fence_info(const struct sync_file */ void sync_file_info_free(struct sync_file_info* info) __INTRODUCED_IN(26); -#endif /* __ANDROID_API__ >= 26 */ - __END_DECLS #endif /* ANDROID_SYNC_H */ diff --git a/mesa 3D driver/include/android_stub/vndk/hardware_buffer.h b/mesa 3D driver/include/android_stub/vndk/hardware_buffer.h index 3392d7f094..12f8691684 100644 --- a/mesa 3D driver/include/android_stub/vndk/hardware_buffer.h +++ b/mesa 3D driver/include/android_stub/vndk/hardware_buffer.h @@ -81,6 +81,20 @@ enum { AHARDWAREBUFFER_FORMAT_YCbCr_422_I = 0x14, }; +/** + * Buffer usage flags. + */ +enum { + /* for future proofing, keep these in sync with hardware/gralloc.h */ + + /* The buffer will be written by the HW camera pipeline. */ + AHARDWAREBUFFER_USAGE_CAMERA_WRITE = 2UL << 16, + /* The buffer will be read by the HW camera pipeline. */ + AHARDWAREBUFFER_USAGE_CAMERA_READ = 4UL << 16, + /* Mask for the camera access values. */ + AHARDWAREBUFFER_USAGE_CAMERA_MASK = 6UL << 16, +}; + __END_DECLS #endif /* ANDROID_VNDK_NATIVEWINDOW_AHARDWAREBUFFER_H */ diff --git a/mesa 3D driver/include/drm-uapi/virtgpu_drm.h b/mesa 3D driver/include/drm-uapi/virtgpu_drm.h index b9ec26e9c6..a13e20cc66 100644 --- a/mesa 3D driver/include/drm-uapi/virtgpu_drm.h +++ b/mesa 3D driver/include/drm-uapi/virtgpu_drm.h @@ -47,12 +47,15 @@ extern "C" { #define DRM_VIRTGPU_WAIT 0x08 #define DRM_VIRTGPU_GET_CAPS 0x09 #define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a +#define DRM_VIRTGPU_CONTEXT_INIT 0x0b #define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 #define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 +#define VIRTGPU_EXECBUF_RING_IDX 0x04 #define VIRTGPU_EXECBUF_FLAGS (\ VIRTGPU_EXECBUF_FENCE_FD_IN |\ VIRTGPU_EXECBUF_FENCE_FD_OUT |\ + VIRTGPU_EXECBUF_RING_IDX |\ 0) struct drm_virtgpu_map { @@ -68,6 +71,8 @@ struct drm_virtgpu_execbuffer { __u64 bo_handles; __u32 num_bo_handles; __s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */ + __u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */ + __u32 pad; }; #define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ @@ -75,6 +80,8 @@ struct drm_virtgpu_execbuffer { #define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */ #define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */ #define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing */ +#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */ +#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */ struct drm_virtgpu_getparam { __u64 param; @@ -173,6 +180,22 @@ struct drm_virtgpu_resource_create_blob { __u64 blob_id; }; +#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID 0x0001 +#define 
VIRTGPU_CONTEXT_PARAM_NUM_RINGS 0x0002 +#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003 +struct drm_virtgpu_context_set_param { + __u64 param; + __u64 value; +}; + +struct drm_virtgpu_context_init { + __u32 num_params; + __u32 pad; + + /* pointer to drm_virtgpu_context_set_param array */ + __u64 ctx_set_params; +}; + #define DRM_IOCTL_VIRTGPU_MAP \ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map) @@ -212,6 +235,10 @@ struct drm_virtgpu_resource_create_blob { DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB, \ struct drm_virtgpu_resource_create_blob) +#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT, \ + struct drm_virtgpu_context_init) + #if defined(__cplusplus) } #endif diff --git a/mesa 3D driver/include/vulkan/vulkan_android.h b/mesa 3D driver/include/vulkan/vulkan_android.h index 2160e3e7c6..a8a830673b 100644 --- a/mesa 3D driver/include/vulkan/vulkan_android.h +++ b/mesa 3D driver/include/vulkan/vulkan_android.h @@ -44,7 +44,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateAndroidSurfaceKHR( #define VK_ANDROID_external_memory_android_hardware_buffer 1 struct AHardwareBuffer; -#define VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_SPEC_VERSION 3 +#define VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_SPEC_VERSION 4 #define VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME "VK_ANDROID_external_memory_android_hardware_buffer" typedef struct VkAndroidHardwareBufferUsageANDROID { VkStructureType sType; @@ -90,6 +90,19 @@ typedef struct VkExternalFormatANDROID { uint64_t externalFormat; } VkExternalFormatANDROID; +typedef struct VkAndroidHardwareBufferFormatProperties2ANDROID { + VkStructureType sType; + void* pNext; + VkFormat format; + uint64_t externalFormat; + VkFormatFeatureFlags2KHR formatFeatures; + VkComponentMapping samplerYcbcrConversionComponents; + VkSamplerYcbcrModelConversion suggestedYcbcrModel; + VkSamplerYcbcrRange suggestedYcbcrRange; + VkChromaLocation suggestedXChromaOffset; + VkChromaLocation suggestedYChromaOffset; +} VkAndroidHardwareBufferFormatProperties2ANDROID; + typedef VkResult (VKAPI_PTR *PFN_vkGetAndroidHardwareBufferPropertiesANDROID)(VkDevice device, const struct AHardwareBuffer* buffer, VkAndroidHardwareBufferPropertiesANDROID* pProperties); typedef VkResult (VKAPI_PTR *PFN_vkGetMemoryAndroidHardwareBufferANDROID)(VkDevice device, const VkMemoryGetAndroidHardwareBufferInfoANDROID* pInfo, struct AHardwareBuffer** pBuffer); diff --git a/mesa 3D driver/include/vulkan/vulkan_beta.h b/mesa 3D driver/include/vulkan/vulkan_beta.h index c615bb35f9..2fedade6b9 100644 --- a/mesa 3D driver/include/vulkan/vulkan_beta.h +++ b/mesa 3D driver/include/vulkan/vulkan_beta.h @@ -22,7 +22,7 @@ extern "C" { #define VK_KHR_video_queue 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkVideoSessionKHR) VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkVideoSessionParametersKHR) -#define VK_KHR_VIDEO_QUEUE_SPEC_VERSION 1 +#define VK_KHR_VIDEO_QUEUE_SPEC_VERSION 2 #define VK_KHR_VIDEO_QUEUE_EXTENSION_NAME "VK_KHR_video_queue" typedef enum VkQueryResultStatusKHR { @@ -66,12 +66,12 @@ typedef enum VkVideoComponentBitDepthFlagBitsKHR { } VkVideoComponentBitDepthFlagBitsKHR; typedef VkFlags VkVideoComponentBitDepthFlagsKHR; -typedef enum VkVideoCapabilitiesFlagBitsKHR { - VK_VIDEO_CAPABILITIES_PROTECTED_CONTENT_BIT_KHR = 0x00000001, - VK_VIDEO_CAPABILITIES_SEPARATE_REFERENCE_IMAGES_BIT_KHR = 0x00000002, - VK_VIDEO_CAPABILITIES_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF -} VkVideoCapabilitiesFlagBitsKHR; -typedef 
VkFlags VkVideoCapabilitiesFlagsKHR; +typedef enum VkVideoCapabilityFlagBitsKHR { + VK_VIDEO_CAPABILITY_PROTECTED_CONTENT_BIT_KHR = 0x00000001, + VK_VIDEO_CAPABILITY_SEPARATE_REFERENCE_IMAGES_BIT_KHR = 0x00000002, + VK_VIDEO_CAPABILITY_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkVideoCapabilityFlagBitsKHR; +typedef VkFlags VkVideoCapabilityFlagsKHR; typedef enum VkVideoSessionCreateFlagBitsKHR { VK_VIDEO_SESSION_CREATE_DEFAULT_KHR = 0, @@ -90,7 +90,6 @@ typedef enum VkVideoCodingControlFlagBitsKHR { typedef VkFlags VkVideoCodingControlFlagsKHR; typedef enum VkVideoCodingQualityPresetFlagBitsKHR { - VK_VIDEO_CODING_QUALITY_PRESET_DEFAULT_BIT_KHR = 0, VK_VIDEO_CODING_QUALITY_PRESET_NORMAL_BIT_KHR = 0x00000001, VK_VIDEO_CODING_QUALITY_PRESET_POWER_BIT_KHR = 0x00000002, VK_VIDEO_CODING_QUALITY_PRESET_QUALITY_BIT_KHR = 0x00000004, @@ -120,16 +119,16 @@ typedef struct VkVideoProfilesKHR { } VkVideoProfilesKHR; typedef struct VkVideoCapabilitiesKHR { - VkStructureType sType; - void* pNext; - VkVideoCapabilitiesFlagsKHR capabilityFlags; - VkDeviceSize minBitstreamBufferOffsetAlignment; - VkDeviceSize minBitstreamBufferSizeAlignment; - VkExtent2D videoPictureExtentGranularity; - VkExtent2D minExtent; - VkExtent2D maxExtent; - uint32_t maxReferencePicturesSlotsCount; - uint32_t maxReferencePicturesActiveCount; + VkStructureType sType; + void* pNext; + VkVideoCapabilityFlagsKHR capabilityFlags; + VkDeviceSize minBitstreamBufferOffsetAlignment; + VkDeviceSize minBitstreamBufferSizeAlignment; + VkExtent2D videoPictureExtentGranularity; + VkExtent2D minExtent; + VkExtent2D maxExtent; + uint32_t maxReferencePicturesSlotsCount; + uint32_t maxReferencePicturesActiveCount; } VkVideoCapabilitiesKHR; typedef struct VkPhysicalDeviceVideoFormatInfoKHR { @@ -305,7 +304,7 @@ VKAPI_ATTR void VKAPI_CALL vkCmdControlVideoCodingKHR( #define VK_KHR_video_decode_queue 1 -#define VK_KHR_VIDEO_DECODE_QUEUE_SPEC_VERSION 1 +#define VK_KHR_VIDEO_DECODE_QUEUE_SPEC_VERSION 2 #define VK_KHR_VIDEO_DECODE_QUEUE_EXTENSION_NAME "VK_KHR_video_decode_queue" typedef enum VkVideoDecodeFlagBitsKHR { @@ -370,7 +369,7 @@ typedef struct VkPhysicalDevicePortabilitySubsetPropertiesKHR { #define VK_KHR_video_encode_queue 1 -#define VK_KHR_VIDEO_ENCODE_QUEUE_SPEC_VERSION 2 +#define VK_KHR_VIDEO_ENCODE_QUEUE_SPEC_VERSION 3 #define VK_KHR_VIDEO_ENCODE_QUEUE_EXTENSION_NAME "VK_KHR_video_encode_queue" typedef enum VkVideoEncodeFlagBitsKHR { @@ -433,10 +432,10 @@ VKAPI_ATTR void VKAPI_CALL vkCmdEncodeVideoKHR( #define VK_EXT_video_encode_h264 1 #include "vk_video/vulkan_video_codec_h264std.h" #include "vk_video/vulkan_video_codec_h264std_encode.h" -#define VK_EXT_VIDEO_ENCODE_H264_SPEC_VERSION 1 +#define VK_EXT_VIDEO_ENCODE_H264_SPEC_VERSION 2 #define VK_EXT_VIDEO_ENCODE_H264_EXTENSION_NAME "VK_EXT_video_encode_h264" -typedef enum VkVideoEncodeH264CapabilitiesFlagBitsEXT { +typedef enum VkVideoEncodeH264CapabilityFlagBitsEXT { VK_VIDEO_ENCODE_H264_CAPABILITY_CABAC_BIT_EXT = 0x00000001, VK_VIDEO_ENCODE_H264_CAPABILITY_CAVLC_BIT_EXT = 0x00000002, VK_VIDEO_ENCODE_H264_CAPABILITY_WEIGHTED_BI_PRED_IMPLICIT_BIT_EXT = 0x00000004, @@ -448,9 +447,9 @@ typedef enum VkVideoEncodeH264CapabilitiesFlagBitsEXT { VK_VIDEO_ENCODE_H264_CAPABILITY_DEBLOCKING_FILTER_PARTIAL_BIT_EXT = 0x00000100, VK_VIDEO_ENCODE_H264_CAPABILITY_MULTIPLE_SLICE_PER_FRAME_BIT_EXT = 0x00000200, VK_VIDEO_ENCODE_H264_CAPABILITY_EVENLY_DISTRIBUTED_SLICE_SIZE_BIT_EXT = 0x00000400, - VK_VIDEO_ENCODE_H264_CAPABILITIES_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF -} VkVideoEncodeH264CapabilitiesFlagBitsEXT; 
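/*
 * A minimal sketch of the capability-flag rename above
 * (VkVideoCapabilitiesFlagBitsKHR -> VkVideoCapabilityFlagBitsKHR),
 * assuming VK_ENABLE_BETA_EXTENSIONS is defined and "profile" is an
 * already-initialized VkVideoProfileKHR; callers now test the renamed
 * bits on VkVideoCapabilitiesKHR::capabilityFlags:
 */
static VkBool32
video_profile_supports_protected_content(VkPhysicalDevice pdev,
                                         const VkVideoProfileKHR *profile)
{
   VkVideoCapabilitiesKHR caps = {
      .sType = VK_STRUCTURE_TYPE_VIDEO_CAPABILITIES_KHR,
   };
   if (vkGetPhysicalDeviceVideoCapabilitiesKHR(pdev, profile, &caps) != VK_SUCCESS)
      return VK_FALSE;
   return (caps.capabilityFlags &
           VK_VIDEO_CAPABILITY_PROTECTED_CONTENT_BIT_KHR) != 0;
}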
-typedef VkFlags VkVideoEncodeH264CapabilitiesFlagsEXT; + VK_VIDEO_ENCODE_H264_CAPABILITY_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF +} VkVideoEncodeH264CapabilityFlagBitsEXT; +typedef VkFlags VkVideoEncodeH264CapabilityFlagsEXT; typedef enum VkVideoEncodeH264InputModeFlagBitsEXT { VK_VIDEO_ENCODE_H264_INPUT_MODE_FRAME_BIT_EXT = 0x00000001, @@ -475,19 +474,19 @@ typedef enum VkVideoEncodeH264CreateFlagBitsEXT { } VkVideoEncodeH264CreateFlagBitsEXT; typedef VkFlags VkVideoEncodeH264CreateFlagsEXT; typedef struct VkVideoEncodeH264CapabilitiesEXT { - VkStructureType sType; - const void* pNext; - VkVideoEncodeH264CapabilitiesFlagsEXT flags; - VkVideoEncodeH264InputModeFlagsEXT inputModeFlags; - VkVideoEncodeH264OutputModeFlagsEXT outputModeFlags; - VkExtent2D minPictureSizeInMbs; - VkExtent2D maxPictureSizeInMbs; - VkExtent2D inputImageDataAlignment; - uint8_t maxNumL0ReferenceForP; - uint8_t maxNumL0ReferenceForB; - uint8_t maxNumL1Reference; - uint8_t qualityLevelCount; - VkExtensionProperties stdExtensionVersion; + VkStructureType sType; + const void* pNext; + VkVideoEncodeH264CapabilityFlagsEXT flags; + VkVideoEncodeH264InputModeFlagsEXT inputModeFlags; + VkVideoEncodeH264OutputModeFlagsEXT outputModeFlags; + VkExtent2D minPictureSizeInMbs; + VkExtent2D maxPictureSizeInMbs; + VkExtent2D inputImageDataAlignment; + uint8_t maxNumL0ReferenceForP; + uint8_t maxNumL0ReferenceForB; + uint8_t maxNumL1Reference; + uint8_t qualityLevelCount; + VkExtensionProperties stdExtensionVersion; } VkVideoEncodeH264CapabilitiesEXT; typedef struct VkVideoEncodeH264SessionCreateInfoEXT { @@ -567,22 +566,22 @@ typedef struct VkVideoEncodeH264ProfileEXT { #define VK_EXT_video_decode_h264 1 #include "vk_video/vulkan_video_codec_h264std_decode.h" -#define VK_EXT_VIDEO_DECODE_H264_SPEC_VERSION 1 +#define VK_EXT_VIDEO_DECODE_H264_SPEC_VERSION 3 #define VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME "VK_EXT_video_decode_h264" -typedef enum VkVideoDecodeH264FieldLayoutFlagBitsEXT { - VK_VIDEO_DECODE_H264_PROGRESSIVE_PICTURES_ONLY_EXT = 0, - VK_VIDEO_DECODE_H264_FIELD_LAYOUT_LINE_INTERLACED_PLANE_BIT_EXT = 0x00000001, - VK_VIDEO_DECODE_H264_FIELD_LAYOUT_SEPARATE_INTERLACED_PLANE_BIT_EXT = 0x00000002, - VK_VIDEO_DECODE_H264_FIELD_LAYOUT_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF -} VkVideoDecodeH264FieldLayoutFlagBitsEXT; -typedef VkFlags VkVideoDecodeH264FieldLayoutFlagsEXT; +typedef enum VkVideoDecodeH264PictureLayoutFlagBitsEXT { + VK_VIDEO_DECODE_H264_PICTURE_LAYOUT_PROGRESSIVE_EXT = 0, + VK_VIDEO_DECODE_H264_PICTURE_LAYOUT_INTERLACED_INTERLEAVED_LINES_BIT_EXT = 0x00000001, + VK_VIDEO_DECODE_H264_PICTURE_LAYOUT_INTERLACED_SEPARATE_PLANES_BIT_EXT = 0x00000002, + VK_VIDEO_DECODE_H264_PICTURE_LAYOUT_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF +} VkVideoDecodeH264PictureLayoutFlagBitsEXT; +typedef VkFlags VkVideoDecodeH264PictureLayoutFlagsEXT; typedef VkFlags VkVideoDecodeH264CreateFlagsEXT; typedef struct VkVideoDecodeH264ProfileEXT { - VkStructureType sType; - const void* pNext; - StdVideoH264ProfileIdc stdProfileIdc; - VkVideoDecodeH264FieldLayoutFlagsEXT fieldLayout; + VkStructureType sType; + const void* pNext; + StdVideoH264ProfileIdc stdProfileIdc; + VkVideoDecodeH264PictureLayoutFlagsEXT pictureLayout; } VkVideoDecodeH264ProfileEXT; typedef struct VkVideoDecodeH264CapabilitiesEXT { diff --git a/mesa 3D driver/include/vulkan/vulkan_core.h b/mesa 3D driver/include/vulkan/vulkan_core.h index a5aafcc95b..83f2c3aa50 100644 --- a/mesa 3D driver/include/vulkan/vulkan_core.h +++ b/mesa 3D driver/include/vulkan/vulkan_core.h @@ -20,7 +20,7 @@ extern 
"C" { #define VK_VERSION_1_0 1 -#include "vulkan/vk_platform.h" +#include "vk_platform.h" #define VK_DEFINE_HANDLE(object) typedef struct object##_T* object; @@ -72,7 +72,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 185 +#define VK_HEADER_VERSION 195 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 2, VK_HEADER_VERSION) @@ -565,6 +565,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID = 1000129003, VK_STRUCTURE_TYPE_MEMORY_GET_ANDROID_HARDWARE_BUFFER_INFO_ANDROID = 1000129004, VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID = 1000129005, + VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID = 1000129006, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT = 1000138000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT = 1000138001, VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT = 1000138002, @@ -607,6 +608,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT = 1000158003, VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT = 1000158004, VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT = 1000158005, + VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT = 1000158006, VK_STRUCTURE_TYPE_VALIDATION_CACHE_CREATE_INFO_EXT = 1000160000, VK_STRUCTURE_TYPE_SHADER_MODULE_VALIDATION_CACHE_CREATE_INFO_EXT = 1000160001, #ifdef VK_ENABLE_BETA_EXTENSIONS @@ -754,6 +756,8 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV = 1000277007, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INHERITED_VIEWPORT_SCISSOR_FEATURES_NV = 1000278000, VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_VIEWPORT_SCISSOR_INFO_NV = 1000278001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR = 1000280000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR = 1000280001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT = 1000281000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT = 1000281001, VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_RENDER_PASS_TRANSFORM_INFO_QCOM = 1000282000, @@ -817,6 +821,7 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2_KHR = 1000337009, VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR = 1000337010, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT = 1000340000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RGBA10X6_FORMATS_FEATURES_EXT = 1000344000, VK_STRUCTURE_TYPE_DIRECTFB_SURFACE_CREATE_INFO_EXT = 1000346000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MUTABLE_DESCRIPTOR_TYPE_FEATURES_VALVE = 1000351000, VK_STRUCTURE_TYPE_MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_VALVE = 1000351002, @@ -824,11 +829,23 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_VERTEX_INPUT_BINDING_DESCRIPTION_2_EXT = 1000352001, VK_STRUCTURE_TYPE_VERTEX_INPUT_ATTRIBUTE_DESCRIPTION_2_EXT = 1000352002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT = 1000353000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT = 1000356000, + VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3_KHR = 1000360000, VK_STRUCTURE_TYPE_IMPORT_MEMORY_ZIRCON_HANDLE_INFO_FUCHSIA = 1000364000, VK_STRUCTURE_TYPE_MEMORY_ZIRCON_HANDLE_PROPERTIES_FUCHSIA = 1000364001, VK_STRUCTURE_TYPE_MEMORY_GET_ZIRCON_HANDLE_INFO_FUCHSIA = 1000364002, 
VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_ZIRCON_HANDLE_INFO_FUCHSIA = 1000365000, VK_STRUCTURE_TYPE_SEMAPHORE_GET_ZIRCON_HANDLE_INFO_FUCHSIA = 1000365001, + VK_STRUCTURE_TYPE_BUFFER_COLLECTION_CREATE_INFO_FUCHSIA = 1000366000, + VK_STRUCTURE_TYPE_IMPORT_MEMORY_BUFFER_COLLECTION_FUCHSIA = 1000366001, + VK_STRUCTURE_TYPE_BUFFER_COLLECTION_IMAGE_CREATE_INFO_FUCHSIA = 1000366002, + VK_STRUCTURE_TYPE_BUFFER_COLLECTION_PROPERTIES_FUCHSIA = 1000366003, + VK_STRUCTURE_TYPE_BUFFER_CONSTRAINTS_INFO_FUCHSIA = 1000366004, + VK_STRUCTURE_TYPE_BUFFER_COLLECTION_BUFFER_CREATE_INFO_FUCHSIA = 1000366005, + VK_STRUCTURE_TYPE_IMAGE_CONSTRAINTS_INFO_FUCHSIA = 1000366006, + VK_STRUCTURE_TYPE_IMAGE_FORMAT_CONSTRAINTS_INFO_FUCHSIA = 1000366007, + VK_STRUCTURE_TYPE_SYSMEM_COLOR_SPACE_FUCHSIA = 1000366008, + VK_STRUCTURE_TYPE_BUFFER_COLLECTION_CONSTRAINTS_INFO_FUCHSIA = 1000366009, VK_STRUCTURE_TYPE_SUBPASS_SHADING_PIPELINE_CREATE_INFO_HUAWEI = 1000369000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBPASS_SHADING_FEATURES_HUAWEI = 1000369001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBPASS_SHADING_PROPERTIES_HUAWEI = 1000369002, @@ -843,6 +860,11 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_EXT = 1000388001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT = 1000392000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT = 1000392001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT = 1000412000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES_KHR = 1000413000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_PROPERTIES_KHR = 1000413001, + VK_STRUCTURE_TYPE_DEVICE_BUFFER_MEMORY_REQUIREMENTS_KHR = 1000413002, + VK_STRUCTURE_TYPE_DEVICE_IMAGE_MEMORY_REQUIREMENTS_KHR = 1000413003, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -1060,6 +1082,7 @@ typedef enum VkObjectType { VK_OBJECT_TYPE_DEFERRED_OPERATION_KHR = 1000268000, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV = 1000277000, VK_OBJECT_TYPE_PRIVATE_DATA_SLOT_EXT = 1000295000, + VK_OBJECT_TYPE_BUFFER_COLLECTION_FUCHSIA = 1000366000, VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR = VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE, VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION_KHR = VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION, VK_OBJECT_TYPE_MAX_ENUM = 0x7FFFFFFF @@ -1706,13 +1729,15 @@ typedef enum VkAttachmentLoadOp { VK_ATTACHMENT_LOAD_OP_LOAD = 0, VK_ATTACHMENT_LOAD_OP_CLEAR = 1, VK_ATTACHMENT_LOAD_OP_DONT_CARE = 2, + VK_ATTACHMENT_LOAD_OP_NONE_EXT = 1000400000, VK_ATTACHMENT_LOAD_OP_MAX_ENUM = 0x7FFFFFFF } VkAttachmentLoadOp; typedef enum VkAttachmentStoreOp { VK_ATTACHMENT_STORE_OP_STORE = 0, VK_ATTACHMENT_STORE_OP_DONT_CARE = 1, - VK_ATTACHMENT_STORE_OP_NONE_QCOM = 1000301000, + VK_ATTACHMENT_STORE_OP_NONE_EXT = 1000301000, + VK_ATTACHMENT_STORE_OP_NONE_QCOM = VK_ATTACHMENT_STORE_OP_NONE_EXT, VK_ATTACHMENT_STORE_OP_MAX_ENUM = 0x7FFFFFFF } VkAttachmentStoreOp; @@ -2123,10 +2148,6 @@ typedef enum VkImageViewCreateFlagBits { VK_IMAGE_VIEW_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkImageViewCreateFlagBits; typedef VkFlags VkImageViewCreateFlags; - -typedef enum VkShaderModuleCreateFlagBits { - VK_SHADER_MODULE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VkShaderModuleCreateFlagBits; 
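/*
 * A minimal sketch of the new "none" load/store ops introduced above
 * (VK_EXT_load_store_op_none, defined later in this header): an
 * attachment whose contents the render pass leaves untouched can now be
 * described without a bogus LOAD/DONT_CARE pair. The format and layouts
 * below are illustrative assumptions.
 */
VkAttachmentDescription read_only_depth = {
   .format = VK_FORMAT_D24_UNORM_S8_UINT,
   .samples = VK_SAMPLE_COUNT_1_BIT,
   .loadOp = VK_ATTACHMENT_LOAD_OP_NONE_EXT,
   .storeOp = VK_ATTACHMENT_STORE_OP_NONE_EXT,
   .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_NONE_EXT,
   .stencilStoreOp = VK_ATTACHMENT_STORE_OP_NONE_EXT,
   .initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL,
   .finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL,
};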
typedef VkFlags VkShaderModuleCreateFlags; typedef enum VkPipelineCacheCreateFlagBits { @@ -5282,6 +5303,10 @@ typedef enum VkDriverId { VK_DRIVER_ID_MOLTENVK = 14, VK_DRIVER_ID_COREAVI_PROPRIETARY = 15, VK_DRIVER_ID_JUICE_PROPRIETARY = 16, + VK_DRIVER_ID_VERISILICON_PROPRIETARY = 17, + VK_DRIVER_ID_MESA_TURNIP = 18, + VK_DRIVER_ID_MESA_V3DV = 19, + VK_DRIVER_ID_MESA_PANVK = 20, VK_DRIVER_ID_AMD_PROPRIETARY_KHR = VK_DRIVER_ID_AMD_PROPRIETARY, VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR = VK_DRIVER_ID_AMD_OPEN_SOURCE, VK_DRIVER_ID_MESA_RADV_KHR = VK_DRIVER_ID_MESA_RADV, @@ -6563,8 +6588,10 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDispatchBaseKHR( #define VK_KHR_maintenance1 1 -#define VK_KHR_MAINTENANCE1_SPEC_VERSION 2 -#define VK_KHR_MAINTENANCE1_EXTENSION_NAME "VK_KHR_maintenance1" +#define VK_KHR_MAINTENANCE_1_SPEC_VERSION 2 +#define VK_KHR_MAINTENANCE_1_EXTENSION_NAME "VK_KHR_maintenance1" +#define VK_KHR_MAINTENANCE1_SPEC_VERSION VK_KHR_MAINTENANCE_1_SPEC_VERSION +#define VK_KHR_MAINTENANCE1_EXTENSION_NAME VK_KHR_MAINTENANCE_1_EXTENSION_NAME typedef VkCommandPoolTrimFlags VkCommandPoolTrimFlagsKHR; typedef void (VKAPI_PTR *PFN_vkTrimCommandPoolKHR)(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags); @@ -7144,8 +7171,10 @@ VKAPI_ATTR void VKAPI_CALL vkReleaseProfilingLockKHR( #define VK_KHR_maintenance2 1 -#define VK_KHR_MAINTENANCE2_SPEC_VERSION 1 -#define VK_KHR_MAINTENANCE2_EXTENSION_NAME "VK_KHR_maintenance2" +#define VK_KHR_MAINTENANCE_2_SPEC_VERSION 1 +#define VK_KHR_MAINTENANCE_2_EXTENSION_NAME "VK_KHR_maintenance2" +#define VK_KHR_MAINTENANCE2_SPEC_VERSION VK_KHR_MAINTENANCE_2_SPEC_VERSION +#define VK_KHR_MAINTENANCE2_EXTENSION_NAME VK_KHR_MAINTENANCE_2_EXTENSION_NAME typedef VkPointClippingBehavior VkPointClippingBehaviorKHR; typedef VkTessellationDomainOrigin VkTessellationDomainOriginKHR; @@ -7398,8 +7427,10 @@ VKAPI_ATTR VkResult VKAPI_CALL vkBindImageMemory2KHR( #define VK_KHR_maintenance3 1 -#define VK_KHR_MAINTENANCE3_SPEC_VERSION 1 -#define VK_KHR_MAINTENANCE3_EXTENSION_NAME "VK_KHR_maintenance3" +#define VK_KHR_MAINTENANCE_3_SPEC_VERSION 1 +#define VK_KHR_MAINTENANCE_3_EXTENSION_NAME "VK_KHR_maintenance3" +#define VK_KHR_MAINTENANCE3_SPEC_VERSION VK_KHR_MAINTENANCE_3_SPEC_VERSION +#define VK_KHR_MAINTENANCE3_EXTENSION_NAME VK_KHR_MAINTENANCE_3_EXTENSION_NAME typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR; typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR; @@ -7575,7 +7606,7 @@ typedef struct VkPhysicalDeviceShaderTerminateInvocationFeaturesKHR { #define VK_KHR_fragment_shading_rate 1 -#define VK_KHR_FRAGMENT_SHADING_RATE_SPEC_VERSION 1 +#define VK_KHR_FRAGMENT_SHADING_RATE_SPEC_VERSION 2 #define VK_KHR_FRAGMENT_SHADING_RATE_EXTENSION_NAME "VK_KHR_fragment_shading_rate" typedef enum VkFragmentShadingRateCombinerOpKHR { @@ -7864,6 +7895,52 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPipelineExecutableInternalRepresentationsKHR #endif +#define VK_KHR_shader_integer_dot_product 1 +#define VK_KHR_SHADER_INTEGER_DOT_PRODUCT_SPEC_VERSION 1 +#define VK_KHR_SHADER_INTEGER_DOT_PRODUCT_EXTENSION_NAME "VK_KHR_shader_integer_dot_product" +typedef struct VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 shaderIntegerDotProduct; +} VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR; + +typedef struct VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR { + VkStructureType sType; + void* pNext; + VkBool32 integerDotProduct8BitUnsignedAccelerated; + VkBool32 
integerDotProduct8BitSignedAccelerated; + VkBool32 integerDotProduct8BitMixedSignednessAccelerated; + VkBool32 integerDotProduct4x8BitPackedUnsignedAccelerated; + VkBool32 integerDotProduct4x8BitPackedSignedAccelerated; + VkBool32 integerDotProduct4x8BitPackedMixedSignednessAccelerated; + VkBool32 integerDotProduct16BitUnsignedAccelerated; + VkBool32 integerDotProduct16BitSignedAccelerated; + VkBool32 integerDotProduct16BitMixedSignednessAccelerated; + VkBool32 integerDotProduct32BitUnsignedAccelerated; + VkBool32 integerDotProduct32BitSignedAccelerated; + VkBool32 integerDotProduct32BitMixedSignednessAccelerated; + VkBool32 integerDotProduct64BitUnsignedAccelerated; + VkBool32 integerDotProduct64BitSignedAccelerated; + VkBool32 integerDotProduct64BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitUnsignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitSignedAccelerated; + VkBool32 integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated; +} VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR; + + + #define VK_KHR_pipeline_library 1 #define VK_KHR_PIPELINE_LIBRARY_SPEC_VERSION 1 #define VK_KHR_PIPELINE_LIBRARY_EXTENSION_NAME "VK_KHR_pipeline_library" @@ -8350,6 +8427,117 @@ VKAPI_ATTR void VKAPI_CALL vkCmdResolveImage2KHR( #endif +#define VK_KHR_format_feature_flags2 1 +#define VK_KHR_FORMAT_FEATURE_FLAGS_2_SPEC_VERSION 1 +#define VK_KHR_FORMAT_FEATURE_FLAGS_2_EXTENSION_NAME "VK_KHR_format_feature_flags2" +typedef VkFlags64 VkFormatFeatureFlags2KHR; + +// Flag bits for VkFormatFeatureFlagBits2KHR +typedef VkFlags64 VkFormatFeatureFlagBits2KHR; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR = 0x00000001ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT_KHR = 0x00000002ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT_KHR = 0x00000004ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT_KHR = 0x00000008ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT_KHR = 0x00000010ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT_KHR = 0x00000020ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT_KHR = 0x00000040ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT_KHR = 0x00000080ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT_KHR = 0x00000100ULL; +static 
const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT_KHR = 0x00000200ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_BLIT_SRC_BIT_KHR = 0x00000400ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_BLIT_DST_BIT_KHR = 0x00000800ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT_KHR = 0x00001000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_CUBIC_BIT_EXT = 0x00002000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT_KHR = 0x00004000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT_KHR = 0x00008000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT_KHR = 0x00010000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT_KHR = 0x00020000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT_KHR = 0x00040000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT_KHR = 0x00080000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_BIT_KHR = 0x00100000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_CHROMA_RECONSTRUCTION_EXPLICIT_FORCEABLE_BIT_KHR = 0x00200000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_DISJOINT_BIT_KHR = 0x00400000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT_KHR = 0x00800000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT_KHR = 0x80000000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT_KHR = 0x100000000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT_KHR = 0x200000000ULL; +#ifdef VK_ENABLE_BETA_EXTENSIONS +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_VIDEO_DECODE_OUTPUT_BIT_KHR = 0x02000000ULL; +#endif +#ifdef VK_ENABLE_BETA_EXTENSIONS +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_VIDEO_DECODE_DPB_BIT_KHR = 0x04000000ULL; +#endif +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR = 0x20000000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_FRAGMENT_DENSITY_MAP_BIT_EXT = 0x01000000ULL; +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 0x40000000ULL; +#ifdef VK_ENABLE_BETA_EXTENSIONS +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_VIDEO_ENCODE_INPUT_BIT_KHR = 0x08000000ULL; +#endif +#ifdef VK_ENABLE_BETA_EXTENSIONS +static const VkFormatFeatureFlagBits2KHR VK_FORMAT_FEATURE_2_VIDEO_ENCODE_DPB_BIT_KHR = 0x10000000ULL; +#endif + +typedef struct VkFormatProperties3KHR { + VkStructureType sType; + void* pNext; + VkFormatFeatureFlags2KHR linearTilingFeatures; + VkFormatFeatureFlags2KHR optimalTilingFeatures; + VkFormatFeatureFlags2KHR bufferFeatures; +} VkFormatProperties3KHR; + + + +#define VK_KHR_maintenance4 1 +#define VK_KHR_MAINTENANCE_4_SPEC_VERSION 1 +#define VK_KHR_MAINTENANCE_4_EXTENSION_NAME "VK_KHR_maintenance4" +typedef struct VkPhysicalDeviceMaintenance4FeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 
maintenance4; +} VkPhysicalDeviceMaintenance4FeaturesKHR; + +typedef struct VkPhysicalDeviceMaintenance4PropertiesKHR { + VkStructureType sType; + void* pNext; + VkDeviceSize maxBufferSize; +} VkPhysicalDeviceMaintenance4PropertiesKHR; + +typedef struct VkDeviceBufferMemoryRequirementsKHR { + VkStructureType sType; + const void* pNext; + const VkBufferCreateInfo* pCreateInfo; +} VkDeviceBufferMemoryRequirementsKHR; + +typedef struct VkDeviceImageMemoryRequirementsKHR { + VkStructureType sType; + const void* pNext; + const VkImageCreateInfo* pCreateInfo; + VkImageAspectFlagBits planeAspect; +} VkDeviceImageMemoryRequirementsKHR; + +typedef void (VKAPI_PTR *PFN_vkGetDeviceBufferMemoryRequirementsKHR)(VkDevice device, const VkDeviceBufferMemoryRequirementsKHR* pInfo, VkMemoryRequirements2* pMemoryRequirements); +typedef void (VKAPI_PTR *PFN_vkGetDeviceImageMemoryRequirementsKHR)(VkDevice device, const VkDeviceImageMemoryRequirementsKHR* pInfo, VkMemoryRequirements2* pMemoryRequirements); +typedef void (VKAPI_PTR *PFN_vkGetDeviceImageSparseMemoryRequirementsKHR)(VkDevice device, const VkDeviceImageMemoryRequirementsKHR* pInfo, uint32_t* pSparseMemoryRequirementCount, VkSparseImageMemoryRequirements2* pSparseMemoryRequirements); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetDeviceBufferMemoryRequirementsKHR( + VkDevice device, + const VkDeviceBufferMemoryRequirementsKHR* pInfo, + VkMemoryRequirements2* pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetDeviceImageMemoryRequirementsKHR( + VkDevice device, + const VkDeviceImageMemoryRequirementsKHR* pInfo, + VkMemoryRequirements2* pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetDeviceImageSparseMemoryRequirementsKHR( + VkDevice device, + const VkDeviceImageMemoryRequirementsKHR* pInfo, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2* pSparseMemoryRequirements); +#endif + + #define VK_EXT_debug_report 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDebugReportCallbackEXT) #define VK_EXT_DEBUG_REPORT_SPEC_VERSION 10 @@ -8394,6 +8582,7 @@ typedef enum VkDebugReportObjectTypeEXT { VK_DEBUG_REPORT_OBJECT_TYPE_CU_FUNCTION_NVX_EXT = 1000029001, VK_DEBUG_REPORT_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR_EXT = 1000150000, VK_DEBUG_REPORT_OBJECT_TYPE_ACCELERATION_STRUCTURE_NV_EXT = 1000165000, + VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_COLLECTION_FUCHSIA_EXT = 1000366000, VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT_EXT, VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_VALIDATION_CACHE_EXT_EXT, VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_EXT, @@ -9253,8 +9442,10 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetPastPresentationTimingGOOGLE( #define VK_NV_viewport_array2 1 -#define VK_NV_VIEWPORT_ARRAY2_SPEC_VERSION 1 -#define VK_NV_VIEWPORT_ARRAY2_EXTENSION_NAME "VK_NV_viewport_array2" +#define VK_NV_VIEWPORT_ARRAY_2_SPEC_VERSION 1 +#define VK_NV_VIEWPORT_ARRAY_2_EXTENSION_NAME "VK_NV_viewport_array2" +#define VK_NV_VIEWPORT_ARRAY2_SPEC_VERSION VK_NV_VIEWPORT_ARRAY_2_SPEC_VERSION +#define VK_NV_VIEWPORT_ARRAY2_EXTENSION_NAME VK_NV_VIEWPORT_ARRAY_2_EXTENSION_NAME #define VK_NVX_multiview_per_view_attributes 1 @@ -9824,7 +10015,7 @@ typedef struct VkPhysicalDeviceShaderSMBuiltinsFeaturesNV { #define VK_EXT_image_drm_format_modifier 1 -#define VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_SPEC_VERSION 1 +#define VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_SPEC_VERSION 2 #define 
VK_EXT_IMAGE_DRM_FORMAT_MODIFIER_EXTENSION_NAME "VK_EXT_image_drm_format_modifier" typedef struct VkDrmFormatModifierPropertiesEXT { uint64_t drmFormatModifier; @@ -9869,6 +10060,19 @@ typedef struct VkImageDrmFormatModifierPropertiesEXT { uint64_t drmFormatModifier; } VkImageDrmFormatModifierPropertiesEXT; +typedef struct VkDrmFormatModifierProperties2EXT { + uint64_t drmFormatModifier; + uint32_t drmFormatModifierPlaneCount; + VkFormatFeatureFlags2KHR drmFormatModifierTilingFeatures; +} VkDrmFormatModifierProperties2EXT; + +typedef struct VkDrmFormatModifierPropertiesList2EXT { + VkStructureType sType; + void* pNext; + uint32_t drmFormatModifierCount; + VkDrmFormatModifierProperties2EXT* pDrmFormatModifierProperties; +} VkDrmFormatModifierPropertiesList2EXT; + typedef VkResult (VKAPI_PTR *PFN_vkGetImageDrmFormatModifierPropertiesEXT)(VkDevice device, VkImage image, VkImageDrmFormatModifierPropertiesEXT* pProperties); #ifndef VK_NO_PROTOTYPES @@ -10133,9 +10337,10 @@ typedef VkGeometryFlagBitsKHR VkGeometryFlagBitsNV; typedef enum VkGeometryInstanceFlagBitsKHR { VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR = 0x00000001, - VK_GEOMETRY_INSTANCE_TRIANGLE_FRONT_COUNTERCLOCKWISE_BIT_KHR = 0x00000002, + VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR = 0x00000002, VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR = 0x00000004, VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR = 0x00000008, + VK_GEOMETRY_INSTANCE_TRIANGLE_FRONT_COUNTERCLOCKWISE_BIT_KHR = VK_GEOMETRY_INSTANCE_TRIANGLE_FLIP_FACING_BIT_KHR, VK_GEOMETRY_INSTANCE_TRIANGLE_CULL_DISABLE_BIT_NV = VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR, VK_GEOMETRY_INSTANCE_TRIANGLE_FRONT_COUNTERCLOCKWISE_BIT_NV = VK_GEOMETRY_INSTANCE_TRIANGLE_FRONT_COUNTERCLOCKWISE_BIT_KHR, VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_NV = VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR, @@ -11029,7 +11234,7 @@ VKAPI_ATTR void VKAPI_CALL vkSetLocalDimmingAMD( #define VK_EXT_fragment_density_map 1 -#define VK_EXT_FRAGMENT_DENSITY_MAP_SPEC_VERSION 1 +#define VK_EXT_FRAGMENT_DENSITY_MAP_SPEC_VERSION 2 #define VK_EXT_FRAGMENT_DENSITY_MAP_EXTENSION_NAME "VK_EXT_fragment_density_map" typedef struct VkPhysicalDeviceFragmentDensityMapFeaturesEXT { VkStructureType sType; @@ -11063,8 +11268,10 @@ typedef VkPhysicalDeviceScalarBlockLayoutFeatures VkPhysicalDeviceScalarBlockLay #define VK_GOOGLE_hlsl_functionality1 1 -#define VK_GOOGLE_HLSL_FUNCTIONALITY1_SPEC_VERSION 1 -#define VK_GOOGLE_HLSL_FUNCTIONALITY1_EXTENSION_NAME "VK_GOOGLE_hlsl_functionality1" +#define VK_GOOGLE_HLSL_FUNCTIONALITY_1_SPEC_VERSION 1 +#define VK_GOOGLE_HLSL_FUNCTIONALITY_1_EXTENSION_NAME "VK_GOOGLE_hlsl_functionality1" +#define VK_GOOGLE_HLSL_FUNCTIONALITY1_SPEC_VERSION VK_GOOGLE_HLSL_FUNCTIONALITY_1_SPEC_VERSION +#define VK_GOOGLE_HLSL_FUNCTIONALITY1_EXTENSION_NAME VK_GOOGLE_HLSL_FUNCTIONALITY_1_EXTENSION_NAME #define VK_GOOGLE_decorate_string 1 @@ -12354,6 +12561,17 @@ typedef struct VkPhysicalDevice4444FormatsFeaturesEXT { +#define VK_EXT_rgba10x6_formats 1 +#define VK_EXT_RGBA10X6_FORMATS_SPEC_VERSION 1 +#define VK_EXT_RGBA10X6_FORMATS_EXTENSION_NAME "VK_EXT_rgba10x6_formats" +typedef struct VkPhysicalDeviceRGBA10X6FormatsFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 formatRgba10x6WithoutYCbCrSampler; +} VkPhysicalDeviceRGBA10X6FormatsFeaturesEXT; + + + #define VK_NV_acquire_winrt_display 1 #define VK_NV_ACQUIRE_WINRT_DISPLAY_SPEC_VERSION 1 #define VK_NV_ACQUIRE_WINRT_DISPLAY_EXTENSION_NAME "VK_NV_acquire_winrt_display" @@ -12450,6 +12668,18 @@ typedef struct 
VkPhysicalDeviceDrmPropertiesEXT { +#define VK_EXT_primitive_topology_list_restart 1 +#define VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_SPEC_VERSION 1 +#define VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME "VK_EXT_primitive_topology_list_restart" +typedef struct VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 primitiveTopologyListRestart; + VkBool32 primitiveTopologyPatchListRestart; +} VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT; + + + #define VK_HUAWEI_subpass_shading 1 #define VK_HUAWEI_SUBPASS_SHADING_SPEC_VERSION 2 #define VK_HUAWEI_SUBPASS_SHADING_EXTENSION_NAME "VK_HUAWEI_subpass_shading" @@ -12666,9 +12896,33 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDrawMultiIndexedEXT( #endif +#define VK_EXT_load_store_op_none 1 +#define VK_EXT_LOAD_STORE_OP_NONE_SPEC_VERSION 1 +#define VK_EXT_LOAD_STORE_OP_NONE_EXTENSION_NAME "VK_EXT_load_store_op_none" + + +#define VK_EXT_pageable_device_local_memory 1 +#define VK_EXT_PAGEABLE_DEVICE_LOCAL_MEMORY_SPEC_VERSION 1 +#define VK_EXT_PAGEABLE_DEVICE_LOCAL_MEMORY_EXTENSION_NAME "VK_EXT_pageable_device_local_memory" +typedef struct VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 pageableDeviceLocalMemory; +} VkPhysicalDevicePageableDeviceLocalMemoryFeaturesEXT; + +typedef void (VKAPI_PTR *PFN_vkSetDeviceMemoryPriorityEXT)(VkDevice device, VkDeviceMemory memory, float priority); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkSetDeviceMemoryPriorityEXT( + VkDevice device, + VkDeviceMemory memory, + float priority); +#endif + + #define VK_KHR_acceleration_structure 1 VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkAccelerationStructureKHR) -#define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 11 +#define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 13 #define VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME "VK_KHR_acceleration_structure" typedef enum VkBuildAccelerationStructureModeKHR { diff --git a/mesa 3D driver/include/vulkan/vulkan_fuchsia.h b/mesa 3D driver/include/vulkan/vulkan_fuchsia.h index d558715738..bc47273a3b 100644 --- a/mesa 3D driver/include/vulkan/vulkan_fuchsia.h +++ b/mesa 3D driver/include/vulkan/vulkan_fuchsia.h @@ -114,6 +114,147 @@ VKAPI_ATTR VkResult VKAPI_CALL vkGetSemaphoreZirconHandleFUCHSIA( zx_handle_t* pZirconHandle); #endif + +#define VK_FUCHSIA_buffer_collection 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkBufferCollectionFUCHSIA) +#define VK_FUCHSIA_BUFFER_COLLECTION_SPEC_VERSION 2 +#define VK_FUCHSIA_BUFFER_COLLECTION_EXTENSION_NAME "VK_FUCHSIA_buffer_collection" + +typedef enum VkImageFormatConstraintsFlagBitsFUCHSIA { + VK_IMAGE_FORMAT_CONSTRAINTS_FLAG_BITS_MAX_ENUM_FUCHSIA = 0x7FFFFFFF +} VkImageFormatConstraintsFlagBitsFUCHSIA; +typedef VkFlags VkImageFormatConstraintsFlagsFUCHSIA; + +typedef enum VkImageConstraintsInfoFlagBitsFUCHSIA { + VK_IMAGE_CONSTRAINTS_INFO_CPU_READ_RARELY_FUCHSIA = 0x00000001, + VK_IMAGE_CONSTRAINTS_INFO_CPU_READ_OFTEN_FUCHSIA = 0x00000002, + VK_IMAGE_CONSTRAINTS_INFO_CPU_WRITE_RARELY_FUCHSIA = 0x00000004, + VK_IMAGE_CONSTRAINTS_INFO_CPU_WRITE_OFTEN_FUCHSIA = 0x00000008, + VK_IMAGE_CONSTRAINTS_INFO_PROTECTED_OPTIONAL_FUCHSIA = 0x00000010, + VK_IMAGE_CONSTRAINTS_INFO_FLAG_BITS_MAX_ENUM_FUCHSIA = 0x7FFFFFFF +} VkImageConstraintsInfoFlagBitsFUCHSIA; +typedef VkFlags VkImageConstraintsInfoFlagsFUCHSIA; +typedef struct VkBufferCollectionCreateInfoFUCHSIA { + VkStructureType sType; + const void* pNext; + zx_handle_t collectionToken; +} VkBufferCollectionCreateInfoFUCHSIA; + +typedef 
struct VkImportMemoryBufferCollectionFUCHSIA { + VkStructureType sType; + const void* pNext; + VkBufferCollectionFUCHSIA collection; + uint32_t index; +} VkImportMemoryBufferCollectionFUCHSIA; + +typedef struct VkBufferCollectionImageCreateInfoFUCHSIA { + VkStructureType sType; + const void* pNext; + VkBufferCollectionFUCHSIA collection; + uint32_t index; +} VkBufferCollectionImageCreateInfoFUCHSIA; + +typedef struct VkBufferCollectionConstraintsInfoFUCHSIA { + VkStructureType sType; + const void* pNext; + uint32_t minBufferCount; + uint32_t maxBufferCount; + uint32_t minBufferCountForCamping; + uint32_t minBufferCountForDedicatedSlack; + uint32_t minBufferCountForSharedSlack; +} VkBufferCollectionConstraintsInfoFUCHSIA; + +typedef struct VkBufferConstraintsInfoFUCHSIA { + VkStructureType sType; + const void* pNext; + VkBufferCreateInfo createInfo; + VkFormatFeatureFlags requiredFormatFeatures; + VkBufferCollectionConstraintsInfoFUCHSIA bufferCollectionConstraints; +} VkBufferConstraintsInfoFUCHSIA; + +typedef struct VkBufferCollectionBufferCreateInfoFUCHSIA { + VkStructureType sType; + const void* pNext; + VkBufferCollectionFUCHSIA collection; + uint32_t index; +} VkBufferCollectionBufferCreateInfoFUCHSIA; + +typedef struct VkSysmemColorSpaceFUCHSIA { + VkStructureType sType; + const void* pNext; + uint32_t colorSpace; +} VkSysmemColorSpaceFUCHSIA; + +typedef struct VkBufferCollectionPropertiesFUCHSIA { + VkStructureType sType; + void* pNext; + uint32_t memoryTypeBits; + uint32_t bufferCount; + uint32_t createInfoIndex; + uint64_t sysmemPixelFormat; + VkFormatFeatureFlags formatFeatures; + VkSysmemColorSpaceFUCHSIA sysmemColorSpaceIndex; + VkComponentMapping samplerYcbcrConversionComponents; + VkSamplerYcbcrModelConversion suggestedYcbcrModel; + VkSamplerYcbcrRange suggestedYcbcrRange; + VkChromaLocation suggestedXChromaOffset; + VkChromaLocation suggestedYChromaOffset; +} VkBufferCollectionPropertiesFUCHSIA; + +typedef struct VkImageFormatConstraintsInfoFUCHSIA { + VkStructureType sType; + const void* pNext; + VkImageCreateInfo imageCreateInfo; + VkFormatFeatureFlags requiredFormatFeatures; + VkImageFormatConstraintsFlagsFUCHSIA flags; + uint64_t sysmemPixelFormat; + uint32_t colorSpaceCount; + const VkSysmemColorSpaceFUCHSIA* pColorSpaces; +} VkImageFormatConstraintsInfoFUCHSIA; + +typedef struct VkImageConstraintsInfoFUCHSIA { + VkStructureType sType; + const void* pNext; + uint32_t formatConstraintsCount; + const VkImageFormatConstraintsInfoFUCHSIA* pFormatConstraints; + VkBufferCollectionConstraintsInfoFUCHSIA bufferCollectionConstraints; + VkImageConstraintsInfoFlagsFUCHSIA flags; +} VkImageConstraintsInfoFUCHSIA; + +typedef VkResult (VKAPI_PTR *PFN_vkCreateBufferCollectionFUCHSIA)(VkDevice device, const VkBufferCollectionCreateInfoFUCHSIA* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkBufferCollectionFUCHSIA* pCollection); +typedef VkResult (VKAPI_PTR *PFN_vkSetBufferCollectionImageConstraintsFUCHSIA)(VkDevice device, VkBufferCollectionFUCHSIA collection, const VkImageConstraintsInfoFUCHSIA* pImageConstraintsInfo); +typedef VkResult (VKAPI_PTR *PFN_vkSetBufferCollectionBufferConstraintsFUCHSIA)(VkDevice device, VkBufferCollectionFUCHSIA collection, const VkBufferConstraintsInfoFUCHSIA* pBufferConstraintsInfo); +typedef void (VKAPI_PTR *PFN_vkDestroyBufferCollectionFUCHSIA)(VkDevice device, VkBufferCollectionFUCHSIA collection, const VkAllocationCallbacks* pAllocator); +typedef VkResult (VKAPI_PTR *PFN_vkGetBufferCollectionPropertiesFUCHSIA)(VkDevice device, 
VkBufferCollectionFUCHSIA collection, VkBufferCollectionPropertiesFUCHSIA* pProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateBufferCollectionFUCHSIA( + VkDevice device, + const VkBufferCollectionCreateInfoFUCHSIA* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkBufferCollectionFUCHSIA* pCollection); + +VKAPI_ATTR VkResult VKAPI_CALL vkSetBufferCollectionImageConstraintsFUCHSIA( + VkDevice device, + VkBufferCollectionFUCHSIA collection, + const VkImageConstraintsInfoFUCHSIA* pImageConstraintsInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkSetBufferCollectionBufferConstraintsFUCHSIA( + VkDevice device, + VkBufferCollectionFUCHSIA collection, + const VkBufferConstraintsInfoFUCHSIA* pBufferConstraintsInfo); + +VKAPI_ATTR void VKAPI_CALL vkDestroyBufferCollectionFUCHSIA( + VkDevice device, + VkBufferCollectionFUCHSIA collection, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetBufferCollectionPropertiesFUCHSIA( + VkDevice device, + VkBufferCollectionFUCHSIA collection, + VkBufferCollectionPropertiesFUCHSIA* pProperties); +#endif + #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/meson.build b/mesa 3D driver/meson.build index 2106094cb4..2947a03399 100644 --- a/mesa 3D driver/meson.build +++ b/mesa 3D driver/meson.build @@ -30,6 +30,13 @@ project( default_options : ['buildtype=debugoptimized', 'b_ndebug=if-release', 'c_std=c11', 'cpp_std=c++14'] ) +# For meson >= 0.55.0, meson can inject some extra arguments to get richer +# results from gtest based tests. +gtest_test_protocol = 'exitcode' +if meson.version().version_compare('>= 0.55.0') + gtest_test_protocol = 'gtest' +endif + cc = meson.get_compiler('c') cpp = meson.get_compiler('cpp') @@ -78,7 +85,6 @@ if with_tools.contains('all') 'asahi', ] endif -with_clc = false with_any_vulkan_layers = get_option('vulkan-layers').length() != 0 with_intel_tools = with_tools.contains('intel') or with_tools.contains('intel-ui') @@ -205,7 +211,7 @@ if gallium_drivers.contains('auto') if ['x86', 'x86_64'].contains(host_machine.cpu_family()) gallium_drivers = [ 'r300', 'r600', 'radeonsi', 'nouveau', 'virgl', 'svga', 'swrast', - 'iris' + 'iris', 'crocus' ] elif ['arm', 'aarch64'].contains(host_machine.cpu_family()) gallium_drivers = [ @@ -308,19 +314,9 @@ if with_aco_tests and not with_amd_vk error('ACO tests require Radv') endif -dep_clang = dependency( - 'clang', - method : 'cmake', - static : true, - modules : [ - 'clangBasic', 'clangCodeGen', 'clangDriver', 'clangFrontend', 'clangFrontendTool', - 'clangHandleCXX', 'clangHandleLLVM', - ], - required : get_option('microsoft-clc'), -) -with_microsoft_clc = dep_clang.found() -with_clc = dep_clang.found() - +with_microsoft_clc = get_option('microsoft-clc').enabled() +with_clc = with_microsoft_clc +with_libclc = with_clc with_spirv_to_dxil = get_option('spirv-to-dxil') if host_machine.system() == 'darwin' @@ -425,6 +421,35 @@ else with_xlib_lease = _xlib_lease == 'enabled' endif +if with_platform_wayland + c_args += '-DVK_USE_PLATFORM_WAYLAND_KHR' + #add this once aco and other places can build with it + #cpp_args += '-DVK_USE_PLATFORM_WAYLAND_KHR' +endif +if with_platform_x11 + c_args += ['-DVK_USE_PLATFORM_XCB_KHR', '-DVK_USE_PLATFORM_XLIB_KHR'] + #add this once aco and other places can build with it + #cpp_args += ['-DVK_USE_PLATFORM_XCB_KHR', '-DVK_USE_PLATFORM_XLIB_KHR'] +endif +if with_platform_windows + c_args += '-DVK_USE_PLATFORM_WIN32_KHR' + #add this once aco and other places can build with it + #cpp_args += 
'-DVK_USE_PLATFORM_WIN32_KHR' +endif +if with_platform_android + c_args += '-DVK_USE_PLATFORM_ANDROID_KHR' + cpp_args += '-DVK_USE_PLATFORM_ANDROID_KHR' +endif +if with_xlib_lease + c_args += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT' + #add this once aco and other places can build with it + #cpp_args += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT' +endif +if system_has_kms_drm and not with_platform_android + c_args += '-DVK_USE_PLATFORM_DISPLAY_KHR' + cpp_args += '-DVK_USE_PLATFORM_DISPLAY_KHR' +endif + _egl = get_option('egl') if _egl == 'true' _egl = 'enabled' @@ -435,18 +460,19 @@ elif _egl == 'false' endif if _egl == 'auto' with_egl = ( - not ['darwin', 'windows'].contains(host_machine.system()) and - with_dri and with_shared_glapi + host_machine.system() != 'darwin' and + (with_platform_windows or with_dri) and + with_shared_glapi ) elif _egl == 'enabled' - if not with_dri and not with_platform_haiku - error('EGL requires dri') + if not with_dri and not with_platform_haiku and not with_platform_windows + error('EGL requires dri, haiku, or windows') elif not with_shared_glapi error('EGL requires shared-glapi') elif not ['disabled', 'dri'].contains(with_glx) error('EGL requires dri, but a GLX is being built without dri') - elif ['darwin', 'windows'].contains(host_machine.system()) - error('EGL is not available on Windows or MacOS') + elif host_machine.system() == 'darwin' + error('EGL is not available on MacOS') endif with_egl = true else @@ -615,7 +641,7 @@ dep_dxheaders = null_dep if with_gallium_d3d12 or with_microsoft_clc dep_dxheaders = dependency('DirectX-Headers', fallback : ['DirectX-Headers', 'dep_dxheaders'], required : with_gallium_d3d12 - ) + ) endif if with_vulkan_overlay_layer or with_aco_tests @@ -818,7 +844,7 @@ if with_gallium_st_nine error('The nine state tracker requires gallium softpipe/llvmpipe.') elif not (with_gallium_radeonsi or with_gallium_nouveau or with_gallium_r600 or with_gallium_r300 or with_gallium_svga or with_gallium_i915 - or with_gallium_iris or with_gallium_crocus) + or with_gallium_iris or with_gallium_crocus or with_gallium_zink) error('The nine state tracker requires at least one non-swrast gallium driver.') endif if not with_dri3 @@ -866,13 +892,20 @@ if _power8 != 'disabled' endif endif +if get_option('vmware-mks-stats') + if not with_gallium_svga + error('vmware-mks-stats requires gallium VMware/svga driver.') + endif + pre_args += '-DVMX86_STATS=1' +endif + _opencl = get_option('gallium-opencl') if _opencl != 'disabled' if not with_gallium error('OpenCL Clover implementation requires at least one gallium driver.') endif - with_clc = true + with_libclc = true with_gallium_opencl = true with_opencl_icd = _opencl == 'icd' else @@ -881,7 +914,7 @@ else endif dep_clc = null_dep -if with_clc +if with_libclc dep_clc = dependency('libclc') endif @@ -913,6 +946,9 @@ endif if with_gbm and not with_platform_android pre_args += '-DHAVE_DRM_PLATFORM' endif +if with_platform_windows + pre_args += '-DHAVE_WINDOWS_PLATFORM' +endif with_android_stub = get_option('android-stub') if with_android_stub and not with_platform_android @@ -1077,6 +1113,7 @@ if cc.get_id() == 'msvc' '/wd5105', # macro expansion producing 'defined' has undefined behavior (winbase.h, need Windows SDK upgrade) '/we4020', # Error when passing the wrong number of parameters '/we4024', # Error when passing different type of parameter + '/Zc:__cplusplus', #Set __cplusplus macro to match the /std:c++ on the command line ] if cc.has_argument(a) c_args += a @@ -1094,7 +1131,6 @@ else 
'-Werror=incompatible-pointer-types', '-Werror=int-conversion', '-Wimplicit-fallthrough', - '-Werror=thread-safety', '-Wno-missing-field-initializers', '-Wno-format-truncation', '-fno-math-errno', @@ -1106,6 +1142,10 @@ else if not (cc.get_id() == 'gcc' and host_machine.system() == 'windows') _trial += ['-Werror=format', '-Wformat-security'] endif + # FreeBSD annotated but Mesa isn't ready + if not (cc.get_id() == 'clang' and host_machine.system() == 'freebsd') + _trial += ['-Werror=thread-safety'] + endif foreach a : _trial if cc.has_argument(a) c_args += a @@ -1333,7 +1373,7 @@ if not ['linux'].contains(host_machine.system()) endif endif -foreach h : ['xlocale.h', 'linux/futex.h', 'endian.h', 'dlfcn.h', 'execinfo.h', 'sys/shm.h', 'cet.h', 'pthread_np.h'] +foreach h : ['xlocale.h', 'linux/futex.h', 'endian.h', 'dlfcn.h', 'sys/shm.h', 'cet.h', 'pthread_np.h'] if cc.check_header(h) pre_args += '-DHAVE_@0@'.format(h.to_upper().underscorify()) endif @@ -1595,8 +1635,8 @@ if with_gallium_opencl ] llvm_optional_modules += ['frontendopenmp'] endif -if with_microsoft_clc - llvm_modules += ['target', 'linker', 'irreader', 'option', 'libdriver'] +if with_clc + llvm_modules += ['coverage', 'target', 'linker', 'irreader', 'option', 'libdriver', 'lto'] endif if with_tests or with_gallium_softpipe llvm_modules += 'native' @@ -1604,7 +1644,7 @@ endif if with_amd_vk or with_gallium_radeonsi _llvm_version = '>= 11.0.0' -elif with_microsoft_clc +elif with_clc _llvm_version = '>= 10.0.0' elif with_gallium_opencl _llvm_version = '>= 8.0.0' @@ -1657,7 +1697,7 @@ if _llvm != 'disabled' optional_modules : llvm_optional_modules, required : ( with_amd_vk or with_gallium_radeonsi or with_gallium_swr or - with_gallium_opencl or with_microsoft_clc or _llvm == 'enabled' + with_gallium_opencl or with_clc or _llvm == 'enabled' ), static : not _shared_llvm, method : _llvm_method, @@ -1710,17 +1750,19 @@ if with_llvm language : ['c', 'cpp'], ) endif -elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr or with_swrast_vk - error('The following drivers require LLVM: Radv, RadeonSI, SWR, Lavapipe. One of these is enabled, but LLVM is disabled.') +elif with_amd_vk and with_aco_tests + error('ACO tests require LLVM, but LLVM is disabled.') +elif with_gallium_radeonsi or with_gallium_swr or with_swrast_vk + error('The following drivers require LLVM: RadeonSI, SWR, Lavapipe. 
One of these is enabled, but LLVM is disabled.') elif with_gallium_opencl error('The OpenCL "Clover" state tracker requires LLVM, but LLVM is disabled.') -elif with_microsoft_clc - error('The Microsoft CLC compiler requires LLVM, but LLVM is disabled.') +elif with_clc + error('The CLC compiler requires LLVM, but LLVM is disabled.') else draw_with_llvm = false endif -with_opencl_spirv = (_opencl != 'disabled' and get_option('opencl-spirv')) or with_microsoft_clc +with_opencl_spirv = (_opencl != 'disabled' and get_option('opencl-spirv')) or with_clc if with_opencl_spirv chosen_llvm_version_array = dep_llvm.version().split('.') chosen_llvm_version_major = chosen_llvm_version_array[0].to_int() @@ -1745,6 +1787,30 @@ else dep_llvmspirvlib = null_dep endif +dep_clang = null_dep +if with_clc + llvm_libdir = dep_llvm.get_variable(cmake : 'LLVM_LIBRARY_DIR', configtool: 'libdir') + + clang_modules = [ + 'clangBasic', 'clangAST', 'clangCodeGen', 'clangLex', + 'clangDriver', 'clangFrontend', 'clangFrontendTool', + 'clangHandleCXX', 'clangHandleLLVM', 'clangSerialization', + 'clangSema', 'clangParse', 'clangEdit', 'clangAnalysis' + ] + + dep_clang = [] + foreach m : clang_modules + dep_clang += cpp.find_library(m, dirs : llvm_libdir, required : true) + endforeach +endif + +# Be explicit about only using this lib on Windows, to avoid picking +# up random libs with the generic name 'libversion' +dep_version = null_dep +if with_opencl_spirv and host_machine.system() == 'windows' + dep_version = cpp.find_library('version') +endif + with_opencl_native = _opencl != 'disabled' and get_option('opencl-native') if (with_amd_vk or with_gallium_radeonsi or @@ -1986,6 +2052,7 @@ if with_platform_x11 dep_xcb_present.version().version_compare('>= 1.13')) pre_args += '-DHAVE_DRI3_MODIFIERS' endif + dep_xcb_shm = dependency('xcb-shm') dep_xcb_sync = dependency('xcb-sync') dep_xshmfence = dependency('xshmfence', version : '>= 1.1') endif @@ -2187,6 +2254,9 @@ if with_egl if with_dri3 egl_drivers += 'builtin:egl_dri3' endif + if with_platform_windows + egl_drivers += 'builtin:wgl' + endif lines += 'EGL drivers: ' + ' '.join(egl_drivers) endif if with_egl or with_any_vk diff --git a/mesa 3D driver/meson_options.txt b/mesa 3D driver/meson_options.txt index 29c402c484..32c7593ee8 100644 --- a/mesa 3D driver/meson_options.txt +++ b/mesa 3D driver/meson_options.txt @@ -482,7 +482,7 @@ option( 'platform-sdk-version', type : 'integer', min : 25, - max : 30, + max : 31, value : 25, description : 'Android Platform SDK version. Default: Nougat version.' ) @@ -524,3 +524,9 @@ option( value : '', description : 'Enable a custom shader replacement mechanism. Note that enabling this option requires adding/generating a shader_replacement.h file that can be included (see shaderapi.c).' ) +option( + 'vmware-mks-stats', + type : 'boolean', + value : false, + description : 'Build gallium VMware/svga driver with mksGuestStats instrumentation.' +) diff --git a/mesa 3D driver/src/amd/addrlib/inc/addrinterface.h b/mesa 3D driver/src/amd/addrlib/inc/addrinterface.h index 1b03548b96..47f6b5d148 100644 --- a/mesa 3D driver/src/amd/addrlib/inc/addrinterface.h +++ b/mesa 3D driver/src/amd/addrlib/inc/addrinterface.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/inc/addrtypes.h b/mesa 3D driver/src/amd/addrlib/inc/addrtypes.h index 55141ab67c..63c7f0af8a 100644 --- a/mesa 3D driver/src/amd/addrlib/inc/addrtypes.h +++ b/mesa 3D driver/src/amd/addrlib/inc/addrtypes.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/addrinterface.cpp b/mesa 3D driver/src/amd/addrlib/src/addrinterface.cpp index b2d032ef7d..9e7ca698b8 100644 --- a/mesa 3D driver/src/amd/addrlib/src/addrinterface.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/addrinterface.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/amdgpu_asic_addr.h b/mesa 3D driver/src/amd/addrlib/src/amdgpu_asic_addr.h index d125bdad05..1133fd02f4 100644 --- a/mesa 3D driver/src/amd/addrlib/src/amdgpu_asic_addr.h +++ b/mesa 3D driver/src/amd/addrlib/src/amdgpu_asic_addr.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2017-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +*********************************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +*********************************************************************************************************************** +*/ #ifndef _AMDGPU_ASIC_ADDR_H #define _AMDGPU_ASIC_ADDR_H diff --git a/mesa 3D driver/src/amd/addrlib/src/chip/r800/si_gb_reg.h b/mesa 3D driver/src/amd/addrlib/src/chip/r800/si_gb_reg.h index 1a02335f80..4c8a453569 100644 --- a/mesa 3D driver/src/amd/addrlib/src/chip/r800/si_gb_reg.h +++ b/mesa 3D driver/src/amd/addrlib/src/chip/r800/si_gb_reg.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ #if !defined (__SI_GB_REG_H__) #define __SI_GB_REG_H__ diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrcommon.h b/mesa 3D driver/src/amd/addrlib/src/core/addrcommon.h index 1871c737b7..83f40ff2eb 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrcommon.h +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrcommon.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.cpp b/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.cpp index 9279aff71f..d3b1e8eb92 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.h b/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.h index 519e194f3d..1e7ff12f6c 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.h +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrelemlib.h @@ -1,28 +1,30 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ + /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrlib.cpp b/mesa 3D driver/src/amd/addrlib/src/core/addrlib.cpp index 1ab885d658..38ede59ec8 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrlib.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrlib.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrlib.h b/mesa 3D driver/src/amd/addrlib/src/core/addrlib.h index 79895ac189..1c121befb1 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrlib.h +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrlib.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.cpp b/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.cpp index 1622c5b6b0..f55afe3569 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.h b/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.h index 07bc54ac86..fb31ea71e1 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.h +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrlib1.h @@ -1,28 +1,30 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ + /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.cpp b/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.cpp index 8bea9ecf91..e632ba534d 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.cpp @@ -1,28 +1,30 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +************************************************************************************************************************ +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +************************************************************************************************************************ +*/ + /** ************************************************************************************************************************ diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.h b/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.h index d3032f5eab..9f8918f8da 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.h +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrlib2.h @@ -1,28 +1,30 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +************************************************************************************************************************ +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +************************************************************************************************************************ +*/ + /** ************************************************************************************************************************ diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrobject.cpp b/mesa 3D driver/src/amd/addrlib/src/core/addrobject.cpp index 7e2510755a..63b72f885c 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrobject.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrobject.cpp @@ -1,28 +1,30 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ + /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/addrobject.h b/mesa 3D driver/src/amd/addrlib/src/core/addrobject.h index 5e2233c7a0..5dabfcbaf7 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/addrobject.h +++ b/mesa 3D driver/src/amd/addrlib/src/core/addrobject.h @@ -1,28 +1,30 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ + /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/core/coord.cpp b/mesa 3D driver/src/amd/addrlib/src/core/coord.cpp index 3cf066daa9..4848f955ba 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/coord.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/core/coord.cpp @@ -1,29 +1,30 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ // Coordinate class implementation #include "addrcommon.h" diff --git a/mesa 3D driver/src/amd/addrlib/src/core/coord.h b/mesa 3D driver/src/amd/addrlib/src/core/coord.h index 13efc9a0dd..cbdbb4cc3b 100644 --- a/mesa 3D driver/src/amd/addrlib/src/core/coord.h +++ b/mesa 3D driver/src/amd/addrlib/src/core/coord.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ // Class used to define a coordinate bit diff --git a/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h b/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h index 49d3b2f667..ea022021a0 100644 --- a/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h +++ b/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10SwizzlePattern.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +************************************************************************************************************************ +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +************************************************************************************************************************ +*/ /** ************************************************************************************************************************ @@ -3739,7 +3740,6 @@ const ADDR_SW_PATINFO GFX10_SW_VAR_Z_X_8xaa_RBPLUS_PATINFO[] = { 3, 27, 344, 365, 124, } , // 64 pipes (32 PKRs) 16 bpe @ SW_VAR_Z_X 8xaa @ RbPlus }; - const UINT_64 GFX10_SW_PATTERN_NIBBLE01[][8] = { {X0, X1, X2, X3, Y0, Y1, Y2, Y3, }, // 0 @@ -5849,7 +5849,6 @@ const UINT_8 GFX10_CMASK_VAR_RBPLUS_PATIDX[] = 31, // 64 pipes (32 PKRs) 8 bpe pa @ CMASK_VAR @ RbPlus }; - const UINT_64 GFX10_DCC_64K_R_X_SW_PATTERN[][17] = { {0, X4, Y4, X5, Y5, X6, Y6, X7, Y7, X8, Y8, X9, Y9, 0, 0, 0, 0, }, //0 diff --git a/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp b/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp index 14ce04d379..81675288bd 100644 --- a/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +************************************************************************************************************************ +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +************************************************************************************************************************ +*/ /** ************************************************************************************************************************ @@ -66,46 +67,46 @@ namespace V2 const SwizzleModeFlags Gfx10Lib::SwizzleModeTable[ADDR_SW_MAX_TYPE] = {//Linear 256B 4KB 64KB Var Z Std Disp Rot XOR T RtOpt Reserved - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR - {0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_256B_S - {0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_256B_D - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {{1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_LINEAR + {{0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_256B_S + {{0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}}, // ADDR_SW_256B_D + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_S - {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_D - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_4KB_S + {{0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}}, // ADDR_SW_4KB_D + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_S - {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_D - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_64KB_S + {{0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}}, // ADDR_SW_64KB_D + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_S_T - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_D_T - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}}, // ADDR_SW_64KB_S_T + {{0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0}}, // ADDR_SW_64KB_D_T + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_S_X - {0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_D_X - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + 
{{0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_4KB_S_X + {{0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR_SW_4KB_D_X + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved - {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_Z_X - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_S_X - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_D_X - {0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0}, // ADDR_SW_64KB_R_X + {{0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_Z_X + {{0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_S_X + {{0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_D_X + {{0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0}}, // ADDR_SW_64KB_R_X - {0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_VAR_Z_X - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0}, // ADDR_SW_VAR_R_X - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR_GENERAL + {{0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_VAR_Z_X + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0}}, // ADDR_SW_VAR_R_X + {{1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_LINEAR_GENERAL }; const Dim3d Gfx10Lib::Block256_3d[] = {{8, 4, 8}, {4, 4, 8}, {4, 4, 4}, {4, 2, 4}, {2, 2, 4}}; @@ -611,7 +612,6 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeCmaskAddrFromCoord( (pIn->swizzleMode == ADDR_SW_VAR_Z_X) ? GFX10_CMASK_VAR_RBPLUS_PATIDX : (m_settings.supportRbPlus ? GFX10_CMASK_64K_RBPLUS_PATIDX : GFX10_CMASK_64K_PATIDX); - const UINT_32 blkSizeLog2 = Log2(output.metaBlkWidth) + Log2(output.metaBlkHeight) - 7; const UINT_32 blkMask = (1 << blkSizeLog2) - 1; const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(GFX10_CMASK_SW_PATTERN[patIdxTable[index]], @@ -680,7 +680,6 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeHtileAddrFromCoord( const UINT_32 index = m_xmaskBaseIndex + numSampleLog2; const UINT_8* patIdxTable = m_settings.supportRbPlus ? GFX10_HTILE_RBPLUS_PATIDX : GFX10_HTILE_PATIDX; - const UINT_32 blkSizeLog2 = Log2(output.metaBlkWidth) + Log2(output.metaBlkHeight) - 4; const UINT_32 blkMask = (1 << blkSizeLog2) - 1; const UINT_32 blkOffset = ComputeOffsetFromSwizzlePattern(GFX10_HTILE_SW_PATTERN[patIdxTable[index]], @@ -992,7 +991,6 @@ BOOL_32 Gfx10Lib::HwlInitGlobalParams( m_blockVarSizeLog2 = m_pipesLog2 + 14; } - if (valid) { InitEquationTable(); @@ -2511,7 +2509,6 @@ BOOL_32 Gfx10Lib::ValidateNonSwModeParams( const BOOL_32 tex1d = IsTex1d(rsrcType); const BOOL_32 stereo = flags.qbStereo; - // Resource type check if (tex1d) { @@ -2640,7 +2637,6 @@ BOOL_32 Gfx10Lib::ValidateSwModeParams( ADDR_ASSERT_ALWAYS(); valid = FALSE; } - } else if (tex3d) { @@ -2956,7 +2952,6 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlGetPreferredSurfaceSetting( case ADDR_RSRC_TEX_2D: allowedSwModeSet.value &= pIn->flags.prt ? Gfx10Rsrc2dPrtSwModeMask : Gfx10Rsrc2dSwModeMask; - break; case ADDR_RSRC_TEX_3D: @@ -3522,7 +3517,6 @@ ADDR_E_RETURNCODE Gfx10Lib::HwlComputeSurfaceInfoTiled( return ret; } - /** ************************************************************************************************************************ * Gfx10Lib::ComputeSurfaceInfoMicroTiled @@ -4329,7 +4323,6 @@ const ADDR_SW_PATINFO* Gfx10Lib::GetSwizzlePatternInfo( return (patInfo != NULL) ? 
&patInfo[index] : NULL; } - /** ************************************************************************************************************************ * Gfx10Lib::ComputeSurfaceAddrFromCoordMicroTiled diff --git a/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.h b/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.h index 6303c0801b..c23c67c14d 100644 --- a/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.h +++ b/mesa 3D driver/src/amd/addrlib/src/gfx10/gfx10addrlib.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +************************************************************************************************************************ +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +************************************************************************************************************************ +*/ /** ************************************************************************************************************************ @@ -355,7 +356,6 @@ class Gfx10Lib : public Lib const ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT* pIn, ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT* pOut) const; - UINT_32 ComputeOffsetFromSwizzlePattern( const UINT_64* pPattern, UINT_32 numBits, diff --git a/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp b/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp index ca3d29c239..45ac1f2990 100644 --- a/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** ************************************************************************************************************************ @@ -68,46 +69,46 @@ namespace V2 const SwizzleModeFlags Gfx9Lib::SwizzleModeTable[ADDR_SW_MAX_TYPE] = {//Linear 256B 4KB 64KB Var Z Std Disp Rot XOR T RtOpt Reserved - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR - {0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_256B_S - {0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_256B_D - {0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_256B_R + {{1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_LINEAR + {{0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_256B_S + {{0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}}, // ADDR_SW_256B_D + {{0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}}, // ADDR_SW_256B_R - {0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_Z - {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_S - {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_4KB_D - {0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_4KB_R + {{0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_4KB_Z + {{0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_4KB_S + {{0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}}, // ADDR_SW_4KB_D + {{0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}}, // ADDR_SW_4KB_R - {0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_Z - {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_S - {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}, // ADDR_SW_64KB_D - {0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0}, // ADDR_SW_64KB_R + {{0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_64KB_Z + {{0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_64KB_S + {{0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0}}, // ADDR_SW_64KB_D + {{0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0}}, // ADDR_SW_64KB_R - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved - {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_Z_T - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_S_T - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0}, // ADDR_SW_64KB_D_T - {0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0}, // ADDR_SW_64KB_R_T + {{0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0}}, // ADDR_SW_64KB_Z_T + {{0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}}, // ADDR_SW_64KB_S_T + {{0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0}}, // ADDR_SW_64KB_D_T + {{0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0}}, // ADDR_SW_64KB_R_T - {0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_Z_x - {0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_S_x - {0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_4KB_D_x - {0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0}, // ADDR_SW_4KB_R_x + {{0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_4KB_Z_x + {{0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_4KB_S_x + {{0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR_SW_4KB_D_x 
+ {{0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0}}, // ADDR_SW_4KB_R_x - {0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_Z_X - {0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_S_X - {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0}, // ADDR_SW_64KB_D_X - {0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0}, // ADDR_SW_64KB_R_X + {{0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_Z_X + {{0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_S_X + {{0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0}}, // ADDR_SW_64KB_D_X + {{0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0}}, // ADDR_SW_64KB_R_X - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Reserved - {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // ADDR_SW_LINEAR_GENERAL + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // Reserved + {{1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, // ADDR_SW_LINEAR_GENERAL }; const UINT_32 Gfx9Lib::MipTailOffset256B[] = {2048, 1024, 512, 256, 128, 64, 32, 16, 8, 6, 5, 4, 3, 2, 1, 0}; diff --git a/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.h b/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.h index 196764aa21..6606786220 100644 --- a/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.h +++ b/mesa 3D driver/src/amd/addrlib/src/gfx9/gfx9addrlib.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** ************************************************************************************************************************ diff --git a/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.cpp b/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.cpp index 51718ed1a5..d6a9ae932c 100644 --- a/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.h b/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.h index 550127029c..c8e93ec54f 100644 --- a/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.h +++ b/mesa 3D driver/src/amd/addrlib/src/r800/ciaddrlib.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.cpp b/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.cpp index 84c7b20232..72e02c2028 100644 --- a/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** * @file egbaddrlib.cpp diff --git a/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.h b/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.h index 55e53540e9..9209cc07e3 100644 --- a/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.h +++ b/mesa 3D driver/src/amd/addrlib/src/r800/egbaddrlib.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.cpp b/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.cpp index 59c481f642..0ada14db6a 100644 --- a/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.cpp +++ b/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.cpp @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.h b/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.h index 24d49dcced..a283c228b4 100644 --- a/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.h +++ b/mesa 3D driver/src/amd/addrlib/src/r800/siaddrlib.h @@ -1,28 +1,29 @@ -/* - * Copyright © 2007-2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ +/** +**************************************************************************************************** +* +* Copyright © 2007-2021 Advanced Micro Devices, Inc. +* All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE +* +**************************************************************************************************** +*/ /** **************************************************************************************************** diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-bonaire-aco-skips.txt b/mesa 3D driver/src/amd/ci/deqp-radv-bonaire-aco-skips.txt index 1183781734..1238ee0722 100644 --- a/mesa 3D driver/src/amd/ci/deqp-radv-bonaire-aco-skips.txt +++ b/mesa 3D driver/src/amd/ci/deqp-radv-bonaire-aco-skips.txt @@ -1,12 +1,2 @@ -# Exclude this test which might fail when a new extension is implemented. -dEQP-VK.info.device_extensions - -# Exclude WSI related tests. -dEQP-VK.image.swapchain_mutable.* -dEQP-VK.wsi.* - # This subset of CTS randomly hangs but it's fine when using only one thread. dEQP-VK.synchronization.* - -# Exclude this test which timeout most of the time. -dEQP-VK.memory.pipeline_barrier.transfer_src_transfer_dst.1048576 diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-hawaii-aco-skips.txt b/mesa 3D driver/src/amd/ci/deqp-radv-hawaii-aco-skips.txt index 1183781734..1238ee0722 100644 --- a/mesa 3D driver/src/amd/ci/deqp-radv-hawaii-aco-skips.txt +++ b/mesa 3D driver/src/amd/ci/deqp-radv-hawaii-aco-skips.txt @@ -1,12 +1,2 @@ -# Exclude this test which might fail when a new extension is implemented. -dEQP-VK.info.device_extensions - -# Exclude WSI related tests. -dEQP-VK.image.swapchain_mutable.* -dEQP-VK.wsi.* - # This subset of CTS randomly hangs but it's fine when using only one thread. dEQP-VK.synchronization.* - -# Exclude this test which timeout most of the time. -dEQP-VK.memory.pipeline_barrier.transfer_src_transfer_dst.1048576 diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-flakes.txt b/mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-flakes.txt new file mode 100644 index 0000000000..89c1b98364 --- /dev/null +++ b/mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-flakes.txt @@ -0,0 +1,2 @@ +dEQP-VK.api.object_management.multithreaded_per_thread_device.image_2d +dEQP-VK.api.object_management.multithreaded_per_thread_resources.image_2d diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-skips.txt b/mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-skips.txt index 77f7620934..cbbe39a7b3 100644 --- a/mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-skips.txt +++ b/mesa 3D driver/src/amd/ci/deqp-radv-raven-aco-skips.txt @@ -1,13 +1,3 @@ -# Exclude this test which might fail when a new extension is implemented. 
-dEQP-VK.info.device_extensions - -# Exclude WSI related tests. -dEQP-VK.image.swapchain_mutable.* -dEQP-VK.wsi.* - -# This subset of CTS seems to randomly hangs on RAVEN only. +# This subset of CTS seems to randomly hang on RAVEN and BONAIRE. # This needs to be investigated and fixed! dEQP-VK.synchronization.* - -# Exclude this test which timeout most of the time. -dEQP-VK.memory.pipeline_barrier.transfer_src_transfer_dst.1048576 diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-renoir-aco-flakes.txt b/mesa 3D driver/src/amd/ci/deqp-radv-renoir-aco-flakes.txt new file mode 100644 index 0000000000..89c1b98364 --- /dev/null +++ b/mesa 3D driver/src/amd/ci/deqp-radv-renoir-aco-flakes.txt @@ -0,0 +1,2 @@ +dEQP-VK.api.object_management.multithreaded_per_thread_device.image_2d +dEQP-VK.api.object_management.multithreaded_per_thread_resources.image_2d diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-skips.txt b/mesa 3D driver/src/amd/ci/deqp-radv-skips.txt new file mode 100644 index 0000000000..521123c773 --- /dev/null +++ b/mesa 3D driver/src/amd/ci/deqp-radv-skips.txt @@ -0,0 +1,9 @@ +# Exclude this test which might fail when a new extension is implemented. +dEQP-VK.info.device_extensions + +# Exclude WSI related tests. +dEQP-VK.image.swapchain_mutable.* +dEQP-VK.wsi.* + +# Exclude this test which timeout most of the time. +dEQP-VK.memory.pipeline_barrier.transfer_src_transfer_dst.1048576 diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-stoney-aco-skips.txt b/mesa 3D driver/src/amd/ci/deqp-radv-stoney-aco-skips.txt index 3e8b73fa7e..ca8cbdcc0e 100644 --- a/mesa 3D driver/src/amd/ci/deqp-radv-stoney-aco-skips.txt +++ b/mesa 3D driver/src/amd/ci/deqp-radv-stoney-aco-skips.txt @@ -1,13 +1,3 @@ -# Exclude this test which might fail when a new extension is implemented. -dEQP-VK.info.device_extensions - -# Exclude WSI related tests. -dEQP-VK.image.swapchain_mutable.* -dEQP-VK.wsi.* - -# Exclude this test which timeout most of the time. -dEQP-VK.memory.pipeline_barrier.transfer_src_transfer_dst.1048576 - # These tests take too long to run on the current STONEY testing hardware, skip them. 
dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_clamp dEQP-VK.texture.explicit_lod.2d.sizes.128x128_linear_linear_mipmap_linear_repeat diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-vangogh-aco-fails.txt b/mesa 3D driver/src/amd/ci/deqp-radv-vangogh-aco-fails.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mesa 3D driver/src/amd/ci/deqp-radv-vega10-aco-flakes.txt b/mesa 3D driver/src/amd/ci/deqp-radv-vega10-aco-flakes.txt new file mode 100644 index 0000000000..89c1b98364 --- /dev/null +++ b/mesa 3D driver/src/amd/ci/deqp-radv-vega10-aco-flakes.txt @@ -0,0 +1,2 @@ +dEQP-VK.api.object_management.multithreaded_per_thread_device.image_2d +dEQP-VK.api.object_management.multithreaded_per_thread_resources.image_2d diff --git a/mesa 3D driver/src/amd/ci/gitlab-ci.yml b/mesa 3D driver/src/amd/ci/gitlab-ci.yml index 91980cdfbe..a75bb52ed8 100644 --- a/mesa 3D driver/src/amd/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/amd/ci/gitlab-ci.yml @@ -4,6 +4,7 @@ - .radv-rules variables: VK_DRIVER: radeon + DRIVER_NAME: radv ACO_DEBUG: validateir,validatera MESA_VK_IGNORE_CONFORMANCE_WARNING: 1 @@ -50,7 +51,7 @@ radv_polaris10_vkcts: - .test-radv - .test-manual variables: - GPU_VERSION: radv-polaris10 + GPU_VERSION: radv-polaris10-aco tags: - polaris10 @@ -63,7 +64,7 @@ radv_stoney_vkcts:amd64: variables: DEQP_VER: vk DEQP_FRACTION: 10 - DEQP_PARALLEL: 4 + FDO_CI_CONCURRENT: 4 DEQP_EXPECTED_RENDERER: STONEY DEVICE_TYPE: hp-11A-G6-EE-grunt DTB: "" @@ -73,6 +74,7 @@ radv_stoney_vkcts:amd64: HWCI_KERNEL_MODULES: amdgpu HWCI_FREQ_MAX: "true" VK_DRIVER: radeon + DRIVER_NAME: radv tags: - mesa-ci-x86-64-lava-hp-11A-G6-EE-grunt diff --git a/mesa 3D driver/src/amd/common/ac_debug.c b/mesa 3D driver/src/amd/common/ac_debug.c index 6cd04b0d9e..a60107f215 100644 --- a/mesa 3D driver/src/amd/common/ac_debug.c +++ b/mesa 3D driver/src/amd/common/ac_debug.c @@ -42,6 +42,8 @@ #include #include +DEBUG_GET_ONCE_BOOL_OPTION(color, "AMD_COLOR", true); + /* Parsed IBs are difficult to read without colors. Use "less -R file" to * read them, or use "aha -b -f file" to convert them to html. */ @@ -51,6 +53,12 @@ #define COLOR_YELLOW "\033[1;33m" #define COLOR_CYAN "\033[1;36m" +#define O_COLOR_RESET (debug_get_option_color() ? COLOR_RESET : "") +#define O_COLOR_RED (debug_get_option_color() ? COLOR_RED : "") +#define O_COLOR_GREEN (debug_get_option_color() ? COLOR_GREEN : "") +#define O_COLOR_YELLOW (debug_get_option_color() ? COLOR_YELLOW : "") +#define O_COLOR_CYAN (debug_get_option_color() ? 
COLOR_CYAN : "") + #define INDENT_PKT 8 struct ac_ib_parser { @@ -95,7 +103,9 @@ static void print_value(FILE *file, uint32_t value, int bits) static void print_named_value(FILE *file, const char *name, uint32_t value, int bits) { print_spaces(file, INDENT_PKT); - fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", name); + fprintf(file, "%s%s%s <- ", + O_COLOR_YELLOW, name, + O_COLOR_RESET); print_value(file, value, bits); } @@ -157,7 +167,9 @@ void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32 bool first_field = true; print_spaces(file, INDENT_PKT); - fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", reg_name); + fprintf(file, "%s%s%s <- ", + O_COLOR_YELLOW, reg_name, + O_COLOR_RESET); if (!reg->num_fields) { print_value(file, value, 32); @@ -190,7 +202,9 @@ void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32 } print_spaces(file, INDENT_PKT); - fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value); + fprintf(file, "%s0x%05x%s <- 0x%08x\n", + O_COLOR_YELLOW, offset, + O_COLOR_RESET, value); } static uint32_t ac_ib_get(struct ac_ib_parser *ib) @@ -208,7 +222,8 @@ static uint32_t ac_ib_get(struct ac_ib_parser *ib) * and radeon_emit is performance sensitive... */ if (VALGRIND_CHECK_VALUE_IS_DEFINED(v)) - fprintf(ib->f, COLOR_RED "Valgrind: The next DWORD is garbage" COLOR_RESET "\n"); + fprintf(ib->f, "%sValgrind: The next DWORD is garbage%s\n", + debug_get_option_color() ? COLOR_RED : "", O_COLOR_RESET); #endif fprintf(ib->f, "\n\035#%08x ", v); } else { @@ -255,11 +270,11 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib, if (op == PKT3_SET_CONTEXT_REG || op == PKT3_SET_CONFIG_REG || op == PKT3_SET_UCONFIG_REG || op == PKT3_SET_UCONFIG_REG_INDEX || op == PKT3_SET_SH_REG) - fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n", name, predicate); + fprintf(f, "%s%s%s%s:\n", O_COLOR_CYAN, name, predicate, O_COLOR_RESET); else - fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n", name, predicate); + fprintf(f, "%s%s%s%s:\n", O_COLOR_GREEN, name, predicate, O_COLOR_RESET); } else - fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n", op, predicate); + fprintf(f, "%sPKT3_UNKNOWN 0x%x%s%s:\n", O_COLOR_RED, op, predicate, O_COLOR_RESET); /* Print the contents. */ switch (op) { @@ -459,7 +474,7 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib, unsigned packet_id = AC_GET_TRACE_POINT_ID(ib->ib[ib->cur_dw]); print_spaces(f, INDENT_PKT); - fprintf(f, COLOR_RED "Trace point ID: %u\n", packet_id); + fprintf(f, "%sTrace point ID: %u%s\n", O_COLOR_RED, packet_id, O_COLOR_RESET); if (!ib->trace_id_count) break; /* tracing was disabled */ @@ -467,17 +482,22 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib, *current_trace_id = packet_id; print_spaces(f, INDENT_PKT); - if (packet_id < *ib->trace_ids) - fprintf(f, COLOR_RED "This trace point was reached by the CP." COLOR_RESET "\n"); - else if (packet_id == *ib->trace_ids) - fprintf(f, COLOR_RED "!!!!! This is the last trace point that " - "was reached by the CP !!!!!" COLOR_RESET "\n"); - else if (packet_id + 1 == *ib->trace_ids) - fprintf(f, COLOR_RED "!!!!! This is the first trace point that " - "was NOT been reached by the CP !!!!!" COLOR_RESET "\n"); - else - fprintf(f, COLOR_RED "!!!!! This trace point was NOT reached " - "by the CP !!!!!" 
COLOR_RESET "\n"); + if (packet_id < *ib->trace_ids) { + fprintf(f, "%sThis trace point was reached by the CP.%s\n", + O_COLOR_RED, O_COLOR_RESET); + } else if (packet_id == *ib->trace_ids) { + fprintf(f, "%s!!!!! This is the last trace point that " + "was reached by the CP !!!!!%s\n", + O_COLOR_RED, O_COLOR_RESET); + } else if (packet_id + 1 == *ib->trace_ids) { + fprintf(f, "%s!!!!! This is the first trace point that " + "was NOT been reached by the CP !!!!!%s\n", + O_COLOR_RED, O_COLOR_RESET); + } else { + fprintf(f, "%s!!!!! This trace point was NOT reached " + "by the CP !!!!!%s\n", + O_COLOR_RED, O_COLOR_RESET); + } break; } break; @@ -488,7 +508,8 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib, ac_ib_get(ib); if (ib->cur_dw > first_dw + count + 1) - fprintf(f, COLOR_RED " !!!!! count in header too low !!!!!" COLOR_RESET "\n"); + fprintf(f, "%s !!!!! count in header too low !!!!!%s\n", + O_COLOR_RED, O_COLOR_RESET); } /** @@ -509,7 +530,8 @@ static void ac_do_parse_ib(FILE *f, struct ac_ib_parser *ib) case 2: /* type-2 nop */ if (header == 0x80000000) { - fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n"); + fprintf(f, "%sNOP (type 2)%s\n", + O_COLOR_GREEN, O_COLOR_RESET); break; } FALLTHROUGH; diff --git a/mesa 3D driver/src/amd/common/ac_exp_param.h b/mesa 3D driver/src/amd/common/ac_exp_param.h index ac8018c0b3..ce7a0cc9f1 100644 --- a/mesa 3D driver/src/amd/common/ac_exp_param.h +++ b/mesa 3D driver/src/amd/common/ac_exp_param.h @@ -35,7 +35,7 @@ enum AC_EXP_PARAM_DEFAULT_VAL_0001, AC_EXP_PARAM_DEFAULT_VAL_1110, AC_EXP_PARAM_DEFAULT_VAL_1111, - AC_EXP_PARAM_UNDEFINED = 255, + AC_EXP_PARAM_UNDEFINED = 255, /* deprecated, use AC_EXP_PARAM_DEFAULT_VAL_0000 instead */ }; #endif diff --git a/mesa 3D driver/src/amd/common/ac_gpu_info.c b/mesa 3D driver/src/amd/common/ac_gpu_info.c index 7bb4fbb08f..0d4f349bd0 100644 --- a/mesa 3D driver/src/amd/common/ac_gpu_info.c +++ b/mesa 3D driver/src/amd/common/ac_gpu_info.c @@ -32,6 +32,7 @@ #include "util/u_math.h" #include +#include #ifdef _WIN32 #define DRM_CAP_ADDFB2_MODIFIERS 0x10 @@ -553,10 +554,6 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, info->all_vram_visible = info->vram_size * 0.9 < info->vram_vis_size; util_cpu_detect(); - info->smart_access_memory = info->all_vram_visible && - info->chip_class >= GFX10_3 && - util_get_cpu_caps()->family >= CPU_AMD_ZEN3 && - util_get_cpu_caps()->family < CPU_AMD_LAST; /* Set chip identification. */ info->pci_id = amdinfo->asic_id; /* TODO: is this correct? 
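[Editorial sketch] The ac_debug.c hunks above replace compile-time ANSI color literals with runtime-selected strings, so an IB dump redirected to a file or CI log can be de-colored with AMD_COLOR=0 (and, in passing, they fix the SET_*_REG path, which previously ended the line with COLOR_CYAN instead of a reset). A minimal self-contained sketch of the same pattern follows; the one-shot caching mimics mesa's DEBUG_GET_ONCE_BOOL_OPTION, but the simplified parser here (anything other than a leading '0' counts as true) and the abbreviated escape codes are assumptions, not mesa's exact rules.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool color_enabled(void)
{
   static int cached = -1;   /* one-time lookup, like DEBUG_GET_ONCE_BOOL_OPTION */
   if (cached < 0) {
      const char *env = getenv("AMD_COLOR");
      cached = (env == NULL || env[0] != '0');   /* simplified parse; default on */
   }
   return cached;
}

#define O_COLOR_RED   (color_enabled() ? "\033[31m" : "")
#define O_COLOR_RESET (color_enabled() ? "\033[0m"  : "")

int main(void)
{
   /* With AMD_COLOR=0 both macros expand to "", keeping piped logs clean. */
   printf("%sPKT3_UNKNOWN 0x7f%s:\n", O_COLOR_RED, O_COLOR_RESET);
   return 0;
}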
*/ @@ -636,6 +633,10 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, return false; } + memset(info->lowercase_name, 0, sizeof(info->lowercase_name)); + for (unsigned i = 0; info->name[i] && i < ARRAY_SIZE(info->lowercase_name) - 1; i++) + info->lowercase_name[i] = tolower(info->name[i]); + if (info->family >= CHIP_SIENNA_CICHLID) info->chip_class = GFX10_3; else if (info->family >= CHIP_NAVI10) @@ -653,6 +654,11 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, return false; } + info->smart_access_memory = info->all_vram_visible && + info->chip_class >= GFX10_3 && + util_get_cpu_caps()->family >= CPU_AMD_ZEN3 && + util_get_cpu_caps()->family < CPU_AMD_LAST; + info->family_id = amdinfo->family_id; info->chip_external_rev = amdinfo->chip_external_rev; info->marketing_name = amdgpu_get_marketing_name(dev); @@ -869,6 +875,13 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, /* Whether chips support double rate packed math instructions. */ info->has_packed_math_16bit = info->chip_class >= GFX9; + /* Whether chips support dot product instructions. A subset of these support a smaller + * instruction encoding which accumulates with the destination. + */ + info->has_accelerated_dot_product = + info->family == CHIP_ARCTURUS || info->family == CHIP_ALDEBARAN || + info->family == CHIP_VEGA20 || info->family >= CHIP_NAVI12; + /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */ info->has_load_ctx_reg_pkt = info->chip_class >= GFX9 || (info->chip_class >= GFX8 && info->me_fw_feature >= 41); @@ -900,9 +913,7 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, */ info->has_two_planes_iterate256_bug = info->chip_class == GFX10; - /* GE has a bug when a legacy GS draw follows an NGG draw and it requires - * a VGT_FLUSH to fix that. - */ + /* GFX10+Sienna: NGG->legacy transitions require VGT_FLUSH. */ info->has_vgt_flush_ngg_legacy_bug = info->chip_class == GFX10 || info->family == CHIP_SIENNA_CICHLID; @@ -1127,6 +1138,7 @@ void ac_print_gpu_info(struct radeon_info *info, FILE *f) info->pci_dev, info->pci_func); fprintf(f, " name = %s\n", info->name); + fprintf(f, " lowercase_name = %s\n", info->lowercase_name); fprintf(f, " marketing_name = %s\n", info->marketing_name); fprintf(f, " is_pro_graphics = %u\n", info->is_pro_graphics); fprintf(f, " pci_id = 0x%x\n", info->pci_id); diff --git a/mesa 3D driver/src/amd/common/ac_gpu_info.h b/mesa 3D driver/src/amd/common/ac_gpu_info.h index c928677369..e31b67022f 100644 --- a/mesa 3D driver/src/amd/common/ac_gpu_info.h +++ b/mesa 3D driver/src/amd/common/ac_gpu_info.h @@ -48,6 +48,7 @@ struct radeon_info { /* Device info. 
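[Editorial sketch] The has_accelerated_dot_product flag added above marks the families (Vega 20, Arcturus, Aldebaran, Navi 12 and newer) whose ALUs implement packed dot products such as v_dot4_u32_u8; the NGG lowering later in this patch consumes the same capability through NIR's has_dot_4x8 option and nir_udot_4x8_uadd. As a reference point, the scalar semantics of that operation:

#include <stdint.h>

/* udot_4x8_uadd: four unsigned 8-bit products summed into an accumulator,
 * mirroring the documented behaviour of v_dot4_u32_u8. */
static uint32_t udot_4x8_uadd(uint32_t a, uint32_t b, uint32_t acc)
{
   for (unsigned i = 0; i < 4; i++)
      acc += ((a >> (8 * i)) & 0xff) * ((b >> (8 * i)) & 0xff);
   return acc;
}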
*/ const char *name; + char lowercase_name[32]; const char *marketing_name; bool is_pro_graphics; uint32_t pci_id; @@ -70,6 +71,7 @@ struct radeon_info { bool has_load_ctx_reg_pkt; bool has_out_of_order_rast; bool has_packed_math_16bit; + bool has_accelerated_dot_product; bool cpdma_prefetch_writes_memory; bool has_gfx9_scissor_bug; bool has_tc_compat_zrange_bug; diff --git a/mesa 3D driver/src/amd/common/ac_nir.h b/mesa 3D driver/src/amd/common/ac_nir.h index 470749949d..16db749bae 100644 --- a/mesa 3D driver/src/amd/common/ac_nir.h +++ b/mesa 3D driver/src/amd/common/ac_nir.h @@ -91,26 +91,19 @@ bool ac_nir_lower_indirect_derefs(nir_shader *shader, enum chip_class chip_class); -typedef struct -{ - unsigned lds_bytes_if_culling_off; - bool can_cull; - bool passthrough; - bool early_prim_export; - uint64_t nggc_inputs_read_by_pos; - uint64_t nggc_inputs_read_by_others; -} ac_nir_ngg_config; - -ac_nir_ngg_config +void ac_nir_lower_ngg_nogs(nir_shader *shader, unsigned max_num_es_vertices, unsigned num_vertices_per_primitive, unsigned max_workgroup_size, unsigned wave_size, - bool consider_culling, - bool consider_passthrough, + bool can_cull, + bool early_prim_export, + bool passthrough, bool export_prim_id, - bool provoking_vtx_last); + bool provoking_vtx_last, + bool use_edgeflags, + uint32_t instance_rate_inputs); void ac_nir_lower_ngg_gs(nir_shader *shader, diff --git a/mesa 3D driver/src/amd/common/ac_nir_cull.c b/mesa 3D driver/src/amd/common/ac_nir_cull.c index a5adf2a461..26e1f6515f 100644 --- a/mesa 3D driver/src/amd/common/ac_nir_cull.c +++ b/mesa 3D driver/src/amd/common/ac_nir_cull.c @@ -75,7 +75,14 @@ cull_face(nir_builder *b, nir_ssa_def *pos[3][4], const position_w_info *w_info) nir_ssa_def *cull_front = nir_build_load_cull_front_face_enabled_amd(b); nir_ssa_def *cull_back = nir_build_load_cull_back_face_enabled_amd(b); - return nir_inot(b, nir_bcsel(b, front_facing, cull_front, cull_back)); + nir_ssa_def *face_culled = nir_bcsel(b, front_facing, cull_front, cull_back); + + /* Don't reject NaN and +/-infinity, these are tricky. + * Just trust fixed-function HW to handle these cases correctly. 
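[Editorial sketch] The cull_face() change above stops the NGG culling shader from rejecting primitives whose winding determinant is NaN or infinite; those are deliberately left for the fixed-function hardware to classify. A scalar model of the guarded test, assuming (as the surrounding pass appears to) that the determinant's sign selects the facing:

#include <math.h>
#include <stdbool.h>

static bool software_face_culled(float det, bool cull_front, bool cull_back)
{
   bool front_facing = det > 0.0f;   /* sign of the winding determinant */
   bool culled = front_facing ? cull_front : cull_back;
   /* nir_iand(face_culled, nir_fisfinite(det)): never software-cull on NaN/inf. */
   return culled && isfinite(det);
}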
+ */ + face_culled = nir_iand(b, face_culled, nir_fisfinite(b, det)); + + return nir_inot(b, face_culled); } static nir_ssa_def * diff --git a/mesa 3D driver/src/amd/common/ac_nir_lower_esgs_io_to_mem.c b/mesa 3D driver/src/amd/common/ac_nir_lower_esgs_io_to_mem.c index cd7f7cba70..0dbe7107a2 100644 --- a/mesa 3D driver/src/amd/common/ac_nir_lower_esgs_io_to_mem.c +++ b/mesa 3D driver/src/amd/common/ac_nir_lower_esgs_io_to_mem.c @@ -169,8 +169,8 @@ gs_per_vertex_input_vertex_offset_gfx9(nir_builder *b, nir_src *vertex_src) { if (nir_src_is_const(*vertex_src)) { unsigned vertex = nir_src_as_uint(*vertex_src); - return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u * 2u), - nir_imm_int(b, (vertex % 2u) * 16u), nir_imm_int(b, 16u)); + return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u), + nir_imm_int(b, (vertex & 1u) * 16u), nir_imm_int(b, 16u)); } nir_ssa_def *vertex_offset = nir_build_load_gs_vertex_offset_amd(b, .base = 0); diff --git a/mesa 3D driver/src/amd/common/ac_nir_lower_ngg.c b/mesa 3D driver/src/amd/common/ac_nir_lower_ngg.c index 75728b707f..9bf628381c 100644 --- a/mesa 3D driver/src/amd/common/ac_nir_lower_ngg.c +++ b/mesa 3D driver/src/amd/common/ac_nir_lower_ngg.c @@ -51,6 +51,7 @@ typedef struct bool passthrough; bool export_prim_id; bool early_prim_export; + bool use_edgeflags; unsigned wave_size; unsigned max_num_waves; unsigned num_vertices_per_primitives; @@ -60,6 +61,10 @@ typedef struct uint64_t inputs_needed_by_pos; uint64_t inputs_needed_by_others; + uint32_t instance_rate_inputs; + + nir_instr *compact_arg_stores[4]; + nir_intrinsic_instr *overwrite_args; } lower_ngg_nogs_state; typedef struct @@ -97,11 +102,6 @@ typedef struct { nir_variable *pos_value_replacement; } remove_extra_position_output_state; -typedef struct { - nir_ssa_def *reduction_result; - nir_ssa_def *excl_scan_result; -} wg_scan_result; - /* Per-vertex LDS layout of culling shaders */ enum { /* Position of the ES vertex (at the beginning for alignment reasons) */ @@ -134,6 +134,70 @@ typedef struct { nir_ssa_def *repacked_invocation_index; } wg_repack_result; +/** + * Computes a horizontal sum of 8-bit packed values loaded from LDS. + * + * Each lane N will sum packed bytes 0 to N-1. + * We only care about the results from up to wave_id+1 lanes. + * (Other lanes are not deactivated but their calculation is not used.) + */ +static nir_ssa_def * +summarize_repack(nir_builder *b, nir_ssa_def *packed_counts, unsigned num_lds_dwords) +{ + /* We'll use shift to filter out the bytes not needed by the current lane. + * + * Need to shift by: num_lds_dwords * 4 - lane_id (in bytes). + * However, two shifts are needed because one can't go all the way, + * so the shift amount is half that (and in bits). + * + * When v_dot4_u32_u8 is available, we right-shift a series of 0x01 bytes. + * This will yield 0x01 at wanted byte positions and 0x00 at unwanted positions, + * therefore v_dot can get rid of the unneeded values. + * This sequence is preferable because it better hides the latency of the LDS. + * + * If the v_dot instruction can't be used, we left-shift the packed bytes. + * This will shift out the unneeded bytes and shift in zeroes instead, + * then we sum them using v_sad_u8. + */ + + nir_ssa_def *lane_id = nir_load_subgroup_invocation(b); + nir_ssa_def *shift = nir_iadd_imm_nuw(b, nir_imul_imm(b, lane_id, -4u), num_lds_dwords * 16); + bool use_dot = b->shader->options->has_dot_4x8; + + if (num_lds_dwords == 1) { + nir_ssa_def *dot_op = !use_dot ? 
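[Editorial sketch] Both gs_per_vertex_input_vertex_offset_gfx9() and ngg_input_primitive_vertex_index() above switch from `.base = vertex / 2u * 2u` to `.base = vertex / 2u`: on GFX9+ each 32-bit GS vertex-offset argument packs two 16-bit offsets, and after this change the intrinsic's base is interpreted as a packed-dword index rather than a rounded-down vertex index. A sketch of the extraction, with a hypothetical array standing in for the real shader arguments:

#include <stdint.h>

static uint16_t gs_vertex_offset(const uint32_t *gs_vtx_offset_args, unsigned vertex)
{
   uint32_t packed = gs_vtx_offset_args[vertex / 2];   /* .base = vertex / 2u */
   /* nir_ubfe(packed, (vertex & 1u) * 16u, 16u): low or high 16-bit half. */
   return (uint16_t)(packed >> ((vertex & 1) * 16));
}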
NULL : nir_ushr(b, nir_ushr(b, nir_imm_int(b, 0x01010101), shift), shift); + + /* Broadcast the packed data we read from LDS (to the first 16 lanes, but we only care up to num_waves). */ + nir_ssa_def *packed = nir_build_lane_permute_16_amd(b, packed_counts, nir_imm_int(b, 0), nir_imm_int(b, 0)); + + /* Horizontally add the packed bytes. */ + if (use_dot) { + return nir_udot_4x8_uadd(b, packed, dot_op, nir_imm_int(b, 0)); + } else { + nir_ssa_def *sad_op = nir_ishl(b, nir_ishl(b, packed, shift), shift); + return nir_sad_u8x4(b, sad_op, nir_imm_int(b, 0), nir_imm_int(b, 0)); + } + } else if (num_lds_dwords == 2) { + nir_ssa_def *dot_op = !use_dot ? NULL : nir_ushr(b, nir_ushr(b, nir_imm_int64(b, 0x0101010101010101), shift), shift); + + /* Broadcast the packed data we read from LDS (to the first 16 lanes, but we only care up to num_waves). */ + nir_ssa_def *packed_dw0 = nir_build_lane_permute_16_amd(b, nir_unpack_64_2x32_split_x(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0)); + nir_ssa_def *packed_dw1 = nir_build_lane_permute_16_amd(b, nir_unpack_64_2x32_split_y(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0)); + + /* Horizontally add the packed bytes. */ + if (use_dot) { + nir_ssa_def *sum = nir_udot_4x8_uadd(b, packed_dw0, nir_unpack_64_2x32_split_x(b, dot_op), nir_imm_int(b, 0)); + return nir_udot_4x8_uadd(b, packed_dw1, nir_unpack_64_2x32_split_y(b, dot_op), sum); + } else { + nir_ssa_def *sad_op = nir_ishl(b, nir_ishl(b, nir_pack_64_2x32_split(b, packed_dw0, packed_dw1), shift), shift); + nir_ssa_def *sum = nir_sad_u8x4(b, nir_unpack_64_2x32_split_x(b, sad_op), nir_imm_int(b, 0), nir_imm_int(b, 0)); + return nir_sad_u8x4(b, nir_unpack_64_2x32_split_y(b, sad_op), nir_imm_int(b, 0), sum); + } + } else { + unreachable("Unimplemented NGG wave count"); + } +} + /** * Repacks invocations in the current workgroup to eliminate gaps between them. * @@ -209,41 +273,7 @@ repack_invocations_in_workgroup(nir_builder *b, nir_ssa_def *input_bool, */ nir_ssa_def *num_waves = nir_build_load_num_subgroups(b); - - /* sel = 0x01010101 * lane_id + 0x03020100 */ - nir_ssa_def *lane_id = nir_load_subgroup_invocation(b); - nir_ssa_def *packed_id = nir_build_byte_permute_amd(b, nir_imm_int(b, 0), lane_id, nir_imm_int(b, 0)); - nir_ssa_def *sel = nir_iadd_imm_nuw(b, packed_id, 0x03020100); - nir_ssa_def *sum = NULL; - - if (num_lds_dwords == 1) { - /* Broadcast the packed data we read from LDS (to the first 16 lanes, but we only care up to num_waves). */ - nir_ssa_def *packed_dw = nir_build_lane_permute_16_amd(b, packed_counts, nir_imm_int(b, 0), nir_imm_int(b, 0)); - - /* Use byte-permute to filter out the bytes not needed by the current lane. */ - nir_ssa_def *filtered_packed = nir_build_byte_permute_amd(b, packed_dw, nir_imm_int(b, 0), sel); - - /* Horizontally add the packed bytes. */ - sum = nir_sad_u8x4(b, filtered_packed, nir_imm_int(b, 0), nir_imm_int(b, 0)); - } else if (num_lds_dwords == 2) { - /* Create selectors for the byte-permutes below. */ - nir_ssa_def *dw0_selector = nir_build_lane_permute_16_amd(b, sel, nir_imm_int(b, 0x44443210), nir_imm_int(b, 0x4)); - nir_ssa_def *dw1_selector = nir_build_lane_permute_16_amd(b, sel, nir_imm_int(b, 0x32100000), nir_imm_int(b, 0x4)); - - /* Broadcast the packed data we read from LDS (to the first 16 lanes, but we only care up to num_waves). 
*/ - nir_ssa_def *packed_dw0 = nir_build_lane_permute_16_amd(b, nir_unpack_64_2x32_split_x(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0)); - nir_ssa_def *packed_dw1 = nir_build_lane_permute_16_amd(b, nir_unpack_64_2x32_split_y(b, packed_counts), nir_imm_int(b, 0), nir_imm_int(b, 0)); - - /* Use byte-permute to filter out the bytes not needed by the current lane. */ - nir_ssa_def *filtered_packed_dw0 = nir_build_byte_permute_amd(b, packed_dw0, nir_imm_int(b, 0), dw0_selector); - nir_ssa_def *filtered_packed_dw1 = nir_build_byte_permute_amd(b, packed_dw1, nir_imm_int(b, 0), dw1_selector); - - /* Horizontally add the packed bytes. */ - sum = nir_sad_u8x4(b, filtered_packed_dw0, nir_imm_int(b, 0), nir_imm_int(b, 0)); - sum = nir_sad_u8x4(b, filtered_packed_dw1, nir_imm_int(b, 0), sum); - } else { - unreachable("Unimplemented NGG wave count"); - } + nir_ssa_def *sum = summarize_repack(b, packed_counts, num_lds_dwords); nir_ssa_def *wg_repacked_index_base = nir_build_read_invocation(b, sum, wave_id); nir_ssa_def *wg_num_repacked_invocations = nir_build_read_invocation(b, sum, num_waves); @@ -265,9 +295,10 @@ pervertex_lds_addr(nir_builder *b, nir_ssa_def *vertex_idx, unsigned per_vtx_byt static nir_ssa_def * emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives, - nir_ssa_def *vertex_indices[3], nir_ssa_def *is_null_prim) + nir_ssa_def *vertex_indices[3], nir_ssa_def *is_null_prim, + bool use_edgeflags) { - nir_ssa_def *arg = b->shader->info.stage == MESA_SHADER_VERTEX + nir_ssa_def *arg = use_edgeflags ? nir_build_load_initial_edgeflags_amd(b) : nir_imm_int(b, 0); @@ -289,9 +320,8 @@ emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives, static nir_ssa_def * ngg_input_primitive_vertex_index(nir_builder *b, unsigned vertex) { - /* TODO: This is RADV specific. We'll need to refactor RADV and/or RadeonSI to match. */ - return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u * 2u), - nir_imm_int(b, (vertex % 2u) * 16u), nir_imm_int(b, 16u)); + return nir_ubfe(b, nir_build_load_gs_vertex_offset_amd(b, .base = vertex / 2u), + nir_imm_int(b, (vertex & 1u) * 16u), nir_imm_int(b, 16u)); } static nir_ssa_def * @@ -311,7 +341,7 @@ emit_ngg_nogs_prim_exp_arg(nir_builder *b, lower_ngg_nogs_state *st) ? ngg_input_primitive_vertex_index(b, 2) : nir_imm_zero(b, 1, 32); - return emit_pack_ngg_prim_exp_arg(b, st->num_vertices_per_primitives, vtx_idx, NULL); + return emit_pack_ngg_prim_exp_arg(b, st->num_vertices_per_primitives, vtx_idx, NULL, st->use_edgeflags); } } @@ -532,6 +562,105 @@ remove_extra_pos_outputs(nir_shader *shader, lower_ngg_nogs_state *nogs_state) nir_metadata_block_index | nir_metadata_dominance, &s); } +static bool +remove_compacted_arg(lower_ngg_nogs_state *state, nir_builder *b, unsigned idx) +{ + nir_instr *store_instr = state->compact_arg_stores[idx]; + if (!store_instr) + return false; + + /* Simply remove the store. */ + nir_instr_remove(store_instr); + + /* Find the intrinsic that overwrites the shader arguments, + * and change its corresponding source. + * This will cause NIR's DCE to recognize the load and its phis as dead. 
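+ * (Overwriting the source with undef is safe: the argument is unused, which is the reason its store could be removed in the first place.)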
+ */ + b->cursor = nir_before_instr(&state->overwrite_args->instr); + nir_ssa_def *undef_arg = nir_ssa_undef(b, 1, 32); + nir_ssa_def_rewrite_uses(state->overwrite_args->src[idx].ssa, undef_arg); + + state->compact_arg_stores[idx] = NULL; + return true; +} + +static bool +cleanup_culling_shader_after_dce(nir_shader *shader, + nir_function_impl *function_impl, + lower_ngg_nogs_state *state) +{ + bool uses_vs_vertex_id = false; + bool uses_vs_instance_id = false; + bool uses_tes_u = false; + bool uses_tes_v = false; + bool uses_tes_rel_patch_id = false; + bool uses_tes_patch_id = false; + + bool progress = false; + nir_builder b; + nir_builder_init(&b, function_impl); + + nir_foreach_block_reverse_safe(block, function_impl) { + nir_foreach_instr_reverse_safe(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + switch (intrin->intrinsic) { + case nir_intrinsic_alloc_vertices_and_primitives_amd: + goto cleanup_culling_shader_after_dce_done; + case nir_intrinsic_load_vertex_id: + case nir_intrinsic_load_vertex_id_zero_base: + uses_vs_vertex_id = true; + break; + case nir_intrinsic_load_instance_id: + uses_vs_instance_id = true; + break; + case nir_intrinsic_load_input: + if (state->instance_rate_inputs & + (1 << (nir_intrinsic_base(intrin) - VERT_ATTRIB_GENERIC0))) + uses_vs_instance_id = true; + else + uses_vs_vertex_id = true; + break; + case nir_intrinsic_load_tess_coord: + uses_tes_u = uses_tes_v = true; + break; + case nir_intrinsic_load_tess_rel_patch_id_amd: + uses_tes_rel_patch_id = true; + break; + case nir_intrinsic_load_primitive_id: + if (shader->info.stage == MESA_SHADER_TESS_EVAL) + uses_tes_patch_id = true; + break; + default: + break; + } + } + } + + cleanup_culling_shader_after_dce_done: + + if (shader->info.stage == MESA_SHADER_VERTEX) { + if (!uses_vs_vertex_id) + progress |= remove_compacted_arg(state, &b, 0); + if (!uses_vs_instance_id) + progress |= remove_compacted_arg(state, &b, 1); + } else if (shader->info.stage == MESA_SHADER_TESS_EVAL) { + if (!uses_tes_u) + progress |= remove_compacted_arg(state, &b, 0); + if (!uses_tes_v) + progress |= remove_compacted_arg(state, &b, 1); + if (!uses_tes_rel_patch_id) + progress |= remove_compacted_arg(state, &b, 2); + if (!uses_tes_patch_id) + progress |= remove_compacted_arg(state, &b, 3); + } + + return progress; +} + /** * Perform vertex compaction after culling. * @@ -548,6 +677,9 @@ compact_vertices_after_culling(nir_builder *b, nir_variable **gs_vtxaddr_vars, nir_ssa_def *invocation_index, nir_ssa_def *es_vertex_lds_addr, + nir_ssa_def *es_exporter_tid, + nir_ssa_def *num_live_vertices_in_workgroup, + nir_ssa_def *fully_culled, unsigned ngg_scratch_lds_base_addr, unsigned pervertex_lds_bytes, unsigned max_exported_args) @@ -557,15 +689,7 @@ compact_vertices_after_culling(nir_builder *b, nir_variable *position_value_var = nogs_state->position_value_var; nir_variable *prim_exp_arg_var = nogs_state->prim_exp_arg_var; - nir_ssa_def *es_accepted = nir_load_var(b, es_accepted_var); - - /* Repack the vertices that survived the culling. 
*/ - wg_repack_result rep = repack_invocations_in_workgroup(b, es_accepted, ngg_scratch_lds_base_addr, - nogs_state->max_num_waves, nogs_state->wave_size); - nir_ssa_def *num_live_vertices_in_workgroup = rep.num_repacked_invocations; - nir_ssa_def *es_exporter_tid = rep.repacked_invocation_index; - - nir_if *if_es_accepted = nir_push_if(b, es_accepted); + nir_if *if_es_accepted = nir_push_if(b, nir_load_var(b, es_accepted_var)); { nir_ssa_def *exporter_addr = pervertex_lds_addr(b, es_exporter_tid, pervertex_lds_bytes); @@ -579,25 +703,13 @@ compact_vertices_after_culling(nir_builder *b, /* Store the current thread's repackable arguments to the exporter thread's LDS space */ for (unsigned i = 0; i < max_exported_args; ++i) { nir_ssa_def *arg_val = nir_load_var(b, repacked_arg_vars[i]); - nir_build_store_shared(b, arg_val, exporter_addr, .base = lds_es_arg_0 + 4u * i, .align_mul = 4u, .write_mask = 0x1u); + nir_intrinsic_instr *store = nir_build_store_shared(b, arg_val, exporter_addr, .base = lds_es_arg_0 + 4u * i, .align_mul = 4u, .write_mask = 0x1u); + + nogs_state->compact_arg_stores[i] = &store->instr; } } nir_pop_if(b, if_es_accepted); - /* If all vertices are culled, set primitive count to 0 as well. */ - nir_ssa_def *num_exported_prims = nir_build_load_workgroup_num_input_primitives_amd(b); - nir_ssa_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u); - num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), num_exported_prims); - - nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0))); - { - /* Tell the final vertex and primitive count to the HW. - * We do this here to mask some of the latency of the LDS. - */ - nir_build_alloc_vertices_and_primitives_amd(b, num_live_vertices_in_workgroup, num_exported_prims); - } - nir_pop_if(b, if_wave_0); - /* TODO: Consider adding a shortcut exit. * Waves that have no vertices and primitives left can s_endpgm right here. */ @@ -618,6 +730,12 @@ compact_vertices_after_culling(nir_builder *b, nir_store_var(b, repacked_arg_vars[i], arg_val, 0x1u); } } + nir_push_else(b, if_packed_es_thread); + { + nir_store_var(b, position_value_var, nir_ssa_undef(b, 4, 32), 0xfu); + for (unsigned i = 0; i < max_exported_args; ++i) + nir_store_var(b, repacked_arg_vars[i], nir_ssa_undef(b, 1, 32), 0x1u); + } nir_pop_if(b, if_packed_es_thread); nir_if *if_gs_accepted = nir_push_if(b, nir_load_var(b, gs_accepted_var)); @@ -631,7 +749,7 @@ compact_vertices_after_culling(nir_builder *b, exporter_vtx_indices[v] = nir_u2u32(b, exporter_vtx_idx); } - nir_ssa_def *prim_exp_arg = emit_pack_ngg_prim_exp_arg(b, 3, exporter_vtx_indices, NULL); + nir_ssa_def *prim_exp_arg = emit_pack_ngg_prim_exp_arg(b, 3, exporter_vtx_indices, NULL, nogs_state->use_edgeflags); nir_store_var(b, prim_exp_arg_var, prim_exp_arg, 0x1u); } nir_pop_if(b, if_gs_accepted); @@ -735,34 +853,27 @@ analyze_shader_before_culling(nir_shader *shader, lower_ngg_nogs_state *nogs_sta static void save_reusable_variables(nir_builder *b, lower_ngg_nogs_state *nogs_state) { - ASSERTED int vec_ok = u_vector_init(&nogs_state->saved_uniforms, sizeof(saved_uniform), 4 * sizeof(saved_uniform)); + ASSERTED int vec_ok = u_vector_init(&nogs_state->saved_uniforms, 4, sizeof(saved_uniform)); assert(vec_ok); - unsigned loop_depth = 0; - - nir_foreach_block_safe(block, b->impl) { - /* Check whether we're in a loop. 
*/ - nir_cf_node *next_cf_node = nir_cf_node_next(&block->cf_node); - nir_cf_node *prev_cf_node = nir_cf_node_prev(&block->cf_node); - if (next_cf_node && next_cf_node->type == nir_cf_node_loop) - loop_depth++; - if (prev_cf_node && prev_cf_node->type == nir_cf_node_loop) - loop_depth--; - - /* The following code doesn't make sense in loops, so just skip it then. */ - if (loop_depth) - continue; - + nir_block *block = nir_start_block(b->impl); + while (block) { + /* Process the instructions in the current block. */ nir_foreach_instr_safe(instr, block) { /* Find instructions whose SSA definitions are used by both - * the top and bottom parts of the shader. In this case, it - * makes sense to try to reuse these from the top part. + * the top and bottom parts of the shader (before and after culling). + * Only in this case, it makes sense for the bottom part + * to try to reuse these from the top part. */ if ((instr->pass_flags & nggc_passflag_used_by_both) != nggc_passflag_used_by_both) continue; + /* Determine if we can reuse the current SSA value. + * When vertex compaction is used, it is possible that the same shader invocation + * processes a different vertex in the top and bottom part of the shader. + * Therefore, we only reuse uniform values. + */ nir_ssa_def *ssa = NULL; - switch (instr->type) { case nir_instr_type_alu: { nir_alu_instr *alu = nir_instr_as_alu(instr); @@ -796,6 +907,7 @@ save_reusable_variables(nir_builder *b, lower_ngg_nogs_state *nogs_state) assert(ssa); + /* Determine a suitable type for the SSA value. */ enum glsl_base_type base_type = GLSL_TYPE_UINT; switch (ssa->bit_size) { case 8: base_type = GLSL_TYPE_UINT8; break; @@ -812,6 +924,10 @@ save_reusable_variables(nir_builder *b, lower_ngg_nogs_state *nogs_state) saved_uniform *saved = (saved_uniform *) u_vector_add(&nogs_state->saved_uniforms); assert(saved); + /* Create a new NIR variable where we store the reusable value. + * Then, we reload the variable and replace the uses of the value + * with the reloaded variable. + */ saved->var = nir_local_variable_create(b->impl, t, NULL); saved->ssa = ssa; @@ -822,6 +938,35 @@ save_reusable_variables(nir_builder *b, lower_ngg_nogs_state *nogs_state) nir_ssa_def *reloaded = nir_load_var(b, saved->var); nir_ssa_def_rewrite_uses_after(ssa, reloaded, reloaded->parent_instr); } + + /* Look at the next CF node. */ + nir_cf_node *next_cf_node = nir_cf_node_next(&block->cf_node); + if (next_cf_node) { + /* It makes no sense to try to reuse things from within loops. */ + bool next_is_loop = next_cf_node->type == nir_cf_node_loop; + + /* Don't reuse if we're in divergent control flow. + * + * Thanks to vertex repacking, the same shader invocation may process a different vertex + * in the top and bottom part, and it's even possible that this different vertex was initially + * processed in a different wave. So the two parts may take a different divergent code path. + * Therefore, these variables in divergent control flow may stay undefined. + * + * Note that this problem doesn't exist if vertices are not repacked or if the + * workgroup only has a single wave. + */ + bool next_is_divergent_if = + next_cf_node->type == nir_cf_node_if && + nir_cf_node_as_if(next_cf_node)->condition.ssa->divergent; + + if (next_is_loop || next_is_divergent_if) { + block = nir_cf_node_cf_tree_next(next_cf_node); + continue; + } + } + + /* Go to the next block. 
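+ * Unlike the jump above, this also descends into the following CF node, so blocks inside uniform control flow are still processed.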
*/ + block = nir_block_cf_tree_next(block); } } @@ -846,8 +991,7 @@ apply_reusable_variables(nir_builder *b, lower_ngg_nogs_state *nogs_state) /* When we found any of these intrinsics, it means * we reached the top part and we must stop. */ - if (intrin->intrinsic == nir_intrinsic_alloc_vertices_and_primitives_amd || - intrin->intrinsic == nir_intrinsic_export_primitive_amd) + if (intrin->intrinsic == nir_intrinsic_alloc_vertices_and_primitives_amd) goto done; if (intrin->intrinsic != nir_intrinsic_store_deref) @@ -1045,10 +1189,31 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c } nir_pop_if(b, if_es_thread); + nir_ssa_def *es_accepted = nir_load_var(b, es_accepted_var); + + /* Repack the vertices that survived the culling. */ + wg_repack_result rep = repack_invocations_in_workgroup(b, es_accepted, ngg_scratch_lds_base_addr, + nogs_state->max_num_waves, nogs_state->wave_size); + nir_ssa_def *num_live_vertices_in_workgroup = rep.num_repacked_invocations; + nir_ssa_def *es_exporter_tid = rep.repacked_invocation_index; + + /* If all vertices are culled, set primitive count to 0 as well. */ + nir_ssa_def *num_exported_prims = nir_build_load_workgroup_num_input_primitives_amd(b); + nir_ssa_def *fully_culled = nir_ieq_imm(b, num_live_vertices_in_workgroup, 0u); + num_exported_prims = nir_bcsel(b, fully_culled, nir_imm_int(b, 0u), num_exported_prims); + + nir_if *if_wave_0 = nir_push_if(b, nir_ieq(b, nir_build_load_subgroup_id(b), nir_imm_int(b, 0))); + { + /* Tell the final vertex and primitive count to the HW. */ + nir_build_alloc_vertices_and_primitives_amd(b, num_live_vertices_in_workgroup, num_exported_prims); + } + nir_pop_if(b, if_wave_0); + /* Vertex compaction. */ compact_vertices_after_culling(b, nogs_state, repacked_arg_vars, gs_vtxaddr_vars, invocation_index, es_vertex_lds_addr, + es_exporter_tid, num_live_vertices_in_workgroup, fully_culled, ngg_scratch_lds_base_addr, pervertex_lds_bytes, max_exported_args); } nir_push_else(b, if_cull_en); @@ -1083,56 +1248,36 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c */ if (b->shader->info.stage == MESA_SHADER_VERTEX) - nir_build_overwrite_vs_arguments_amd(b, - nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1])); + nogs_state->overwrite_args = + nir_build_overwrite_vs_arguments_amd(b, + nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1])); else if (b->shader->info.stage == MESA_SHADER_TESS_EVAL) - nir_build_overwrite_tes_arguments_amd(b, - nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1]), - nir_load_var(b, repacked_arg_vars[2]), nir_load_var(b, repacked_arg_vars[3])); + nogs_state->overwrite_args = + nir_build_overwrite_tes_arguments_amd(b, + nir_load_var(b, repacked_arg_vars[0]), nir_load_var(b, repacked_arg_vars[1]), + nir_load_var(b, repacked_arg_vars[2]), nir_load_var(b, repacked_arg_vars[3])); else unreachable("Should be VS or TES."); } -static bool -can_use_deferred_attribute_culling(nir_shader *shader) -{ - /* When the shader writes memory, it is difficult to guarantee correctness. - * Future work: - * - if only write-only SSBOs are used - * - if we can prove that non-position outputs don't rely on memory stores - * then may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd. - */ - if (shader->info.writes_memory) - return false; - - /* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling. 
- * Future work: try to save this to LDS and reload, but it can still be broken in subtle ways. - */ - if (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION)) - return false; - - return true; -} - -ac_nir_ngg_config +void ac_nir_lower_ngg_nogs(nir_shader *shader, unsigned max_num_es_vertices, unsigned num_vertices_per_primitives, unsigned max_workgroup_size, unsigned wave_size, - bool consider_culling, - bool consider_passthrough, + bool can_cull, + bool early_prim_export, + bool passthrough, bool export_prim_id, - bool provoking_vtx_last) + bool provoking_vtx_last, + bool use_edgeflags, + uint32_t instance_rate_inputs) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); assert(impl); assert(max_num_es_vertices && max_workgroup_size && wave_size); - - bool can_cull = consider_culling && (num_vertices_per_primitives == 3) && - can_use_deferred_attribute_culling(shader); - bool passthrough = consider_passthrough && !can_cull && - !(shader->info.stage == MESA_SHADER_VERTEX && export_prim_id); + assert(!(can_cull && passthrough)); nir_variable *position_value_var = nir_local_variable_create(impl, glsl_vec4_type(), "position_value"); nir_variable *prim_exp_arg_var = nir_local_variable_create(impl, glsl_uint_type(), "prim_exp_arg"); @@ -1142,7 +1287,8 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, lower_ngg_nogs_state state = { .passthrough = passthrough, .export_prim_id = export_prim_id, - .early_prim_export = exec_list_is_singular(&impl->body), + .early_prim_export = early_prim_export, + .use_edgeflags = use_edgeflags, .num_vertices_per_primitives = num_vertices_per_primitives, .provoking_vtx_idx = provoking_vtx_last ? (num_vertices_per_primitives - 1) : 0, .position_value_var = position_value_var, @@ -1152,15 +1298,13 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, .max_num_waves = DIV_ROUND_UP(max_workgroup_size, wave_size), .max_es_num_vertices = max_num_es_vertices, .wave_size = wave_size, + .instance_rate_inputs = instance_rate_inputs, }; /* We need LDS space when VS needs to export the primitive ID. */ if (shader->info.stage == MESA_SHADER_VERTEX && export_prim_id) state.total_lds_bytes = max_num_es_vertices * 4u; - /* The shader only needs this much LDS when culling is turned off. 
*/ - unsigned lds_bytes_if_culling_off = state.total_lds_bytes; - nir_builder builder; nir_builder *b = &builder; /* This is to avoid the & */ nir_builder_init(b, impl); @@ -1267,20 +1411,12 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, NIR_PASS(progress, shader, nir_opt_undef); NIR_PASS(progress, shader, nir_opt_dce); NIR_PASS(progress, shader, nir_opt_dead_cf); + + if (can_cull) + progress |= cleanup_culling_shader_after_dce(shader, b->impl, &state); } while (progress); shader->info.shared_size = state.total_lds_bytes; - - ac_nir_ngg_config ret = { - .lds_bytes_if_culling_off = lds_bytes_if_culling_off, - .can_cull = can_cull, - .passthrough = passthrough, - .early_prim_export = state.early_prim_export, - .nggc_inputs_read_by_pos = state.inputs_needed_by_pos, - .nggc_inputs_read_by_others = state.inputs_needed_by_others, - }; - - return ret; } static nir_ssa_def * @@ -1568,7 +1704,7 @@ ngg_gs_export_primitives(nir_builder *b, nir_ssa_def *max_num_out_prims, nir_ssa } } - nir_ssa_def *arg = emit_pack_ngg_prim_exp_arg(b, s->num_vertices_per_primitive, vtx_indices, is_null_prim); + nir_ssa_def *arg = emit_pack_ngg_prim_exp_arg(b, s->num_vertices_per_primitive, vtx_indices, is_null_prim, false); nir_build_export_primitive_amd(b, arg); nir_pop_if(b, if_prim_export_thread); } diff --git a/mesa 3D driver/src/amd/common/ac_nir_lower_tess_io_to_mem.c b/mesa 3D driver/src/amd/common/ac_nir_lower_tess_io_to_mem.c index 58d489e377..2137b4f9c0 100644 --- a/mesa 3D driver/src/amd/common/ac_nir_lower_tess_io_to_mem.c +++ b/mesa 3D driver/src/amd/common/ac_nir_lower_tess_io_to_mem.c @@ -154,7 +154,8 @@ typedef struct { } lower_tess_io_state; static bool -match_mask(nir_intrinsic_instr *intrin, +match_mask(gl_shader_stage stage, + nir_intrinsic_instr *intrin, uint64_t mask, bool match_indirect) { @@ -163,7 +164,8 @@ match_mask(nir_intrinsic_instr *intrin, return match_indirect; uint64_t slot = nir_intrinsic_io_semantics(intrin).location; - if (intrin->intrinsic != nir_intrinsic_load_per_vertex_input && + if (stage == MESA_SHADER_TESS_CTRL && + intrin->intrinsic != nir_intrinsic_load_per_vertex_input && intrin->intrinsic != nir_intrinsic_store_per_vertex_output) slot -= VARYING_SLOT_PATCH0; @@ -178,7 +180,7 @@ tcs_output_needs_vmem(nir_intrinsic_instr *intrin, ? st->tes_inputs_read : st->tes_patch_inputs_read; - return match_mask(intrin, mask, true); + return match_mask(MESA_SHADER_TESS_CTRL, intrin, mask, true); } static bool @@ -189,7 +191,7 @@ tcs_output_needs_lds(nir_intrinsic_instr *intrin, ? shader->info.outputs_read : shader->info.patch_outputs_read; - return match_mask(intrin, mask, true); + return match_mask(MESA_SHADER_TESS_CTRL, intrin, mask, true); } static bool @@ -208,7 +210,7 @@ lower_ls_output_store(nir_builder *b, lower_tess_io_state *st = (lower_tess_io_state *) state; /* If this is a temp-only TCS input, we don't need to use shared memory at all. 
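* (The value simply stays in a temporary register instead.)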
*/ - if (match_mask(intrin, st->tcs_temp_only_inputs, false)) + if (match_mask(MESA_SHADER_VERTEX, intrin, st->tcs_temp_only_inputs, false)) return false; b->cursor = nir_before_instr(instr); diff --git a/mesa 3D driver/src/amd/common/ac_rgp.c b/mesa 3D driver/src/amd/common/ac_rgp.c index 7d0d48a91c..5abe0c8b59 100644 --- a/mesa 3D driver/src/amd/common/ac_rgp.c +++ b/mesa 3D driver/src/amd/common/ac_rgp.c @@ -61,10 +61,6 @@ enum sqtt_version { SQTT_VERSION_NONE = 0x0, - SQTT_VERSION_1_0 = 0x1, - SQTT_VERSION_1_1 = 0x2, - SQTT_VERSION_2_0 = 0x3, /* GFX6 */ - SQTT_VERSION_2_1 = 0x4, /* GFX7 */ SQTT_VERSION_2_2 = 0x5, /* GFX8 */ SQTT_VERSION_2_3 = 0x6, /* GFX9 */ SQTT_VERSION_2_4 = 0x7 /* GFX10+ */ @@ -368,10 +364,6 @@ static_assert(sizeof(struct sqtt_file_chunk_asic_info) == 720, static enum sqtt_gfxip_level ac_chip_class_to_sqtt_gfxip_level(enum chip_class chip_class) { switch (chip_class) { - case GFX6: - return SQTT_GFXIP_LEVEL_GFXIP_6; - case GFX7: - return SQTT_GFXIP_LEVEL_GFXIP_7; case GFX8: return SQTT_GFXIP_LEVEL_GFXIP_8; case GFX9: @@ -454,13 +446,20 @@ static void ac_sqtt_fill_asic_info(struct radeon_info *rad_info, if (rad_info->chip_class < GFX9) chunk->flags |= SQTT_FILE_CHUNK_ASIC_INFO_FLAG_SC_PACKER_NUMBERING; - /* Only FIJI and GFX9+ support PS1 events. */ - if (rad_info->family == CHIP_FIJI || rad_info->chip_class >= GFX9) + /* Only GFX9+ support PS1 events. */ + if (rad_info->chip_class >= GFX9) chunk->flags |= SQTT_FILE_CHUNK_ASIC_INFO_FLAG_PS1_EVENT_TOKENS_ENABLED; chunk->trace_shader_core_clock = rad_info->max_shader_clock * 1000000; chunk->trace_memory_clock = rad_info->max_memory_clock * 1000000; + /* RGP gets very confused if these clocks are 0. The 1 GHz clocks are not necessarily correct, + * but the resulting traces are at least somewhat useful. */ + if (!chunk->trace_shader_core_clock) + chunk->trace_shader_core_clock = 1e9; + if (!chunk->trace_memory_clock) + chunk->trace_memory_clock = 1e9; + chunk->device_id = rad_info->pci_id; chunk->device_revision_id = rad_info->pci_rev_id; chunk->vgprs_per_simd = rad_info->num_physical_wave64_vgprs_per_simd * (has_wave32 ? 
2 : 1); @@ -725,10 +724,6 @@ static_assert(sizeof(struct sqtt_file_chunk_sqtt_desc) == 32, static enum sqtt_version ac_chip_class_to_sqtt_version(enum chip_class chip_class) { switch (chip_class) { - case GFX6: - return SQTT_VERSION_2_0; - case GFX7: - return SQTT_VERSION_2_1; case GFX8: return SQTT_VERSION_2_2; case GFX9: @@ -790,8 +785,6 @@ static void ac_sqtt_fill_sqtt_data(struct sqtt_file_chunk_sqtt_data *chunk, int3 */ enum elf_gfxip_level { - EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, - EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, @@ -801,10 +794,6 @@ enum elf_gfxip_level static enum elf_gfxip_level ac_chip_class_to_elf_gfxip_level(enum chip_class chip_class) { switch (chip_class) { - case GFX6: - return EF_AMDGPU_MACH_AMDGCN_GFX600; - case GFX7: - return EF_AMDGPU_MACH_AMDGCN_GFX700; case GFX8: return EF_AMDGPU_MACH_AMDGCN_GFX801; case GFX9: diff --git a/mesa 3D driver/src/amd/common/ac_rgp.h b/mesa 3D driver/src/amd/common/ac_rgp.h index 118fd9d5e9..694aade80d 100644 --- a/mesa 3D driver/src/amd/common/ac_rgp.h +++ b/mesa 3D driver/src/amd/common/ac_rgp.h @@ -52,6 +52,8 @@ struct rgp_shader_data { uint8_t *code; uint32_t vgpr_count; uint32_t sgpr_count; + uint32_t scratch_memory_size; + uint32_t wavefront_size; uint64_t base_address; uint32_t elf_symbol_offset; uint32_t hw_stage; diff --git a/mesa 3D driver/src/amd/common/ac_rgp_elf_object_pack.c b/mesa 3D driver/src/amd/common/ac_rgp_elf_object_pack.c index e70954ce46..4f1961ba02 100644 --- a/mesa 3D driver/src/amd/common/ac_rgp_elf_object_pack.c +++ b/mesa 3D driver/src/amd/common/ac_rgp_elf_object_pack.c @@ -149,7 +149,7 @@ ac_rgp_write_msgpack(FILE *output, ac_msgpack_add_fixstr(&msgpack, hw_stage_string[ record->shader_data[i].hw_stage]); - ac_msgpack_add_fixmap_op(&msgpack, 3); + ac_msgpack_add_fixmap_op(&msgpack, 5); ac_msgpack_add_fixstr(&msgpack, ".entry_point"); ac_msgpack_add_fixstr(&msgpack, hw_stage_symbol_string[ record->shader_data[i].hw_stage]); @@ -161,6 +161,14 @@ ac_rgp_write_msgpack(FILE *output, ac_msgpack_add_fixstr(&msgpack, ".vgpr_count"); ac_msgpack_add_uint(&msgpack, record->shader_data[i].vgpr_count); + + ac_msgpack_add_fixstr(&msgpack, ".scratch_memory_size"); + ac_msgpack_add_uint(&msgpack, + record->shader_data[i].scratch_memory_size); + + ac_msgpack_add_fixstr(&msgpack, ".wavefront_size"); + ac_msgpack_add_uint(&msgpack, + record->shader_data[i].wavefront_size); } /* 5 */ diff --git a/mesa 3D driver/src/amd/common/ac_shader_args.h b/mesa 3D driver/src/amd/common/ac_shader_args.h index 4da9d06612..270682f42d 100644 --- a/mesa 3D driver/src/amd/common/ac_shader_args.h +++ b/mesa 3D driver/src/amd/common/ac_shader_args.h @@ -107,7 +107,7 @@ struct ac_shader_args { struct ac_arg es2gs_offset; /* separate legacy ES */ struct ac_arg gs2vs_offset; /* legacy GS */ struct ac_arg gs_wave_id; /* legacy GS */ - struct ac_arg gs_vtx_offset[6]; /* separate legacy GS */ + struct ac_arg gs_vtx_offset[6]; /* GFX6-8: [0-5], GFX9+: [0-2] packed */ struct ac_arg gs_prim_id; struct ac_arg gs_invocation_id; @@ -139,10 +139,10 @@ struct ac_shader_args { /* Vulkan only */ struct ac_arg push_constants; struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; - unsigned num_inline_push_consts; unsigned base_inline_push_consts; struct ac_arg view_index; struct ac_arg sbt_descriptors; + struct ac_arg ray_launch_size; }; void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned registers, diff --git 
a/mesa 3D driver/src/amd/common/ac_shader_util.c b/mesa 3D driver/src/amd/common/ac_shader_util.c index 645d0d3617..943523b88d 100644 --- a/mesa 3D driver/src/amd/common/ac_shader_util.c +++ b/mesa 3D driver/src/amd/common/ac_shader_util.c @@ -25,6 +25,7 @@ #include "ac_gpu_info.h" #include "sid.h" +#include "u_math.h" #include #include @@ -511,3 +512,72 @@ void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_cu else /* VS */ *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u)); } + +unsigned ac_compute_cs_workgroup_size(uint16_t sizes[3], bool variable, unsigned max) +{ + if (variable) + return max; + + return sizes[0] * sizes[1] * sizes[2]; +} + +unsigned ac_compute_lshs_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, + unsigned tess_num_patches, + unsigned tess_patch_in_vtx, + unsigned tess_patch_out_vtx) +{ + /* When tessellation is used, API VS runs on HW LS, API TCS runs on HW HS. + * These two HW stages are merged on GFX9+. + */ + + bool merged_shaders = chip_class >= GFX9; + unsigned ls_workgroup_size = tess_num_patches * tess_patch_in_vtx; + unsigned hs_workgroup_size = tess_num_patches * tess_patch_out_vtx; + + if (merged_shaders) + return MAX2(ls_workgroup_size, hs_workgroup_size); + else if (stage == MESA_SHADER_VERTEX) + return ls_workgroup_size; + else if (stage == MESA_SHADER_TESS_CTRL) + return hs_workgroup_size; + else + unreachable("invalid LSHS shader stage"); +} + +unsigned ac_compute_esgs_workgroup_size(enum chip_class chip_class, unsigned wave_size, + unsigned es_verts, unsigned gs_inst_prims) +{ + /* ESGS may operate in workgroups if on-chip GS (LDS rings) are enabled. + * + * GFX6: Not possible in the HW. + * GFX7-8 (unmerged): possible in the HW, but not implemented in Mesa. + * GFX9+ (merged): implemented in Mesa. + */ + + if (chip_class <= GFX8) + return wave_size; + + unsigned workgroup_size = MAX2(es_verts, gs_inst_prims); + return CLAMP(workgroup_size, 1, 256); +} + +unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims, + unsigned max_vtx_out, unsigned prim_amp_factor) +{ + /* NGG always operates in workgroups. + * + * For API VS/TES/GS: + * - 1 invocation per input vertex + * - 1 invocation per input primitive + * + * The same invocation can process both an input vertex and primitive, + * however 1 invocation can only output up to 1 vertex and 1 primitive. + */ + + unsigned max_vtx_in = es_verts < 256 ? 
es_verts : 3 * gs_inst_prims; + unsigned max_prim_in = gs_inst_prims; + unsigned max_prim_out = gs_inst_prims * prim_amp_factor; + unsigned workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prim_in, max_prim_out); + + return CLAMP(workgroup_size, 1, 256); +} diff --git a/mesa 3D driver/src/amd/common/ac_shader_util.h b/mesa 3D driver/src/amd/common/ac_shader_util.h index f9020125f4..fcf4e48ca1 100644 --- a/mesa 3D driver/src/amd/common/ac_shader_util.h +++ b/mesa 3D driver/src/amd/common/ac_shader_util.h @@ -27,6 +27,7 @@ #include "ac_binary.h" #include "amd_family.h" #include "compiler/nir/nir.h" +#include "compiler/shader_enums.h" #include #include @@ -104,6 +105,19 @@ void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling, bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask); +unsigned ac_compute_cs_workgroup_size(uint16_t sizes[3], bool variable, unsigned max); + +unsigned ac_compute_lshs_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, + unsigned tess_num_patches, + unsigned tess_patch_in_vtx, + unsigned tess_patch_out_vtx); + +unsigned ac_compute_esgs_workgroup_size(enum chip_class chip_class, unsigned wave_size, + unsigned es_verts, unsigned gs_inst_prims); + +unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims, + unsigned max_vtx_out, unsigned prim_amp_factor); + #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/src/amd/common/ac_surface.c b/mesa 3D driver/src/amd/common/ac_surface.c index 3bcec29292..121093979b 100644 --- a/mesa 3D driver/src/amd/common/ac_surface.c +++ b/mesa 3D driver/src/amd/common/ac_surface.c @@ -105,6 +105,57 @@ bool ac_modifier_has_dcc_retile(uint64_t modifier) return IS_AMD_FMT_MOD(modifier) && AMD_FMT_MOD_GET(DCC_RETILE, modifier); } +bool ac_modifier_supports_dcc_image_stores(uint64_t modifier) +{ + if (!ac_modifier_has_dcc(modifier)) + return false; + + return (!AMD_FMT_MOD_GET(DCC_INDEPENDENT_64B, modifier) && + AMD_FMT_MOD_GET(DCC_INDEPENDENT_128B, modifier) && + AMD_FMT_MOD_GET(DCC_MAX_COMPRESSED_BLOCK, modifier) == AMD_FMT_MOD_DCC_BLOCK_128B) || + (AMD_FMT_MOD_GET(TILE_VERSION, modifier) >= AMD_FMT_MOD_TILE_VER_GFX10_RBPLUS && /* gfx10.3 */ + AMD_FMT_MOD_GET(DCC_INDEPENDENT_64B, modifier) && + AMD_FMT_MOD_GET(DCC_INDEPENDENT_128B, modifier) && + AMD_FMT_MOD_GET(DCC_MAX_COMPRESSED_BLOCK, modifier) == AMD_FMT_MOD_DCC_BLOCK_64B); + +} + + +bool ac_surface_supports_dcc_image_stores(enum chip_class chip_class, + const struct radeon_surf *surf) +{ + /* DCC image stores are only available on GFX10+. */ + if (chip_class < GFX10) + return false; + + /* DCC image stores support the following settings: + * - INDEPENDENT_64B_BLOCKS = 0 + * - INDEPENDENT_128B_BLOCKS = 1 + * - MAX_COMPRESSED_BLOCK_SIZE = 128B + * - MAX_UNCOMPRESSED_BLOCK_SIZE = 256B (always used) + * + * gfx10.3 also supports the following setting: + * - INDEPENDENT_64B_BLOCKS = 1 + * - INDEPENDENT_128B_BLOCKS = 1 + * - MAX_COMPRESSED_BLOCK_SIZE = 64B + * - MAX_UNCOMPRESSED_BLOCK_SIZE = 256B (always used) + * + * The compressor only looks at MAX_COMPRESSED_BLOCK_SIZE to determine + * the INDEPENDENT_xx_BLOCKS settings. 128B implies INDEP_128B, while 64B + * implies INDEP_64B && INDEP_128B. + * + * The same limitations apply to SDMA compressed stores because + * SDMA uses the same DCC codec.
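+ * (In modifier terms, these are the same DCC_INDEPENDENT_* and DCC_MAX_COMPRESSED_BLOCK settings checked by ac_modifier_supports_dcc_image_stores above.)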
+ */ + return (!surf->u.gfx9.color.dcc.independent_64B_blocks && + surf->u.gfx9.color.dcc.independent_128B_blocks && + surf->u.gfx9.color.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_128B) || + (chip_class >= GFX10_3 && /* gfx10.3 */ + surf->u.gfx9.color.dcc.independent_64B_blocks && + surf->u.gfx9.color.dcc.independent_128B_blocks && + surf->u.gfx9.color.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B); +} + static AddrSwizzleMode ac_modifier_gfx9_swizzle_mode(uint64_t modifier) { @@ -302,6 +353,19 @@ bool ac_get_supported_modifiers(const struct radeon_info *info, AMD_FMT_MOD_SET(DCC_INDEPENDENT_128B, 1) | AMD_FMT_MOD_SET(DCC_MAX_COMPRESSED_BLOCK, AMD_FMT_MOD_DCC_BLOCK_128B)) + if (info->chip_class >= GFX10_3) { + if (info->max_render_backends == 1) { + ADD_MOD(AMD_FMT_MOD | common_dcc | + AMD_FMT_MOD_SET(DCC_INDEPENDENT_128B, 1) | + AMD_FMT_MOD_SET(DCC_MAX_COMPRESSED_BLOCK, AMD_FMT_MOD_DCC_BLOCK_128B)) + } + + ADD_MOD(AMD_FMT_MOD | common_dcc | + AMD_FMT_MOD_SET(DCC_RETILE, 1) | + AMD_FMT_MOD_SET(DCC_INDEPENDENT_128B, 1) | + AMD_FMT_MOD_SET(DCC_MAX_COMPRESSED_BLOCK, AMD_FMT_MOD_DCC_BLOCK_128B)) + } + if (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14 || info->chip_class >= GFX10_3) { bool independent_128b = info->chip_class >= GFX10_3; @@ -1402,12 +1466,47 @@ ASSERTED static bool is_dcc_supported_by_L2(const struct radeon_info *info, /* 128B is recommended, but 64B can be set too if needed for 4K by DCN. * Since there is no reason to ever disable 128B, require it. - * DCC image stores are always supported. + * If 64B is used, DCC image stores are unsupported. */ return surf->u.gfx9.color.dcc.independent_128B_blocks && surf->u.gfx9.color.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B; } +static bool gfx10_DCN_requires_independent_64B_blocks(const struct radeon_info *info, + const struct ac_surf_config *config) +{ + assert(info->chip_class >= GFX10); + + /* Older kernels have buggy DAL. */ + if (info->drm_minor <= 43) + return true; + + /* For 4K, DCN requires INDEPENDENT_64B_BLOCKS = 1 and MAX_COMPRESSED_BLOCK_SIZE = 64B. */ + return config->info.width > 2560 || config->info.height > 2560; +} + +void ac_modifier_max_extent(const struct radeon_info *info, + uint64_t modifier, uint32_t *width, uint32_t *height) +{ + if (ac_modifier_has_dcc(modifier)) { + bool independent_64B_blocks = AMD_FMT_MOD_GET(DCC_INDEPENDENT_64B, modifier); + + if (info->chip_class >= GFX10 && !independent_64B_blocks) { + /* For 4K, DCN requires INDEPENDENT_64B_BLOCKS = 1 and MAX_COMPRESSED_BLOCK_SIZE = 64B. */ + *width = 2560; + *height = 2560; + } else { + /* DCC is not supported on surfaces above 5760x5760. */ + *width = 5760; + *height = 5760; + } + } else { + /* Non-DCC modifiers */ + *width = 16384; + *height = 16384; + } +} + static bool is_dcc_supported_by_DCN(const struct radeon_info *info, const struct ac_surf_config *config, const struct radeon_surf *surf, bool rb_aligned, @@ -1424,13 +1523,12 @@ static bool is_dcc_supported_by_DCN(const struct radeon_info *info, if (info->use_display_dcc_unaligned && (rb_aligned || pipe_aligned)) return false; + /* DCC is not supported at resolutions above 5760x5760. */ + if (config->info.width > 5760 || config->info.height > 5760) + return false; + switch (info->chip_class) { case GFX9: - /* Only support 64KB_S_X, so that we have only 1 variant of the retile shader.
*/ - if (info->use_display_dcc_with_retile_blit && - surf->u.gfx9.swizzle_mode != ADDR_SW_64KB_S_X) - return false; - /* There are more constraints, but we always set * INDEPENDENT_64B_BLOCKS = 1 and MAX_COMPRESSED_BLOCK_SIZE = 64B, * which always works. @@ -1440,17 +1538,11 @@ static bool is_dcc_supported_by_DCN(const struct radeon_info *info, return true; case GFX10: case GFX10_3: - /* Only support 64KB_R_X, so that we have only 1 variant of the retile shader. */ - if (info->use_display_dcc_with_retile_blit && - surf->u.gfx9.swizzle_mode != ADDR_SW_64KB_R_X) - return false; - /* DCN requires INDEPENDENT_128B_BLOCKS = 0 only on Navi1x. */ if (info->chip_class == GFX10 && surf->u.gfx9.color.dcc.independent_128B_blocks) return false; - /* For 4K, DCN requires INDEPENDENT_64B_BLOCKS = 1. */ - return ((config->info.width <= 2560 && config->info.height <= 2560) || + return (!gfx10_DCN_requires_independent_64B_blocks(info, config) || (surf->u.gfx9.color.dcc.independent_64B_blocks && surf->u.gfx9.color.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B)); default: @@ -2038,14 +2130,17 @@ static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_ ac_modifier_fill_dcc_params(surf->modifier, surf, &AddrSurfInfoIn); } else if (!AddrSurfInfoIn.flags.depth && !AddrSurfInfoIn.flags.stencil) { /* Optimal values for the L2 cache. */ - if (info->chip_class == GFX9) { - surf->u.gfx9.color.dcc.independent_64B_blocks = 1; - surf->u.gfx9.color.dcc.independent_128B_blocks = 0; - surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; - } else if (info->chip_class >= GFX10) { - surf->u.gfx9.color.dcc.independent_64B_blocks = 0; - surf->u.gfx9.color.dcc.independent_128B_blocks = 1; - surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + /* Don't change the DCC settings for imported buffers - they might differ. */ + if (!(surf->flags & RADEON_SURF_IMPORTED)) { + if (info->chip_class == GFX9) { + surf->u.gfx9.color.dcc.independent_64B_blocks = 1; + surf->u.gfx9.color.dcc.independent_128B_blocks = 0; + surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; + } else if (info->chip_class >= GFX10) { + surf->u.gfx9.color.dcc.independent_64B_blocks = 0; + surf->u.gfx9.color.dcc.independent_128B_blocks = 1; + surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B; + } } if (AddrSurfInfoIn.flags.display) { @@ -2062,7 +2157,9 @@ static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_ } /* Adjust DCC settings to meet DCN requirements. */ - if (info->use_display_dcc_unaligned || info->use_display_dcc_with_retile_blit) { + /* Don't change the DCC settings for imported buffers - they might differ. */ + if (!(surf->flags & RADEON_SURF_IMPORTED) && + (info->use_display_dcc_unaligned || info->use_display_dcc_with_retile_blit)) { /* Only Navi12/14 support independent 64B blocks in L2, * but without DCC image stores. */ @@ -2072,7 +2169,13 @@ static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_ surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; } - if (info->chip_class >= GFX10_3) { + if ((info->chip_class >= GFX10_3 && info->family <= CHIP_YELLOW_CARP) || + /* Newer chips will skip this when possible to get better performance. + * This is also possible for other gfx10.3 chips, but is disabled for + * interoperability between different Mesa versions. 
+ */ + (info->family > CHIP_YELLOW_CARP && + gfx10_DCN_requires_independent_64B_blocks(info, config))) { surf->u.gfx9.color.dcc.independent_64B_blocks = 1; surf->u.gfx9.color.dcc.independent_128B_blocks = 1; surf->u.gfx9.color.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B; diff --git a/mesa 3D driver/src/amd/common/ac_surface.h b/mesa 3D driver/src/amd/common/ac_surface.h index d52a62de02..3f69b34991 100644 --- a/mesa 3D driver/src/amd/common/ac_surface.h +++ b/mesa 3D driver/src/amd/common/ac_surface.h @@ -454,6 +454,9 @@ bool ac_get_supported_modifiers(const struct radeon_info *info, uint64_t *mods); bool ac_modifier_has_dcc(uint64_t modifier); bool ac_modifier_has_dcc_retile(uint64_t modifier); +bool ac_modifier_supports_dcc_image_stores(uint64_t modifier); +void ac_modifier_max_extent(const struct radeon_info *info, + uint64_t modifier, uint32_t *width, uint32_t *height); unsigned ac_surface_get_nplanes(const struct radeon_surf *surf); uint64_t ac_surface_get_plane_offset(enum chip_class chip_class, @@ -469,6 +472,9 @@ uint64_t ac_surface_get_plane_size(const struct radeon_surf *surf, void ac_surface_print_info(FILE *out, const struct radeon_info *info, const struct radeon_surf *surf); +bool ac_surface_supports_dcc_image_stores(enum chip_class chip_class, + const struct radeon_surf *surf); + #ifdef AC_SURFACE_INCLUDE_NIR nir_ssa_def *ac_nir_dcc_addr_from_coord(nir_builder *b, const struct radeon_info *info, unsigned bpe, struct gfx9_meta_equation *equation, diff --git a/mesa 3D driver/src/amd/common/ac_surface_modifier_test.c b/mesa 3D driver/src/amd/common/ac_surface_modifier_test.c index 91bdc8f5bb..e7249822e5 100644 --- a/mesa 3D driver/src/amd/common/ac_surface_modifier_test.c +++ b/mesa 3D driver/src/amd/common/ac_surface_modifier_test.c @@ -397,7 +397,7 @@ int main() STATIC_ASSERT(sizeof(struct test_entry) == 64); struct u_vector test_entries; - u_vector_init(&test_entries, sizeof(struct test_entry), 4096); + u_vector_init_pow2(&test_entries, 64, sizeof(struct test_entry)); for (unsigned i = 0; i < ARRAY_SIZE(testcases); ++i) { struct radeon_info info = get_radeon_info(&testcases[i]); diff --git a/mesa 3D driver/src/amd/compiler/README-ISA.md b/mesa 3D driver/src/amd/compiler/README-ISA.md index 296ba7a864..b49e4d0508 100644 --- a/mesa 3D driver/src/amd/compiler/README-ISA.md +++ b/mesa 3D driver/src/amd/compiler/README-ISA.md @@ -113,6 +113,16 @@ Some instructions have a `_LEGACY` variant which implements "DX9 rules", in whic the zero "wins" in multiplications, ie. `0.0*x` is always `0.0`. The VEGA ISA mentions `V_MAC_LEGACY_F32` but this instruction is not really there on VEGA. +## `m0` with LDS instructions on Vega and newer + +The Vega ISA doc (both the old one and the "7nm" one) claims that LDS instructions +use the `m0` register for address clamping like older GPUs, but this is not the case. + +In reality, only the `_addtid` variants of LDS instructions use `m0` on Vega and +newer GPUs, so the relevant section of the RDNA ISA doc seems to apply. +LLVM also doesn't emit any initialization of `m0` for LDS instructions, and this +was also confirmed by AMD devs. + ## RDNA L0, L1 cache and DLC, GLC bits The old L1 cache was renamed to L0, and a new L1 cache was added to RDNA. 
The diff --git a/mesa 3D driver/src/amd/compiler/README.md b/mesa 3D driver/src/amd/compiler/README.md index 8fe2366ab2..4b91a254d3 100644 --- a/mesa 3D driver/src/amd/compiler/README.md +++ b/mesa 3D driver/src/amd/compiler/README.md @@ -242,6 +242,26 @@ Note that you need to **combine these options into a comma-separated list**, for example: RADV_DEBUG=nocache,shaders ACO_DEBUG=validateir,validatera vkcube ``` +### Using GCC sanitizers + +GCC has several sanitizers which can help track down hard-to-diagnose issues. To use these, you need to pass +the `-Dbsanitize` flag to `meson` when building mesa. For example, `-Dbsanitize=undefined` adds support for +the undefined behavior sanitizer. + +### Hardened builds and libstdc++ assertions + +Several Linux distributions ship "hardened" builds: downstream packaging adds special compiler flags that +are not used in mesa builds by default. These flags may be responsible for bug reports of inexplicable +crashes with assertion failures that you can't reproduce locally. + +Most notable are the libstdc++ debug flags, which you can use by adding the `-D_GLIBCXX_ASSERTIONS=1` and +`-D_GLIBCXX_DEBUG=1` flags. + +To see the full list of downstream compiler flags, you can use e.g. `rpm --eval "%optflags"` +on Red Hat-based distros like Fedora. + +### Good practices + Here are some good practices we learned while debugging visual corruption and hangs. 1. Bisecting shaders: diff --git a/mesa 3D driver/src/amd/compiler/aco_assembler.cpp b/mesa 3D driver/src/amd/compiler/aco_assembler.cpp index fd4916c5de..e520da50fe 100644 --- a/mesa 3D driver/src/amd/compiler/aco_assembler.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_assembler.cpp @@ -61,19 +61,6 @@ struct asm_context { int subvector_begin_pos = -1; }; -static uint32_t -get_sdwa_sel(unsigned sel, PhysReg reg) -{ - if (sel & sdwa_isra) { - unsigned size = sdwa_rasize & sel; - if (size == 1) - return reg.byte(); - else /* size == 2 */ - return sdwa_isword | (reg.byte() >> 1); - } - return sel & sdwa_asuint; -} - unsigned get_mimg_nsa_dwords(const Instruction* instr) { @@ -715,23 +702,23 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst } encoding |= (sdwa.clamp ? 1 : 0) << 13; } else { - encoding |= get_sdwa_sel(sdwa.dst_sel, instr->definitions[0].physReg()) << 8; - uint32_t dst_u = sdwa.dst_sel & sdwa_sext ? 1 : 0; - if (sdwa.dst_preserve || (sdwa.dst_sel & sdwa_isra)) + encoding |= sdwa.dst_sel.to_sdwa_sel(instr->definitions[0].physReg().byte()) << 8; + uint32_t dst_u = sdwa.dst_sel.sign_extend() ? 1 : 0; + if (instr->definitions[0].bytes() < 4) /* dst_preserve */ dst_u = 2; encoding |= dst_u << 11; encoding |= (sdwa.clamp ? 1 : 0) << 13; encoding |= sdwa.omod << 14; } - encoding |= get_sdwa_sel(sdwa.sel[0], sdwa_op.physReg()) << 16; - encoding |= sdwa.sel[0] & sdwa_sext ? 1 << 19 : 0; + encoding |= sdwa.sel[0].to_sdwa_sel(sdwa_op.physReg().byte()) << 16; + encoding |= sdwa.sel[0].sign_extend() ? 1 << 19 : 0; encoding |= sdwa.abs[0] << 21; encoding |= sdwa.neg[0] << 20; if (instr->operands.size() >= 2) { - encoding |= get_sdwa_sel(sdwa.sel[1], instr->operands[1].physReg()) << 24; - encoding |= sdwa.sel[1] & sdwa_sext ? 1 << 27 : 0; + encoding |= sdwa.sel[1].to_sdwa_sel(instr->operands[1].physReg().byte()) << 24; + encoding |= sdwa.sel[1].sign_extend() ?
1 << 27 : 0; encoding |= sdwa.abs[1] << 29; encoding |= sdwa.neg[1] << 28; } diff --git a/mesa 3D driver/src/amd/compiler/aco_builder_h.py b/mesa 3D driver/src/amd/compiler/aco_builder_h.py index 39b77fdc6c..9ebf633eef 100644 --- a/mesa 3D driver/src/amd/compiler/aco_builder_h.py +++ b/mesa 3D driver/src/amd/compiler/aco_builder_h.py @@ -432,7 +432,7 @@ public: } Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op carry_in=Op(Operand(s2)), bool post_ra=false) { - if (!b.op.isTemp() || b.op.regClass().type() != RegType::vgpr) + if (b.op.isConstant() || b.op.regClass().type() != RegType::vgpr) std::swap(a, b); if (!post_ra && (!b.op.hasRegClass() || b.op.regClass().type() == RegType::sgpr)) b = copy(def(v1), b); diff --git a/mesa 3D driver/src/amd/compiler/aco_insert_NOPs.cpp b/mesa 3D driver/src/amd/compiler/aco_insert_NOPs.cpp index edc52c3304..5dd1c7183e 100644 --- a/mesa 3D driver/src/amd/compiler/aco_insert_NOPs.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_insert_NOPs.cpp @@ -33,6 +33,12 @@ namespace aco { namespace { +struct State { + Program* program; + Block* block; + std::vector<aco_ptr<Instruction>> old_instructions; +}; + struct NOP_ctx_gfx6 { void join(const NOP_ctx_gfx6& other) { @@ -198,33 +204,53 @@ regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size) } template <bool Valu, bool Vintrp, bool Salu> -int -handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg, - uint32_t mask) +bool +handle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_needed, uint32_t* mask) { - unsigned mask_size = util_last_bit(mask); - for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { - aco_ptr<Instruction>& pred = block->instructions[pred_idx]; + unsigned mask_size = util_last_bit(*mask); - uint32_t writemask = 0; - for (Definition& def : pred->definitions) { - if (regs_intersect(reg, mask_size, def.physReg(), def.size())) { - unsigned start = def.physReg() > reg ? def.physReg() - reg : 0; - unsigned end = MIN2(mask_size, start + def.size()); - writemask |= u_bit_consecutive(start, end - start); - } + uint32_t writemask = 0; + for (Definition& def : pred->definitions) { + if (regs_intersect(reg, mask_size, def.physReg(), def.size())) { + unsigned start = def.physReg() > reg ? def.physReg() - reg : 0; + unsigned end = MIN2(mask_size, start + def.size()); + writemask |= u_bit_consecutive(start, end - start); } + } - bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || - (pred->isVINTRP() && Vintrp) || (pred->isSALU() && Salu)); - if (is_hazard) + bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) || + (pred->isSALU() && Salu)); + if (is_hazard) + return true; + + *mask &= ~writemask; + *nops_needed = MAX2(*nops_needed - get_wait_states(pred), 0); + + if (*mask == 0) + *nops_needed = 0; + + return *nops_needed == 0; +} + +template <bool Valu, bool Vintrp, bool Salu> +int +handle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask, + bool start_at_end) +{ + if (block == state.block && start_at_end) { + /* If it's the current block, block->instructions is incomplete. */ + for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) { + aco_ptr<Instruction>& instr = state.old_instructions[pred_idx]; + if (!instr) + break; /* Instruction has been moved to block->instructions. */ + if (handle_raw_hazard_instr(instr, reg, &nops_needed, &mask)) + return nops_needed; + } + } + for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { + if (handle_raw_hazard_instr(block->instructions[pred_idx], reg, + &nops_needed, &mask)) return nops_needed; - - mask &= ~writemask; - nops_needed -= get_wait_states(pred); - - if (nops_needed <= 0 || mask == 0) - return 0; } int res = 0; @@ -233,20 +259,21 @@ handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, Phys * states. So even with loops this should finish unless nops_needed is some * huge value. */ for (unsigned lin_pred : block->linear_preds) { - res = std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>( - program, &program->blocks[lin_pred], nops_needed, reg, mask)); + res = + std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>( + state, &state.program->blocks[lin_pred], nops_needed, reg, mask, true)); } return res; } template <bool Valu, bool Vintrp, bool Salu> void -handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op) +handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op) { if (*NOPs >= min_states) return; int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>( - program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); + state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size()), false); *NOPs = MAX2(*NOPs, res); } @@ -260,7 +287,7 @@ set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size) unsigned end = start + size - 1; unsigned start_mod = start % BITSET_WORDBITS; if (start_mod + size <= BITSET_WORDBITS) { - BITSET_SET_RANGE(words, start, end); + BITSET_SET_RANGE_INSIDE_WORD(words, start, end); } else { unsigned first_size = BITSET_WORDBITS - start_mod; set_bitset_range(words, start, BITSET_WORDBITS - start_mod); @@ -322,15 +349,14 @@ handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, +handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions) { /* check hazards */ int NOPs = 0; if (instr->isSMEM()) { - if (program->chip_class == GFX6) { + if (state.program->chip_class == GFX6) { /* A read of an SGPR by SMRD instruction requires 4 wait states * when the SGPR was written by a VALU instruction.
According to LLVM, * there is also an undocumented hardware behavior when the buffer @@ -342,13 +368,13 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, bool is_buffer_desc = i == 0 && op.size() > 2; if (is_buffer_desc) - handle_valu_salu_then_read_hazard(program, cur_block, &NOPs, 4, op); + handle_valu_salu_then_read_hazard(state, &NOPs, 4, op); else - handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, op); + handle_valu_then_read_hazard(state, &NOPs, 4, op); } } - handle_smem_clause_hazards(program, ctx, instr, &NOPs); + handle_smem_clause_hazards(state.program, ctx, instr, &NOPs); } else if (instr->isSALU()) { if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 || @@ -356,7 +382,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg); } - if (program->chip_class == GFX9) { + if (state.program->chip_class == GFX9) { if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 || instr->opcode == aco_opcode::s_movreld_b32 || @@ -379,7 +405,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, if (instr->isDPP()) { NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp); - handle_valu_then_read_hazard(program, cur_block, &NOPs, 2, instr->operands[0]); + handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]); } for (Definition def : instr->definitions) { @@ -394,7 +420,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) && !instr->operands[1].isConstant()) { - handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, instr->operands[1]); + handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]); } /* It's required to insert 1 wait state if the dst VGPR of any v_interp_* @@ -402,10 +428,10 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, * hangs on GFX6. Note that v_writelane_* is apparently not affected. * This hazard isn't documented anywhere but AMD confirmed that hazard. */ - if (program->chip_class == GFX6 && + if (state.program->chip_class == GFX6 && (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */ instr->opcode == aco_opcode::v_readfirstlane_b32)) { - handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]); + handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]); } if (instr->opcode == aco_opcode::v_div_fmas_f32 || @@ -415,14 +441,14 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. 
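* (handle_raw_hazard subtracts the wait states of instructions already executed in between, so fewer NOPs may actually be needed.)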
*/ for (Operand op : instr->operands) { if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr) - handle_valu_then_read_hazard(program, cur_block, &NOPs, 5, op); + handle_valu_then_read_hazard(state, &NOPs, 5, op); } } if (!instr->isSALU() && instr->format != Format::SMEM) NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector); - if (program->chip_class == GFX9) { + if (state.program->chip_class == GFX9) { bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds; if (instr->isVINTRP() || lds_scratch_global || instr->opcode == aco_opcode::ds_read_addtid_b32 || @@ -449,7 +475,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, ctx.smem_clause = false; ctx.smem_write = false; - if (program->dev.xnack_enabled) { + if (state.program->dev.xnack_enabled) { BITSET_ZERO(ctx.smem_clause_read_write); BITSET_ZERO(ctx.smem_clause_write); } @@ -461,7 +487,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, } else { ctx.smem_clause = true; - if (program->dev.xnack_enabled) { + if (state.program->dev.xnack_enabled) { for (Operand op : instr->operands) { if (!op.isConstant()) { set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size()); @@ -606,8 +632,7 @@ instr_is_branch(const aco_ptr<Instruction>& instr) } void -handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, - aco_ptr<Instruction>& instr, +handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions) { // TODO: s_dcache_inv needs to be in its own group on GFX10 @@ -620,7 +645,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, /* Remember all SGPRs that are read by the VMEM instruction */ mark_read_regs(instr, ctx.sgprs_read_by_VMEM); ctx.sgprs_read_by_VMEM.set(exec); - if (program->wave_size == 64) + if (state.program->wave_size == 64) ctx.sgprs_read_by_VMEM.set(exec_hi); } else if (instr->isSALU() || instr->isSMEM()) { if (instr->opcode == aco_opcode::s_waitcnt) { @@ -776,7 +801,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, if (instr->isMUBUF() || instr->isMTBUF()) { uint32_t offset = instr->isMUBUF() ?
instr->mubuf().offset : instr->mtbuf().offset; if (offset & 6) - Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); + Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); } } @@ -788,12 +813,12 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, } else if (ctx.has_writelane) { ctx.has_writelane = false; if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0) - Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); + Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); } } template -using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr&, +using HandleInstr = void (*)(State& state, Ctx&, aco_ptr&, std::vector>&); template Handle> @@ -803,13 +828,16 @@ handle_block(Program* program, Ctx& ctx, Block& block) if (block.instructions.empty()) return; - std::vector> old_instructions = std::move(block.instructions); + State state; + state.program = program; + state.block = █ + state.old_instructions = std::move(block.instructions); block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning - block.instructions.reserve(old_instructions.size()); + block.instructions.reserve(state.old_instructions.size()); - for (aco_ptr& instr : old_instructions) { - Handle(program, &block, ctx, instr, block.instructions); + for (aco_ptr& instr : state.old_instructions) { + Handle(state, ctx, instr, block.instructions); block.instructions.emplace_back(std::move(instr)); } } @@ -819,7 +847,7 @@ void mitigate_hazards(Program* program) { std::vector all_ctx(program->blocks.size()); - std::stack loop_header_indices; + std::stack> loop_header_indices; for (unsigned i = 0; i < program->blocks.size(); i++) { Block& block = program->blocks[i]; diff --git a/mesa 3D driver/src/amd/compiler/aco_insert_waitcnt.cpp b/mesa 3D driver/src/amd/compiler/aco_insert_waitcnt.cpp index d7fc87c126..2934c71c08 100644 --- a/mesa 3D driver/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_insert_waitcnt.cpp @@ -767,9 +767,16 @@ insert_wait_states(Program* program) std::vector in_ctx(program->blocks.size(), wait_ctx(program)); std::vector out_ctx(program->blocks.size(), wait_ctx(program)); - std::stack loop_header_indices; + std::stack> loop_header_indices; unsigned loop_progress = 0; + if (program->stage.has(SWStage::VS) && program->info->vs.dynamic_inputs) { + for (Definition def : program->vs_inputs) { + update_counters(in_ctx[0], event_vmem); + insert_wait_entry(in_ctx[0], def, event_vmem); + } + } + for (unsigned i = 0; i < program->blocks.size();) { Block& current = program->blocks[i++]; wait_ctx ctx = in_ctx[current.index]; diff --git a/mesa 3D driver/src/amd/compiler/aco_instruction_selection.cpp b/mesa 3D driver/src/amd/compiler/aco_instruction_selection.cpp index 4b3a913b86..54575986aa 100644 --- a/mesa 3D driver/src/amd/compiler/aco_instruction_selection.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_instruction_selection.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include namespace aco { @@ -634,27 +635,10 @@ convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsign assert(src_bits < 32); bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(), Operand::c32(src_bits), Operand::c32((unsigned)sign_extend)); - } else if (ctx->options->chip_class >= GFX8) { - assert(src_bits < 32); - assert(src_bits != 8 || src.regClass() == v1b); - assert(src_bits != 16 || src.regClass() == v2b); - 
assert(dst_bits >= 16); - aco_ptr sdwa{ - create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; - sdwa->operands[0] = Operand(src); - sdwa->definitions[0] = Definition(tmp); - if (sign_extend) - sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword; - else - sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword; - sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword; - bld.insert(std::move(sdwa)); } else { assert(src_bits < 32); - assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7); - aco_opcode opcode = sign_extend ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32; - bld.vop3(opcode, Definition(tmp), src, Operand::zero(), - Operand::c32(src_bits == 8 ? 8u : 16u)); + bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits), + Operand::c32((unsigned)sign_extend)); } if (dst_bits == 64) { @@ -715,7 +699,7 @@ get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) return get_ssa_temp(ctx, src.src.ssa); Temp vec = get_ssa_temp(ctx, src.src.ssa); - unsigned elem_size = vec.bytes() / src.src.ssa->num_components; + unsigned elem_size = src.src.ssa->bit_size / 8u; bool identity_swizzle = true; for (unsigned i = 0; identity_swizzle && i < size; i++) { @@ -728,13 +712,16 @@ get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) assert(elem_size > 0); assert(vec.bytes() % elem_size == 0); - if (elem_size < 4 && vec.type() == RegType::sgpr) { + if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) { assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); - assert(size == 1); return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src, sgpr_extract_undef); } + bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr; + if (as_uniform) + vec = as_vgpr(ctx, vec); + RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4); if (size == 1) { @@ -752,7 +739,7 @@ get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) vec_instr->definitions[0] = Definition(dst); ctx->block->instructions.emplace_back(std::move(vec_instr)); ctx->allocated_vec.emplace(dst.id(), elems); - return dst; + return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst; } } @@ -832,7 +819,7 @@ emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Te } void -emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, +emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst, bool commutative, bool swap_srcs = false, bool flush_denorms = false, bool nuw = false, uint8_t uses_ub = 0) { @@ -851,28 +838,27 @@ emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Te } } - Operand op0(src0); - Operand op1(src1); + Operand op[2] = {Operand(src0), Operand(src1)}; for (int i = 0; i < 2; i++) { if (uses_ub & (1 << i)) { uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i); if (src_ub <= 0xffff) - bld.set16bit(i ? op1 : op0); + op[i].set16bit(true); else if (src_ub <= 0xffffff) - bld.set24bit(i ? 
op1 : op0); + op[i].set24bit(true); } } if (flush_denorms && ctx->program->chip_class < GFX9) { assert(dst.size() == 1); - Temp tmp = bld.vop2(op, bld.def(v1), op0, op1); + Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]); bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); } else { if (nuw) { - bld.nuw().vop2(op, Definition(dst), op0, op1); + bld.nuw().vop2(opc, Definition(dst), op[0], op[1]); } else { - bld.vop2(op, Definition(dst), op0, op1); + bld.vop2(opc, Definition(dst), op[0], op[1]); } } } @@ -959,6 +945,24 @@ emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, T return res; } +void +emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp) +{ + Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; + bool has_sgpr = false; + for (unsigned i = 0; i < 3; i++) { + src[i] = get_alu_src(ctx, instr->src[i]); + if (has_sgpr) + src[i] = as_vgpr(ctx, src[i]); + else + has_sgpr = src[i].type() == RegType::sgpr; + } + + Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp; +} + void emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) { @@ -1421,13 +1425,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } case nir_op_inot: { Temp src = get_alu_src(ctx, instr->src[0]); - if (instr->dest.dest.ssa.bit_size == 1) { - assert(src.regClass() == bld.lm); - assert(dst.regClass() == bld.lm); - /* Don't use s_andn2 here, this allows the optimizer to make a better decision */ - Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src); - bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm)); - } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { + if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); } else if (dst.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); @@ -1633,7 +1631,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, - false, 1); + false, 2); } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0])); @@ -1817,6 +1815,22 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } break; } + case nir_op_iadd_sat: { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); + if (dst.regClass() == v2b) { + Instruction* add_instr = + bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr; + add_instr->vop3().clamp = 1; + } else if (dst.regClass() == v1) { + Instruction* add_instr = + bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr; + add_instr->vop3().clamp = 1; + } else { + isel_err(&instr->instr, "Unimplemented NIR instr bit size"); + } + break; + } case nir_op_uadd_carry: { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); @@ -1951,29 +1965,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 
emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst); } else if (dst.type() == RegType::vgpr) { - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); - if (src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff && - (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9)) { - /* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16 - * but only on GFX8-9 because GFX10 doesn't zero the upper 16 - * bits. - */ - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true /* commutative */, - false, false, true /* nuw */); - } else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) { - /* Initialize the accumulator to 0 to allow further combinations - * in the optimizer. - */ - Operand op0(src0); - Operand op1(src1); - bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), - bld.set16bit(op1), Operand::zero()); - } else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true); + if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { + bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff; + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, + true /* commutative */, false, false, nuw_16bit); } else if (nir_src_is_const(instr->src[0].src)) { bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]), nir_src_as_uint(instr->src[0].src), false); @@ -2117,6 +2115,38 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } break; } + case nir_op_sdot_4x8_iadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false); + break; + } + case nir_op_sdot_4x8_iadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true); + break; + } + case nir_op_udot_4x8_uadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false); + break; + } + case nir_op_udot_4x8_uadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true); + break; + } + case nir_op_sdot_2x16_iadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false); + break; + } + case nir_op_sdot_2x16_iadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true); + break; + } + case nir_op_udot_2x16_uadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false); + break; + } + case nir_op_udot_2x16_uadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true); + break; + } case nir_op_cube_face_coord_amd: { Temp in = get_alu_src(ctx, instr->src[0], 3); Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), @@ -3008,7 +3038,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u), Operand::zero(), bld.scc(src)); } else if (dst.regClass() == v2) { - Temp one = bld.copy(bld.def(v2), Operand::c32(0x3FF00000u)); + Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); @@ -3156,6 +3186,7 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } break; } + case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, 
instr->src[0], 4)); break; case nir_op_pack_half_2x16_split: { if (dst.regClass() == v1) { nir_const_value* val = nir_src_as_const_value(instr->src[1].src); @@ -3509,6 +3540,14 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) case nir_op_fddy_fine: case nir_op_fddx_coarse: case nir_op_fddy_coarse: { + if (!nir_src_is_divergent(instr->src[0].src)) { + /* Source is the same in all lanes, so the derivative is zero. + * This also avoids emitting invalid IR. + */ + bld.copy(Definition(dst), Operand::zero()); + break; + } + Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); uint16_t dpp_ctrl1, dpp_ctrl2; if (instr->op == nir_op_fddx_fine) { @@ -3872,7 +3911,10 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, Operand load_lds_size_m0(Builder& bld) { - /* TODO: m0 does not need to be initialized on GFX9+ */ + /* m0 does not need to be initialized on GFX9+ */ + if (bld.program->chip_class >= GFX9) + return Operand(s1); + return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu))); } @@ -3938,6 +3980,9 @@ lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned instr = bld.ds(op, Definition(val), offset, m, const_offset); instr->ds().sync = info.sync; + if (m.isUndefined()) + instr->operands.pop_back(); + return val; } @@ -4371,6 +4416,9 @@ store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmas instr = bld.ds(op, address_offset, split_data, m, inline_offset); } instr->ds().sync = memory_sync_info(storage_shared); + + if (m.isUndefined()) + instr->operands.pop_back(); } } @@ -4749,8 +4797,12 @@ emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components) aco_ptr vec(create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); - for (unsigned i = 0; i < num_components; i++) - vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); + for (unsigned i = 0; i < num_components; i++) { + if (ctx->args->ac.frag_pos[i].used) + vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); + else + vec->operands[i] = Operand(v1); + } if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { assert(num_components == 4); vec->operands[3] = @@ -4838,7 +4890,7 @@ visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr) aco_ptr vec(create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1)); for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) { - Temp tmp = ctx->program->allocateTmp(v1); + Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? v2b : v1); emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask); vec->operands[i] = Operand(tmp); } @@ -4917,11 +4969,11 @@ get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, un /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. * so we may need to fix it up. */ Temp -adjust_vertex_fetch_alpha(isel_context* ctx, unsigned adjustment, Temp alpha) +adjust_vertex_fetch_alpha(isel_context* ctx, enum radv_vs_input_alpha_adjust adjustment, Temp alpha) { Builder bld(ctx->program, ctx->block); - if (adjustment == AC_FETCH_FORMAT_SSCALED) + if (adjustment == ALPHA_ADJUST_SSCALED) alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha); /* For the integer-like cases, do a natural sign extension. 
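 * (Editor's note on why the v_bfe_i32 below reads 2 bits at offset 23,
 *  inferred from the surviving comment text: for SNORM the hardware
 *  returns 0.0, 1/3, 2/3 or 1.0, i.e. 0x00000000, 0x3eaaaaab, 0x3f2aaaab
 *  or 0x3f800000, whose binary32 exponents 0, 125, 126 and 127 carry the
 *  raw 2-bit value in their low two bits, at bit offset 23. Sign-extending
 *  those bits yields 0, 1, -2 or -1, and the v_max_f32 with -1.0 afterwards
 *  clamps -2 to -1 as SNORM requires.)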
@@ -4930,15 +4982,15 @@ adjust_vertex_fetch_alpha(isel_context* ctx, unsigned adjustment, Temp alpha) * and happen to contain 0, 1, 2, 3 as the two LSBs of the * exponent. */ - unsigned offset = adjustment == AC_FETCH_FORMAT_SNORM ? 23u : 0u; + unsigned offset = adjustment == ALPHA_ADJUST_SNORM ? 23u : 0u; alpha = bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u)); /* Convert back to the right type. */ - if (adjustment == AC_FETCH_FORMAT_SNORM) { + if (adjustment == ALPHA_ADJUST_SNORM) { alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha); - } else if (adjustment == AC_FETCH_FORMAT_SSCALED) { + } else if (adjustment == ALPHA_ADJUST_SSCALED) { alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); } @@ -4952,7 +5004,36 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); nir_src offset = *nir_get_io_offset_src(instr); - if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { + if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->program->info->vs.dynamic_inputs) { + if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) + isel_err(offset.ssa->parent_instr, + "Unimplemented non-zero nir_intrinsic_load_input offset"); + + unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; + unsigned component = nir_intrinsic_component(instr); + unsigned bitsize = instr->dest.ssa.bit_size; + unsigned num_components = instr->dest.ssa.num_components; + + Temp input = get_arg(ctx, ctx->args->vs_inputs[location]); + + aco_ptr vec{create_instruction( + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + std::array elems; + for (unsigned i = 0; i < num_components; i++) { + elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? 
v2 : v1); + if (bitsize == 16) { + if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float) + elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]); + else + elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i], + Operand::c32(0u)); + } + vec->operands[i] = Operand(elems[i]); + } + vec->definitions[0] = Definition(dst); + ctx->block->instructions.emplace_back(std::move(vec)); + ctx->allocated_vec.emplace(dst.id(), elems); + } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) isel_err(offset.ssa->parent_instr, @@ -4969,7 +5050,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location]; unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding]; - enum ac_fetch_format alpha_adjust = ctx->options->key.vs.alpha_adjust[location]; + enum radv_vs_input_alpha_adjust alpha_adjust = + ctx->options->key.vs.vertex_alpha_adjust[location]; unsigned dfmt = attrib_format & 0xf; unsigned nfmt = (attrib_format >> 4) & 0x7; @@ -4977,7 +5059,7 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); - bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location); + bool post_shuffle = ctx->options->key.vs.vertex_post_shuffle & (1 << location); if (post_shuffle) num_channels = MAX2(num_channels, 3); @@ -5105,7 +5187,7 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) Temp fetch_dst; if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && - (alpha_adjust == AC_FETCH_FORMAT_NONE || num_channels <= 3)) { + (alpha_adjust == ALPHA_ADJUST_NONE || num_channels <= 3)) { direct_fetch = true; fetch_dst = dst; } else { @@ -5154,7 +5236,7 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) unsigned idx = i + component; if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) { Temp channel = channels[swizzle[idx]]; - if (idx == 3 && alpha_adjust != AC_FETCH_FORMAT_NONE) + if (idx == 3 && alpha_adjust != ALPHA_ADJUST_NONE) channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel); vec->operands[i] = Operand(channel); @@ -5202,16 +5284,23 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) } } - if (dst.size() == 1) { + if (instr->dest.ssa.num_components == 1 && + instr->dest.ssa.bit_size != 64) { bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id), bld.m0(prim_mask), idx, component); } else { + unsigned num_components = instr->dest.ssa.num_components; + if (instr->dest.ssa.bit_size == 64) + num_components *= 2; aco_ptr vec{create_instruction( - aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; - for (unsigned i = 0; i < dst.size(); i++) - vec->operands[i] = - bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(vertex_id), - bld.m0(prim_mask), idx, component + i); + aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + for (unsigned i = 0; i < num_components; i++) { + unsigned chan_component = (component + i) % 4; + unsigned chan_idx = idx + (component + i) / 4; + vec->operands[i] = bld.vintrp( + aco_opcode::v_interp_mov_f32, bld.def(instr->dest.ssa.bit_size == 16 ? 
v2b : v1), + Operand::c32(vertex_id), bld.m0(prim_mask), chan_idx, chan_component); + } vec->definitions[0] = Definition(dst); bld.insert(std::move(vec)); } @@ -5268,7 +5357,9 @@ visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr) Temp load_desc_ptr(isel_context* ctx, unsigned desc_set) { - if (ctx->program->info->need_indirect_descriptor_sets) { + const struct radv_userdata_locations *user_sgprs_locs = &ctx->program->info->user_sgprs_locs; + + if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) { Builder bld(ctx->program, ctx->block); Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0])); Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2)); @@ -5421,23 +5512,12 @@ void visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr) { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - Temp index = get_ssa_temp(ctx, instr->src[0].ssa); unsigned binding = nir_intrinsic_binding(instr); - unsigned base = nir_intrinsic_base(instr); - - index = as_vgpr(ctx, index); Builder bld(ctx->program, ctx->block); Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors)); Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u)); - Temp rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), desc_base, desc_off); - - /* If we want more we need to implement */ - assert(instr->dest.ssa.bit_size == 32); - assert(instr->num_components == 1); - - bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst), rsrc, index, Operand::zero(), base, - false, false, true); + bld.smem(aco_opcode::s_load_dwordx4, Definition(dst), desc_base, desc_off); } void @@ -5450,9 +5530,13 @@ visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr) nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]); if (index_cv && instr->dest.ssa.bit_size == 32) { + const struct radv_userdata_info *loc = + &ctx->program->info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS]; unsigned start = (offset + index_cv->u32) / 4u; - start -= ctx->args->ac.base_inline_push_consts; - if (start + count <= ctx->args->ac.num_inline_push_consts) { + unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0; + + start -= ctx->program->info->min_push_constant_used / 4; + if (start + count <= num_inline_push_consts) { std::array elems; aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; @@ -5954,79 +6038,6 @@ visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) mimg->r128 = true; } -/* Adjust the sample index according to FMASK. - * - * For uncompressed MSAA surfaces, FMASK should return 0x76543210, - * which is the identity mapping. Each nibble says which physical sample - * should be fetched to get that sample. - * - * For example, 0x11111100 means there are only 2 samples stored and - * the second sample covers 3/4 of the pixel. When reading samples 0 - * and 1, return physical sample 0 (determined by the first two 0s - * in FMASK), otherwise return physical sample 1. - * - * The sample index should be adjusted as follows: - * sample_index = (fmask >> (sample_index * 4)) & 0xF; - */ -static Temp -adjust_sample_index_using_fmask(isel_context* ctx, bool da, std::vector& coords, - Operand sample_index, Temp fmask_desc_ptr) -{ - Builder bld(ctx->program, ctx->block); - Temp fmask = bld.tmp(v1); - unsigned dim = ctx->options->chip_class >= GFX10 - ? 
ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da) - : 0; - - MIMG_instruction* load = emit_mimg(bld, aco_opcode::image_load, Definition(fmask), - fmask_desc_ptr, Operand(s4), coords); - load->glc = false; - load->dlc = false; - load->dmask = 0x1; - load->unrm = true; - load->da = da; - load->dim = dim; - - Operand sample_index4; - if (sample_index.isConstant()) { - if (sample_index.constantValue() < 16) { - sample_index4 = Operand::c32(sample_index.constantValue() << 2); - } else { - sample_index4 = Operand::zero(); - } - } else if (sample_index.regClass() == s1) { - sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, - Operand::c32(2u)); - } else { - assert(sample_index.regClass() == v1); - sample_index4 = - bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), sample_index); - } - - Temp final_sample; - if (sample_index4.isConstant() && sample_index4.constantValue() == 0) - final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(15u), fmask); - else if (sample_index4.isConstant() && sample_index4.constantValue() == 28) - final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(28u), fmask); - else - final_sample = - bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand::c32(4u)); - - /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK - * resource descriptor is 0 (invalid), - */ - Temp compare = bld.tmp(bld.lm); - bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand::zero(), - emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)) - .def(0) - .setHint(vcc); - - Temp sample_index_v = bld.copy(bld.def(v1), sample_index); - - /* Replace the MSAA sample index. */ - return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare); -} - static std::vector get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr) { @@ -6043,28 +6054,8 @@ get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr) std::vector coords(count); Builder bld(ctx->program, ctx->block); - if (is_ms) { - count--; - Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa); - /* get sample index */ - if (instr->intrinsic == nir_intrinsic_image_deref_load || - instr->intrinsic == nir_intrinsic_image_deref_sparse_load) { - nir_const_value* sample_cv = nir_src_as_const_value(instr->src[2]); - Operand sample_index = sample_cv ? Operand::c32(sample_cv->u32) - : Operand(emit_extract_vector(ctx, src2, 0, v1)); - std::vector fmask_load_address; - for (unsigned i = 0; i < (is_array ? 
3 : 2); i++) - fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1)); - - Temp fmask_desc_ptr = - get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - ACO_DESC_FMASK, nullptr, false); - coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, - sample_index, fmask_desc_ptr); - } else { - coords[count] = emit_extract_vector(ctx, src2, 0, v1); - } - } + if (is_ms) + coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1); if (gfx9_1d) { coords[0] = emit_extract_vector(ctx, src0, 0, v1); @@ -7062,7 +7053,7 @@ translate_nir_scope(nir_scope scope) case NIR_SCOPE_WORKGROUP: return scope_workgroup; case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily; case NIR_SCOPE_DEVICE: return scope_device; - case NIR_SCOPE_SHADER_CALL: unreachable("unsupported scope"); + case NIR_SCOPE_SHADER_CALL: return scope_invocation; } unreachable("invalid scope"); } @@ -7096,7 +7087,9 @@ emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr) unsigned nir_storage = nir_intrinsic_memory_modes(instr); if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global)) - storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image + storage |= storage_buffer; + if (nir_storage & nir_var_image) + storage |= storage_image; if (shared_storage_used && (nir_storage & nir_var_mem_shared)) storage |= storage_shared; @@ -7264,6 +7257,10 @@ visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr) if (return_previous) ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa)); ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw); + + if (m.isUndefined()) + ds->operands.pop_back(); + ctx->block->instructions.emplace_back(std::move(ds)); } @@ -7345,9 +7342,9 @@ visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr) { uint8_t log2_ps_iter_samples; if (ctx->program->info->ps.uses_sample_shading) { - log2_ps_iter_samples = util_logbase2(ctx->options->key.fs.num_samples); + log2_ps_iter_samples = util_logbase2(ctx->options->key.ps.num_samples); } else { - log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples; + log2_ps_iter_samples = ctx->options->key.ps.log2_ps_iter_samples; } Builder bld(ctx->program, ctx->block); @@ -7854,12 +7851,11 @@ emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned } void -emit_interp_center(isel_context* ctx, Temp dst, Temp pos1, Temp pos2) +emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2) { Builder bld(ctx->program, ctx->block); - Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center); - Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1); - Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1); + Temp p1 = emit_extract_vector(ctx, bary, 0, v1); + Temp p2 = emit_extract_vector(ctx, bary, 1, v1); Temp ddx_1, ddx_2, ddy_1, ddy_2; uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0); @@ -7906,6 +7902,23 @@ Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i); void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt); static void create_vs_exports(isel_context* ctx); +Temp +get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, + enum glsl_interp_mode interp) +{ + bool linear = interp == INTERP_MODE_NOPERSPECTIVE; + if (intrin == nir_intrinsic_load_barycentric_pixel || + intrin == nir_intrinsic_load_barycentric_at_sample || + intrin == nir_intrinsic_load_barycentric_at_offset) { + return get_arg(ctx, linear ? 
ctx->args->ac.linear_center : ctx->args->ac.persp_center); + } else if (intrin == nir_intrinsic_load_barycentric_centroid) { + return linear ? ctx->linear_centroid : ctx->persp_centroid; + } else { + assert(intrin == nir_intrinsic_load_barycentric_sample); + return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample); + } +} + void visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) { @@ -7915,27 +7928,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: { glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr); - Temp bary = Temp(0, s2); - switch (mode) { - case INTERP_MODE_SMOOTH: - case INTERP_MODE_NONE: - if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel) - bary = get_arg(ctx, ctx->args->ac.persp_center); - else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) - bary = ctx->persp_centroid; - else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) - bary = get_arg(ctx, ctx->args->ac.persp_sample); - break; - case INTERP_MODE_NOPERSPECTIVE: - if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel) - bary = get_arg(ctx, ctx->args->ac.linear_center); - else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) - bary = ctx->linear_centroid; - else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) - bary = get_arg(ctx, ctx->args->ac.linear_sample); - break; - default: break; - } + Temp bary = get_interp_param(ctx, instr->intrinsic, mode); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); Temp p1 = emit_extract_vector(ctx, bary, 0, v1); Temp p2 = emit_extract_vector(ctx, bary, 1, v1); @@ -7957,7 +7950,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) } case nir_intrinsic_load_barycentric_at_sample: { uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16; - switch (ctx->options->key.fs.num_samples) { + switch (ctx->options->key.ps.num_samples) { case 2: sample_pos_offset += 1 << 3; break; case 4: sample_pos_offset += 3 << 3; break; case 8: sample_pos_offset += 7 << 3; break; @@ -8047,7 +8040,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u)); pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u)); - emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2); + Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); + emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2); break; } case nir_intrinsic_load_barycentric_at_offset: { @@ -8055,7 +8049,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) RegClass rc = RegClass(offset.type(), 1); Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc); bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset); - emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2); + Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); + emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2); break; } case nir_intrinsic_load_front_face: { @@ -8066,18 +8061,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) break; } case nir_intrinsic_load_view_index: { - if (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::GS) || - ctx->stage.has(SWStage::TCS) || 
ctx->stage.has(SWStage::TES)) { - Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); - break; - } - FALLTHROUGH; - } - case nir_intrinsic_load_layer_id: { - unsigned idx = nir_intrinsic_base(instr); - bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - Operand::c32(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); break; } case nir_intrinsic_load_frag_coord: { @@ -8144,6 +8129,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break; case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break; case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break; + case nir_intrinsic_load_global_constant: case nir_intrinsic_load_global: visit_load_global(ctx, instr); break; case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break; case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break; @@ -8182,6 +8168,12 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) emit_split_vector(ctx, dst, 3); break; } + case nir_intrinsic_load_ray_launch_size: { + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.ray_launch_size))); + emit_split_vector(ctx, dst, 3); + break; + } case nir_intrinsic_load_local_invocation_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids))); @@ -8190,7 +8182,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) } case nir_intrinsic_load_workgroup_id: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - struct ac_arg* args = ctx->args->ac.workgroup_ids; + const struct ac_arg* args = ctx->args->ac.workgroup_ids; bld.pseudo(aco_opcode::p_create_vector, Definition(dst), args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(), args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(), @@ -8849,7 +8841,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.input_vertices)); + bld.copy(Definition(dst), Operand::c32(ctx->options->key.tcs.tess_input_vertices)); break; } case nir_intrinsic_emit_vertex_with_counter: { @@ -8906,7 +8898,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) break; } case nir_intrinsic_load_gs_vertex_offset_amd: { + /* GFX6-8 uses 6 separate args, while GFX9+ packs these into only 3 args. */ unsigned b = nir_intrinsic_base(instr); + assert(b <= (ctx->program->chip_class >= GFX9 ? 2 : 5)); bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_arg(ctx, ctx->args->ac.gs_vtx_offset[b])); break; @@ -8933,11 +8927,13 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id); /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */ - Temp flags = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id); + Temp flags = + bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id); /* Move the bits to their desired position: 8->9, 9->19, 10->29. 
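 * (Worked example added by the editor: 0x80402 = (1<<1)|(1<<10)|(1<<19),
 *  so the v_mul_u32_u24 below computes flags<<1 + flags<<10 + flags<<19.
 *  Bit 8 lands on bit 9, bit 9 on bit 19 and bit 10 on bit 29; the other
 *  partial products (bits 10, 11, 18, 20, 27, 28) are stripped by the
 *  0x20080200 = (1<<9)|(1<<19)|(1<<29) mask applied right after.)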
*/ flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags); /* Remove garbage bits that are a byproduct of the multiplication. */ - bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x20080200), flags); + bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), + Operand::c32(0x20080200), flags); break; } case nir_intrinsic_load_packed_passthrough_primitive_amd: { @@ -9012,7 +9008,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_load_cull_any_enabled_amd: { Builder::Result cull_any_enabled = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0x00ffffffu)); + get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0xbu)); cull_any_enabled.instr->definitions[1].setNoCSE(true); bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp())); @@ -9021,7 +9017,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_load_cull_small_prim_precision_amd: { /* Exponent is 8-bit signed int, move that into a signed 32-bit int. */ Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), - get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(24u)); + get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u)); /* small_prim_precision = 1.0 * 2^X */ bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x3f800000u), Operand(exponent)); @@ -9070,7 +9066,7 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) void tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr, - Temp* fmask_ptr, enum glsl_base_type* stype) + enum glsl_base_type* stype) { nir_deref_instr* texture_deref_instr = NULL; nir_deref_instr* sampler_deref_instr = NULL; @@ -9095,13 +9091,12 @@ tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* sam sampler_deref_instr = texture_deref_instr; if (plane >= 0) { - assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical); assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false); } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false); - } else if (instr->op == nir_texop_fragment_mask_fetch) { + } else if (instr->op == nir_texop_fragment_mask_fetch_amd) { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false); } else { *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false); @@ -9130,8 +9125,6 @@ tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* sam samp[3]); } } - if (fmask_ptr && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_samples_identical)) - *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false); } void @@ -9270,19 +9263,19 @@ get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4]) void visit_tex(isel_context* ctx, nir_tex_instr* instr) { + assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical); + Builder bld(ctx->program, ctx->block); bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, has_offset = false, has_ddx = false, has_ddy = false, has_derivs = 
false, has_sample_index = false, has_clamped_lod = false; - Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), - lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), - clamped_lod = Temp(); + Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(), + offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(); std::vector coords; std::vector derivs; - nir_const_value* sample_index_cv = NULL; nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL}; enum glsl_base_type stype; - tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype); + tex_fetch_ptrs(ctx, instr, &resource, &sampler, &stype); bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 && (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT); @@ -9335,7 +9328,6 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) break; case nir_tex_src_ms_index: sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa); - sample_index_cv = nir_src_as_const_value(instr->src[i].src); has_sample_index = true; break; case nir_tex_src_texture_offset: @@ -9352,7 +9344,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) return; } - if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) { + if (has_offset && instr->op != nir_texop_txf) { aco_ptr tmp_instr; Temp acc, pack = Temp(); @@ -9444,8 +9436,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms && - instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch) + instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_fragment_fetch_amd && + instr->op != nir_texop_fragment_mask_fetch_amd) coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]); if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D && @@ -9459,21 +9451,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); - if (instr->op == nir_texop_samples_identical) - resource = fmask_ptr; - - else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS || - instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && - instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) { - assert(has_sample_index); - Operand op(sample_index); - if (sample_index_cv) - op = Operand::c32(sample_index_cv->u32); - sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr); - } - - if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) { + if (has_offset && instr->op == nir_texop_txf) { for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) { Temp off = emit_extract_vector(ctx, offset, i, v1); coords[i] = bld.vadd32(bld.def(v1), coords[i], off); @@ -9503,7 +9481,7 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) dmask = 1 << instr->component; if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr) tmp_dst = bld.tmp(instr->is_sparse ? 
v5 : v4); - } else if (instr->op == nir_texop_samples_identical) { + } else if (instr->op == nir_texop_fragment_mask_fetch_amd) { tmp_dst = bld.tmp(v1); } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) { @@ -9682,9 +9660,8 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) if (has_clamped_lod) args.emplace_back(clamped_lod); - if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms || - instr->op == nir_texop_samples_identical || instr->op == nir_texop_fragment_fetch || - instr->op == nir_texop_fragment_mask_fetch) { + if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd || + instr->op == nir_texop_fragment_mask_fetch_amd) { aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load @@ -9692,19 +9669,35 @@ visit_tex(isel_context* ctx, nir_tex_instr* instr) Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); MIMG_instruction* tex = emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata); - tex->dim = dim; + if (instr->op == nir_texop_fragment_mask_fetch_amd) + tex->dim = da ? ac_image_2darray : ac_image_2d; + else + tex->dim = dim; tex->dmask = dmask & 0xf; tex->unrm = true; tex->da = da; tex->tfe = instr->is_sparse; - if (instr->op == nir_texop_samples_identical) { - assert(dmask == 1 && dst.regClass() == bld.lm); + if (instr->op == nir_texop_fragment_mask_fetch_amd) { + /* Use 0x76543210 if the image doesn't have FMASK. */ + assert(dmask == 1 && dst.bytes() == 4); assert(dst.id() != tmp_dst.id()); - bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(dst), Operand::zero(), tmp_dst) - .def(0) - .setHint(vcc); + if (dst.regClass() == s1) { + Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(), + emit_extract_vector(ctx, resource, 1, s1)); + bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), + bld.as_uniform(tmp_dst), Operand::c32(0x76543210), + bld.scc(is_not_null)); + } else { + Temp is_not_null = bld.tmp(bld.lm); + bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(), + emit_extract_vector(ctx, resource, 1, s1)) + .def(0) + .setHint(vcc); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), + bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null); + } } else { expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); } @@ -10795,9 +10788,9 @@ create_vs_exports(isel_context* ctx) { assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); - radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) - ? &ctx->program->info->tes.outinfo - : &ctx->program->info->vs.outinfo; + const radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) + ? 
&ctx->program->info->tes.outinfo + : &ctx->program->info->vs.outinfo; ctx->block->kind |= block_kind_export_end; @@ -10935,10 +10928,10 @@ export_fs_mrt_color(isel_context* ctx, int slot) slot -= FRAG_RESULT_DATA0; target = V_008DFC_SQ_EXP_MRT + slot; - col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf; + col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf; - bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1; - bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1; + bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1; + bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1; bool is_16bit = values[0].regClass() == v2b; /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ @@ -11150,27 +11143,12 @@ static void emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset, const struct radv_stream_output* output) { - unsigned num_comps = util_bitcount(output->component_mask); - unsigned writemask = (1 << num_comps) - 1; + assert(ctx->stage.hw == HWStage::VS); + unsigned loc = output->location; unsigned buf = output->buffer; - assert(num_comps && num_comps <= 4); - if (!num_comps || num_comps > 4) - return; - - unsigned first_comp = ffs(output->component_mask) - 1; - - Temp out[4]; - bool all_undef = true; - assert(ctx->stage.hw == HWStage::VS); - for (unsigned i = 0; i < num_comps; i++) { - out[i] = ctx->outputs.temps[loc * 4 + first_comp + i]; - all_undef = all_undef && !out[i].id(); - } - if (all_undef) - return; - + unsigned writemask = output->component_mask & ctx->outputs.mask[loc]; while (writemask) { int start, count; u_bit_scan_consecutive_range(&writemask, &start, &count); @@ -11180,26 +11158,17 @@ emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_wri count = 2; } - unsigned offset = output->offset + start * 4; + unsigned offset = output->offset + (start - (ffs(output->component_mask) - 1)) * 4; Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count)); aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; for (int i = 0; i < count; ++i) - vec->operands[i] = - (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand::zero(); + vec->operands[i] = Operand(ctx->outputs.temps[loc * 4 + start + i]); vec->definitions[0] = Definition(write_data); ctx->block->instructions.emplace_back(std::move(vec)); - aco_opcode opcode; - switch (count) { - case 1: opcode = aco_opcode::buffer_store_dword; break; - case 2: opcode = aco_opcode::buffer_store_dwordx2; break; - case 3: opcode = aco_opcode::buffer_store_dwordx3; break; - case 4: opcode = aco_opcode::buffer_store_dwordx4; break; - default: unreachable("Unsupported dword count."); - } - + aco_opcode opcode = get_buffer_store_op(count * 4); aco_ptr store{ create_instruction(opcode, Format::MUBUF, 4, 0)}; store->operands[0] = Operand(so_buffers[buf]); @@ -11209,7 +11178,7 @@ emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_wri if (offset > 4095) { /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. 
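 * (Editor's note: the MUBUF immediate offset field is a 12-bit unsigned
 *  value, hence the 4095 limit; anything larger is added into the VGPR
 *  address instead. The hunk below also moves that add to operands[1],
 *  the vaddr slot, rather than operands[0], which holds the buffer
 *  resource.)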
*/ Builder bld(ctx->program, ctx->block); - store->operands[0] = + store->operands[1] = bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf])); } else { store->offset = offset; @@ -11227,17 +11196,6 @@ emit_streamout(isel_context* ctx, unsigned stream) { Builder bld(ctx->program, ctx->block); - Temp so_buffers[4]; - Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers)); - for (unsigned i = 0; i < 4; i++) { - unsigned stride = ctx->program->info->so.strides[i]; - if (!stride) - continue; - - Operand off = bld.copy(bld.def(s1), Operand::c32(i * 16u)); - so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off); - } - Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u)); @@ -11254,13 +11212,18 @@ emit_streamout(isel_context* ctx, unsigned stream) Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid); + Temp so_buffers[4]; Temp so_write_offset[4]; + Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers)); for (unsigned i = 0; i < 4; i++) { unsigned stride = ctx->program->info->so.strides[i]; if (!stride) continue; + so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, + bld.copy(bld.def(s1), Operand::c32(i * 16u))); + if (stride == 1) { Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->ac.streamout_write_index), @@ -11278,7 +11241,7 @@ emit_streamout(isel_context* ctx, unsigned stream) } for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) { - struct radv_stream_output* output = &ctx->program->info->so.outputs[i]; + const struct radv_stream_output* output = &ctx->program->info->so.outputs[i]; if (stream != output->stream) continue; @@ -11292,37 +11255,8 @@ emit_streamout(isel_context* ctx, unsigned stream) Pseudo_instruction* add_startpgm(struct isel_context* ctx) { - unsigned arg_count = ctx->args->ac.arg_count; - if (ctx->stage == fragment_fs) { - /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr - * itself and then communicates the results back via the ELF binary. - * Mirror what LLVM does by re-mapping the VGPR arguments here. - * - * TODO: If we made the FS input scanning code into a separate pass that - * could run before argument setup, then this wouldn't be necessary - * anymore. 
- */ - struct ac_shader_args* args = &ctx->args->ac; - arg_count = 0; - for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->arg_count; i++) { - if (args->args[i].file != AC_ARG_VGPR) { - arg_count++; - continue; - } - - if (!(ctx->program->config->spi_ps_input_addr & (1 << vgpr_arg))) { - args->args[i].skip = true; - } else { - args->args[i].offset = vgpr_reg; - vgpr_reg += args->args[i].size; - arg_count++; - } - vgpr_arg++; - } - } - aco_ptr startpgm{ - create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, arg_count)}; + create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, ctx->args->ac.arg_count)}; for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) { if (ctx->args->ac.args[i].skip) continue; @@ -11346,6 +11280,18 @@ add_startpgm(struct isel_context* ctx) ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset); + if (ctx->stage.has(SWStage::VS) && ctx->program->info->vs.dynamic_inputs) { + unsigned num_attributes = util_last_bit(ctx->program->info->vs.vb_desc_usage_mask); + for (unsigned i = 0; i < num_attributes; i++) { + Definition def(get_arg(ctx, ctx->args->vs_inputs[i])); + + unsigned idx = ctx->args->vs_inputs[i].arg_index; + def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset)); + + ctx->program->vs_inputs.push_back(def); + } + } + return instr; } @@ -11399,15 +11345,19 @@ handle_bc_optimize(isel_context* ctx) uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); - bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || - G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); - ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); - ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); - if (uses_center && uses_centroid) { + bool uses_persp_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena); + bool uses_linear_centroid = G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); + + if (uses_persp_centroid) + ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); + if (uses_linear_centroid) + ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); + + if (uses_center && (uses_persp_centroid || uses_linear_centroid)) { Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero()); - if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { + if (uses_persp_centroid) { Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { Temp persp_centroid = @@ -11423,7 +11373,7 @@ handle_bc_optimize(isel_context* ctx) emit_split_vector(ctx, ctx->persp_centroid, 2); } - if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) { + if (uses_linear_centroid) { Temp new_coord[2]; for (unsigned i = 0; i < 2; i++) { Temp linear_centroid = @@ -11618,9 +11568,11 @@ ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt) void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, - ac_shader_config* config, struct radv_shader_args* args) + ac_shader_config* config, const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args) { - isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false); + isel_context ctx = setup_isel_context(program, shader_count, shaders, config, options, info, 
args, false); if_context ic_merged_wave_info; bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS); @@ -11635,12 +11587,13 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const Pseudo_instruction* startpgm = add_startpgm(&ctx); append_logical_start(ctx.block); - if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs)) + if (unlikely(ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs)) fix_ls_vgpr_init_bug(&ctx, startpgm); split_arguments(&ctx, startpgm); - if (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES)) { + if (!info->vs.has_prolog && + (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) { Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u); } } @@ -11736,9 +11689,11 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, - struct radv_shader_args* args) + const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args) { - isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true); + isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, options, info, args, true); ctx.block->fp_mode = program->next_fp_mode; @@ -11751,21 +11706,21 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_ program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u)); Operand stream_id = Operand::zero(); - if (args->shader_info->so.num_outputs) + if (program->info->so.num_outputs) stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u)); Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), get_arg(&ctx, ctx.args->ac.vertex_id)); - std::stack<if_context> if_contexts; + std::stack<if_context, std::vector<if_context>> if_contexts; for (unsigned stream = 0; stream < 4; stream++) { if (stream_id.isConstant() && stream != stream_id.constantValue()) continue; - unsigned num_components = args->shader_info->gs.num_stream_output_components[stream]; - if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs)) + unsigned num_components = program->info->gs.num_stream_output_components[stream]; + if (stream > 0 && (!num_components || !program->info->so.num_outputs)) continue; memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask)); @@ -11780,17 +11735,17 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_ unsigned offset = 0; for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { - if (args->shader_info->gs.output_streams[i] != stream) + if (program->info->gs.output_streams[i] != stream) continue; - unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i]; + unsigned output_usage_mask = program->info->gs.output_usage_mask[i]; unsigned length = util_last_bit(output_usage_mask); for (unsigned j = 0; j < length; ++j) { if (!(output_usage_mask & (1 << j))) continue; Temp val = bld.tmp(v1); - unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4; + unsigned const_offset = offset * program->info->gs.vertices_out * 16 * 4; load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true, true, true); @@ -11801,7 +11756,7 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_ } } - if
(args->shader_info->so.num_outputs) { + if (program->info->so.num_outputs) { emit_streamout(&ctx, stream); bld.reset(ctx.block); } @@ -11833,17 +11788,19 @@ select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_ void select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config, - struct radv_shader_args* args) + const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args) { - assert(args->options->chip_class == GFX8); + assert(options->chip_class == GFX8); - init_program(program, compute_cs, args->shader_info, args->options->chip_class, - args->options->family, args->options->wgp_mode, config); + init_program(program, compute_cs, info, options->chip_class, + options->family, options->wgp_mode, config); isel_context ctx = {}; ctx.program = program; ctx.args = args; - ctx.options = args->options; + ctx.options = options; ctx.stage = program->stage; ctx.block = ctx.program->create_and_insert_block(); @@ -11889,4 +11846,327 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade cleanup_cfg(program); } + +Operand +get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg) +{ + assert(arg.used); + + enum ac_arg_regfile file = args->ac.args[arg.arg_index].file; + unsigned size = args->ac.args[arg.arg_index].size; + unsigned reg = args->ac.args[arg.arg_index].offset; + + return Operand(PhysReg(file == AC_ARG_SGPR ? reg : reg + 256), + RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size)); +} + +unsigned +load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max) +{ + unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max); + + unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3); + if (bld.program->chip_class >= GFX10 && num_loads > 1) + bld.sopp(aco_opcode::s_clause, -1, num_loads - 1); + + for (unsigned i = 0; i < count;) { + unsigned size = 1u << util_logbase2(MIN2(count - i, 4)); + + if (size == 4) + bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base, + Operand::c32((start + i) * 16u)); + else if (size == 2) + bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base, + Operand::c32((start + i) * 16u)); + else + bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base, + Operand::c32((start + i) * 16u)); + + dest = dest.advance(size * 16u); + i += size; + } + + return count; +} + +Operand +calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index, + Operand instance_id, Operand start_instance, PhysReg tmp_sgpr, + PhysReg tmp_vgpr0, PhysReg tmp_vgpr1) +{ + bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2), + get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u)); + + wait_imm lgkm_imm; + lgkm_imm.lgkm = 0; + bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->chip_class)); + + Definition fetch_index_def(tmp_vgpr0, v1); + Operand fetch_index(tmp_vgpr0, v1); + + Operand div_info(tmp_sgpr, s1); + if (bld.program->chip_class >= GFX8) { + /* use SDWA */ + if (bld.program->chip_class < GFX9) { + bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info); + div_info = Operand(tmp_vgpr1, v1); + } + + bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id).instr; + + Instruction* instr; + if (bld.program->chip_class >= GFX9) + instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr; + else + instr = 
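load_vb_descs above packs up to max 16-byte vertex-buffer descriptors into
consecutive SGPRs, issuing the largest power-of-two s_load_dwordxN each round
and prefixing an s_clause on GFX10+ so the SMEM loads issue back to back. A
scalar model of the splitting (a sketch; util_logbase2 and MIN2 are Mesa's
util helpers):

   unsigned count_smem_loads(unsigned num_descs)
   {
      unsigned loads = 0;
      for (unsigned i = 0; i < num_descs;) {
         // 4, 2 or 1 descriptors per load (s_load_dwordx16 / x8 / x4)
         unsigned size = 1u << util_logbase2(MIN2(num_descs - i, 4));
         i += size;
         loads++;
      }
      return loads; // equals (count / 4) + popcount(count & 3), as used above
   }
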
bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm), + div_info, fetch_index) + .instr; + instr->sdwa().sel[0] = SubdwordSel::ubyte1; + + bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1), + fetch_index); + + instr = + bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr; + instr->sdwa().sel[0] = SubdwordSel::ubyte2; + } else { + Operand tmp_op(tmp_vgpr1, v1); + Definition tmp_def(tmp_vgpr1, v1); + + bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id); + + bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u)); + bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true); + + bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index, + Operand(tmp_sgpr.advance(4), s1)); + + bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u)); + bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index); + } + + bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true); + + return fetch_index; +} + +void +select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, ac_shader_config* config, + const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args, unsigned* num_preserved_sgprs) +{ + assert(key->num_attributes > 0); + + /* This should be enough for any shader/stage. */ + unsigned max_user_sgprs = options->chip_class >= GFX9 ? 32 : 16; + *num_preserved_sgprs = max_user_sgprs + 14; + + init_program(program, compute_cs, info, options->chip_class, + options->family, options->wgp_mode, config); + + Block* block = program->create_and_insert_block(); + block->kind = block_kind_top_level; + + program->workgroup_size = 64; + calc_min_waves(program); + + Builder bld(program, block); + + block->instructions.reserve(16 + key->num_attributes * 4); + + bld.sopp(aco_opcode::s_setprio, -1u, 0x3u); + + uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes); + bool has_nontrivial_divisors = key->state->nontrivial_divisors & attrib_mask; + + wait_imm lgkm_imm; + lgkm_imm.lgkm = 0; + + /* choose sgprs */ + PhysReg vertex_buffers(align(*num_preserved_sgprs, 2)); + PhysReg prolog_input = vertex_buffers.advance(8); + PhysReg desc( + align((has_nontrivial_divisors ? 
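The byte-wise SDWA selects above implement division by a constant divisor via
a multiply-high sequence. A scalar model of what the emitted instructions
compute; the packing of div_info (three control bytes, with a 32-bit magic
multiplier in the following dword) is an inference from the loads, not a
documented layout:

   uint32_t nontrivial_instance_index(uint32_t instance_id, uint32_t start_instance,
                                      uint32_t div_info, uint32_t magic)
   {
      uint32_t pre_shift  = (div_info >> 0) & 0xff;  // hw shifts only honor 5 bits
      uint32_t increment  = (div_info >> 8) & 0xff;
      uint32_t post_shift = (div_info >> 16) & 0xff;

      uint32_t idx = instance_id >> pre_shift;
      idx += increment;
      idx = (uint32_t)(((uint64_t)idx * magic) >> 32); // v_mul_hi_u32
      idx >>= post_shift;
      return idx + start_instance; // ~ instance_id / divisor + start_instance
   }
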
prolog_input : vertex_buffers).advance(8).reg(), 4)); + + Operand start_instance = get_arg_fixed(args, args->ac.start_instance); + Operand instance_id = get_arg_fixed(args, args->ac.instance_id); + + PhysReg attributes_start(256 + args->ac.num_vgprs_used); + /* choose vgprs that won't be used for anything else until the last attribute load */ + PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1); + PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2); + PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3); + PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4); + PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4); + + bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1), + get_arg_fixed(args, args->ac.vertex_buffers)); + bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1), + Operand::c32((unsigned)options->address32_hi)); + + /* calculate vgpr requirements */ + unsigned num_vgprs = attributes_start.reg() - 256; + num_vgprs += key->num_attributes * 4; + if (has_nontrivial_divisors && program->chip_class <= GFX8) + num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */ + unsigned num_sgprs = 0; + + for (unsigned loc = 0; loc < key->num_attributes;) { + unsigned num_descs = + load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc); + num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg()); + + if (loc == 0) { + /* perform setup while we load the descriptors */ + if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) { + Operand count = get_arg_fixed(args, args->ac.merged_wave_info); + bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u)); + if (program->wave_size == 64) { + bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count, + Operand::c32(6u /* log2(64) */)); + bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX), + Operand(exec, s2), Operand(scc, s1)); + } + } + + bool needs_instance_index = false; + bool needs_start_instance = false; + u_foreach_bit(i, key->state->instance_rate_inputs & attrib_mask) + { + needs_instance_index |= key->state->divisors[i] == 1; + needs_start_instance |= key->state->divisors[i] == 0; + } + bool needs_vertex_index = ~key->state->instance_rate_inputs & attrib_mask; + if (needs_vertex_index) + bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex), + get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true); + if (needs_instance_index) + bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false, + Operand(s2), true); + if (needs_start_instance) + bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance); + } + + bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class)); + + for (unsigned i = 0; i < num_descs; i++, loc++) { + PhysReg dest(attributes_start.reg() + loc * 4u); + + /* calculate index */ + Operand fetch_index = Operand(vertex_index, v1); + if (key->state->instance_rate_inputs & (1u << loc)) { + uint32_t divisor = key->state->divisors[loc]; + if (divisor) { + fetch_index = instance_id; + if (key->state->nontrivial_divisors & (1u << loc)) { + unsigned index = + util_bitcount(key->state->nontrivial_divisors & BITFIELD_MASK(loc)); + fetch_index = calc_nontrivial_instance_id( + bld, args, index, instance_id, start_instance, prolog_input, + nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1); + } 
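Around this point the prolog picks one of four index sources per attribute. A
restatement of that selection as a single function (illustrative only; the
names mirror the surrounding code):

   Operand choose_fetch_index(const struct radv_vs_prolog_key* key, unsigned loc,
                              Operand vertex_index, Operand instance_index,
                              Operand start_instance_vgpr, Operand nontrivial_index)
   {
      if (!(key->state->instance_rate_inputs & (1u << loc)))
         return vertex_index;            // per-vertex: base_vertex + vertex_id
      uint32_t divisor = key->state->divisors[loc];
      if (divisor == 0)
         return start_instance_vgpr;     // every instance reads element 0
      if (key->state->nontrivial_divisors & (1u << loc))
         return nontrivial_index;        // instance_id / divisor, computed above
      return instance_index;             // divisor == 1: start_instance + instance_id
   }
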
else { + fetch_index = Operand(instance_index, v1); + } + } else { + fetch_index = Operand(start_instance_vgpr, v1); + } + } + + /* perform load */ + PhysReg cur_desc = desc.advance(i * 16); + if ((key->misaligned_mask & (1u << loc))) { + unsigned dfmt = key->state->formats[loc] & 0xf; + unsigned nfmt = key->state->formats[loc] >> 4; + const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt); + for (unsigned j = 0; j < vtx_info->num_channels; j++) { + bool post_shuffle = key->state->post_shuffle & (1u << loc); + unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j); + + /* Use MUBUF to workaround hangs for byte-aligned dword loads. The Vulkan spec + * doesn't require this to work, but some GL CTS tests over Zink do this anyway. + * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't + * care). + */ + if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32) + bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1), + Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false, + false, true); + else + bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1), + Operand(cur_desc, s4), fetch_index, Operand::c32(0u), + vtx_info->chan_format, nfmt, offset, false, true); + } + uint32_t one = + nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT + ? 1u + : 0x3f800000u; + for (unsigned j = vtx_info->num_channels; j < 4; j++) { + bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1), + Operand::c32(j == 3 ? one : 0u)); + } + } else { + bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4), + Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true); + } + } + } + + if (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi) { + wait_imm vm_imm; + vm_imm.vm = 0; + bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->chip_class)); + } + + /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. + * so we may need to fix it up. */ + u_foreach_bit(loc, (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi)) + { + PhysReg alpha(attributes_start.reg() + loc * 4u + 3); + + unsigned alpha_adjust = (key->state->alpha_adjust_lo >> loc) & 0x1; + alpha_adjust |= ((key->state->alpha_adjust_hi >> loc) & 0x1) << 1; + + if (alpha_adjust == ALPHA_ADJUST_SSCALED) + bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1)); + + /* For the integer-like cases, do a natural sign extension. + * + * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 + * and happen to contain 0, 1, 2, 3 as the two LSBs of the + * exponent. + */ + unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u; + bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1), + Operand::c32(offset), Operand::c32(2u)); + + /* Convert back to the right type. 
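A worked example of the exponent trick, assuming IEEE binary32: the unsigned
pre-Vega conversion of the 2-bit alpha yields
   raw 0 -> 0.0   = 0x00000000, exponent 0x00, low two bits 00
   raw 1 -> 1/3   = 0x3eaaaaab, exponent 0x7d, low two bits 01
   raw 2 -> 2/3   = 0x3f2aaaab, exponent 0x7e, low two bits 10
   raw 3 -> 1.0   = 0x3f800000, exponent 0x7f, low two bits 11
so v_bfe_i32(val, 23, 2) recovers the signed source value (0, 1, -2, -1);
v_cvt_f32_i32 turns it back into a float and the v_max_f32 with -1.0
(0xbf800000) clamps -2 to -1, as SNORM requires.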
*/ + if (alpha_adjust == ALPHA_ADJUST_SNORM) { + bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1)); + bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u), + Operand(alpha, v1)); + } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) { + bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1)); + } + } + + block->kind |= block_kind_uniform; + + /* continue on to the main shader */ + Operand continue_pc = get_arg_fixed(args, args->prolog_inputs); + if (has_nontrivial_divisors) { + bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2), + get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u)); + bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class)); + continue_pc = Operand(prolog_input, s2); + } + + bld.sop1(aco_opcode::s_setpc_b64, continue_pc); + + program->config->float_mode = program->blocks[0].fp_mode.val; + /* addition on GFX6-8 requires a carry-out (we use VCC) */ + program->needs_vcc = program->chip_class <= GFX8; + program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs); + program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs); +} } // namespace aco diff --git a/mesa 3D driver/src/amd/compiler/aco_instruction_selection.h b/mesa 3D driver/src/amd/compiler/aco_instruction_selection.h index 43dd76a31a..65f74b3260 100644 --- a/mesa 3D driver/src/amd/compiler/aco_instruction_selection.h +++ b/mesa 3D driver/src/amd/compiler/aco_instruction_selection.h @@ -48,7 +48,7 @@ struct shader_io_state { struct isel_context { const struct radv_nir_compiler_options* options; - struct radv_shader_args* args; + const struct radv_shader_args* args; Program* program; nir_shader* shader; uint32_t constant_data_offset; @@ -116,7 +116,9 @@ void cleanup_context(isel_context* ctx); isel_context setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders, ac_shader_config* config, - struct radv_shader_args* args, bool is_gs_copy_shader); + const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args, bool is_gs_copy_shader); } // namespace aco diff --git a/mesa 3D driver/src/amd/compiler/aco_instruction_selection_setup.cpp b/mesa 3D driver/src/amd/compiler/aco_instruction_selection_setup.cpp index f86a5a5a33..8ad3a515b2 100644 --- a/mesa 3D driver/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -36,34 +36,6 @@ namespace aco { namespace { -unsigned -get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) -{ - switch (interp) { - case INTERP_MODE_SMOOTH: - case INTERP_MODE_NONE: - if (intrin == nir_intrinsic_load_barycentric_pixel || - intrin == nir_intrinsic_load_barycentric_at_sample || - intrin == nir_intrinsic_load_barycentric_at_offset) - return S_0286CC_PERSP_CENTER_ENA(1); - else if (intrin == nir_intrinsic_load_barycentric_centroid) - return S_0286CC_PERSP_CENTROID_ENA(1); - else if (intrin == nir_intrinsic_load_barycentric_sample) - return S_0286CC_PERSP_SAMPLE_ENA(1); - break; - case INTERP_MODE_NOPERSPECTIVE: - if (intrin == nir_intrinsic_load_barycentric_pixel) - return S_0286CC_LINEAR_CENTER_ENA(1); - else if (intrin == nir_intrinsic_load_barycentric_centroid) - return S_0286CC_LINEAR_CENTROID_ENA(1); - else if (intrin == nir_intrinsic_load_barycentric_sample) - return S_0286CC_LINEAR_SAMPLE_ENA(1); - break; - default: break; - } - return 0; -} - bool 
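The two s_load_dwordx2 loads from args->prolog_inputs imply the following
layout for the driver-provided input buffer (an inference from the offsets
used above, not a struct that exists in the tree):

   struct vs_prolog_inputs {
      uint64_t continue_pc;  // offset 0: s_setpc_b64 target, i.e. the main shader
      struct {
         uint32_t div_info;  // packed shift/increment control bytes
         uint32_t magic;     // multiplier for instance_id / divisor
      } divisors[15];        // offset 8 + 8*i, one entry per nontrivial divisor;
                             // the array bound here is illustrative
   };
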
is_loop_header_block(nir_block* block) { @@ -275,56 +247,15 @@ get_reg_class(isel_context* ctx, RegType type, unsigned components, unsigned bit } void -setup_vs_output_info(isel_context* ctx, nir_shader* nir, bool export_prim_id, - bool export_clip_dists, radv_vs_output_info* outinfo) +setup_vs_output_info(isel_context* ctx, nir_shader* nir, + const radv_vs_output_info* outinfo) { - memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(outinfo->vs_output_param_offset)); - - outinfo->param_exports = 0; - int pos_written = 0x1; - bool writes_primitive_shading_rate = - outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates; - if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer || - writes_primitive_shading_rate) - pos_written |= 1 << 1; - - uint64_t mask = nir->info.outputs_written; - while (mask) { - int idx = u_bit_scan64(&mask); - if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || - idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT || - ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && - export_clip_dists)) { - if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED) - outinfo->vs_output_param_offset[idx] = outinfo->param_exports++; - } - } - if (outinfo->writes_layer && - outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) { - /* when ctx->options->key.has_multiview_view_index = true, the layer - * variable isn't declared in NIR and it's isel's job to get the layer */ - outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++; - } - - if (export_prim_id) { - assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED); - outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++; - } - - ctx->export_clip_dists = export_clip_dists; + ctx->export_clip_dists = outinfo->export_clip_dists; ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask); ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask); assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8); - if (ctx->num_clip_distances + ctx->num_cull_distances > 0) - pos_written |= 1 << 2; - if (ctx->num_clip_distances + ctx->num_cull_distances > 4) - pos_written |= 1 << 3; - - outinfo->pos_exports = util_bitcount(pos_written); - /* GFX10+ early rasterization: * When there are no param exports in an NGG (or legacy VS) shader, * RADV sets NO_PC_EXPORT=1, which means the HW will start clipping and rasterization @@ -338,13 +269,11 @@ void setup_vs_variables(isel_context* ctx, nir_shader* nir) { if (ctx->stage == vertex_vs || ctx->stage == vertex_ngg) { - radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo; - setup_vs_output_info(ctx, nir, outinfo->export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, outinfo); + setup_vs_output_info(ctx, nir, &ctx->program->info->vs.outinfo); /* TODO: NGG streamout */ if (ctx->stage.hw == HWStage::NGG) - assert(!ctx->args->shader_info->so.num_outputs); + assert(!ctx->program->info->so.num_outputs); } if (ctx->stage == vertex_ngg) { @@ -362,42 +291,33 @@ setup_gs_variables(isel_context* ctx, nir_shader* nir) ctx->program->config->lds_size = ctx->program->info->gs_ring_info.lds_size; /* Already in units of the alloc granularity */ } else if (ctx->stage == vertex_geometry_ngg || ctx->stage == tess_eval_geometry_ngg) { - radv_vs_output_info* outinfo = &ctx->program->info->vs.outinfo; - setup_vs_output_info(ctx, nir, false, 
ctx->options->key.vs_common_out.export_clip_dists, - outinfo); + setup_vs_output_info(ctx, nir, &ctx->program->info->vs.outinfo); ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule); } - - if (ctx->stage.has(SWStage::VS)) - ctx->program->info->gs.es_type = MESA_SHADER_VERTEX; - else if (ctx->stage.has(SWStage::TES)) - ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL; } void setup_tcs_info(isel_context* ctx, nir_shader* nir, nir_shader* vs) { - ctx->tcs_in_out_eq = ctx->args->shader_info->vs.tcs_in_out_eq; - ctx->tcs_temp_only_inputs = ctx->args->shader_info->vs.tcs_temp_only_input_mask; - ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches; - ctx->program->config->lds_size = ctx->args->shader_info->tcs.num_lds_blocks; + ctx->tcs_in_out_eq = ctx->program->info->vs.tcs_in_out_eq; + ctx->tcs_temp_only_inputs = ctx->program->info->vs.tcs_temp_only_input_mask; + ctx->tcs_num_patches = ctx->program->info->num_tess_patches; + ctx->program->config->lds_size = ctx->program->info->tcs.num_lds_blocks; } void setup_tes_variables(isel_context* ctx, nir_shader* nir) { - ctx->tcs_num_patches = ctx->args->shader_info->num_tess_patches; + ctx->tcs_num_patches = ctx->program->info->num_tess_patches; if (ctx->stage == tess_eval_vs || ctx->stage == tess_eval_ngg) { - radv_vs_output_info* outinfo = &ctx->program->info->tes.outinfo; - setup_vs_output_info(ctx, nir, outinfo->export_prim_id, - ctx->options->key.vs_common_out.export_clip_dists, outinfo); + setup_vs_output_info(ctx, nir, &ctx->program->info->tes.outinfo); /* TODO: NGG streamout */ if (ctx->stage.hw == HWStage::NGG) - assert(!ctx->args->shader_info->so.num_outputs); + assert(!ctx->program->info->so.num_outputs); } if (ctx->stage == tess_eval_ngg) { @@ -468,9 +388,9 @@ init_context(isel_context* ctx, nir_shader* shader) ctx->range_ht = _mesa_pointer_hash_table_create(NULL); ctx->ub_config.min_subgroup_size = 64; ctx->ub_config.max_subgroup_size = 64; - if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && ctx->options->key.cs.subgroup_size) { - ctx->ub_config.min_subgroup_size = ctx->options->key.cs.subgroup_size; - ctx->ub_config.max_subgroup_size = ctx->options->key.cs.subgroup_size; + if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && ctx->program->info->cs.subgroup_size) { + ctx->ub_config.min_subgroup_size = ctx->program->info->cs.subgroup_size; + ctx->ub_config.max_subgroup_size = ctx->program->info->cs.subgroup_size; } ctx->ub_config.max_workgroup_invocations = 2048; ctx->ub_config.max_workgroup_count[0] = 65535; @@ -530,8 +450,6 @@ init_context(isel_context* ctx, nir_shader* shader) ctx->program->allocateRange(impl->ssa_alloc); RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; - unsigned spi_ps_inputs = 0; - std::unique_ptr nir_to_aco{new unsigned[impl->num_blocks]()}; /* TODO: make this recursive to improve compile times */ @@ -593,7 +511,16 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_op_frexp_exp: case nir_op_cube_face_index_amd: case nir_op_cube_face_coord_amd: - case nir_op_sad_u8x4: type = RegType::vgpr; break; + case nir_op_sad_u8x4: + case nir_op_iadd_sat: + case nir_op_udot_4x8_uadd: + case nir_op_sdot_4x8_iadd: + case nir_op_udot_4x8_uadd_sat: + case nir_op_sdot_4x8_iadd_sat: + case nir_op_udot_2x16_uadd: + case nir_op_sdot_2x16_iadd: + case nir_op_udot_2x16_uadd_sat: + case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break; case nir_op_f2i16: case nir_op_f2u16: case nir_op_f2i32: @@ -650,6 +577,7 @@ 
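The nir_op cases added above are forced into VGPRs because those dot-product
and saturating ALU ops only have vector-ALU encodings; everything else keeps
following the divergence rule used throughout init_context. A condensed sketch
of that rule:

   RegType pick_reg_type(bool divergent, bool valu_only)
   {
      // uniform values may live in SGPRs unless the op is vector-ALU only
      return (divergent || valu_only) ? RegType::vgpr : RegType::sgpr;
   }
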
init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_load_push_constant: case nir_intrinsic_load_workgroup_id: case nir_intrinsic_load_num_workgroups: + case nir_intrinsic_load_ray_launch_size: case nir_intrinsic_load_subgroup_id: case nir_intrinsic_load_num_subgroups: case nir_intrinsic_load_first_vertex: @@ -700,7 +628,6 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_load_frag_coord: case nir_intrinsic_load_frag_shading_rate: case nir_intrinsic_load_sample_pos: - case nir_intrinsic_load_layer_id: case nir_intrinsic_load_local_invocation_id: case nir_intrinsic_load_local_invocation_index: case nir_intrinsic_load_subgroup_invocation: @@ -769,7 +696,6 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_load_initial_edgeflags_amd: case nir_intrinsic_load_packed_passthrough_primitive_amd: case nir_intrinsic_gds_atomic_add_amd: - case nir_intrinsic_load_sbt_amd: case nir_intrinsic_bvh64_intersect_ray_amd: case nir_intrinsic_load_cull_small_prim_precision_amd: type = RegType::vgpr; break; case nir_intrinsic_load_shared: @@ -792,9 +718,11 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_inclusive_scan: case nir_intrinsic_exclusive_scan: case nir_intrinsic_reduce: + case nir_intrinsic_load_sbt_amd: case nir_intrinsic_load_ubo: case nir_intrinsic_load_ssbo: case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: case nir_intrinsic_vulkan_resource_index: case nir_intrinsic_get_ssbo_size: type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr; @@ -813,51 +741,6 @@ init_context(isel_context* ctx, nir_shader* shader) RegClass rc = get_reg_class(ctx, type, intrinsic->dest.ssa.num_components, intrinsic->dest.ssa.bit_size); regclasses[intrinsic->dest.ssa.index] = rc; - - switch (intrinsic->intrinsic) { - case nir_intrinsic_load_barycentric_sample: - case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_at_sample: - case nir_intrinsic_load_barycentric_at_offset: { - glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(intrinsic); - spi_ps_inputs |= get_interp_input(intrinsic->intrinsic, mode); - break; - } - case nir_intrinsic_load_barycentric_model: - spi_ps_inputs |= S_0286CC_PERSP_PULL_MODEL_ENA(1); - break; - case nir_intrinsic_load_front_face: - spi_ps_inputs |= S_0286CC_FRONT_FACE_ENA(1); - break; - case nir_intrinsic_load_frag_coord: - case nir_intrinsic_load_sample_pos: { - uint8_t mask = nir_ssa_def_components_read(&intrinsic->dest.ssa); - for (unsigned i = 0; i < 4; i++) { - if (mask & (1 << i)) - spi_ps_inputs |= S_0286CC_POS_X_FLOAT_ENA(1) << i; - } - - if (ctx->options->adjust_frag_coord_z && - intrinsic->intrinsic == nir_intrinsic_load_frag_coord && - G_0286CC_POS_Z_FLOAT_ENA(spi_ps_inputs)) { - /* Enable ancillary for adjusting gl_FragCoord.z for - * VRS due to a hw bug on some GFX10.3 chips. 
- */ - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - } - break; - } - case nir_intrinsic_load_sample_id: - case nir_intrinsic_load_frag_shading_rate: - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - break; - case nir_intrinsic_load_sample_mask_in: - spi_ps_inputs |= S_0286CC_ANCILLARY_ENA(1); - spi_ps_inputs |= S_0286CC_SAMPLE_COVERAGE_ENA(1); - break; - default: break; - } break; } case nir_instr_type_tex: { @@ -914,20 +797,10 @@ init_context(isel_context* ctx, nir_shader* shader) } } - if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_inputs)) { - /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */ - spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1); - } + ctx->program->config->spi_ps_input_ena = ctx->program->info->ps.spi_ps_input; + ctx->program->config->spi_ps_input_addr = ctx->program->info->ps.spi_ps_input; - if (!(spi_ps_inputs & 0x7F)) { - /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */ - spi_ps_inputs |= S_0286CC_PERSP_CENTER_ENA(1); - } - - ctx->program->config->spi_ps_input_ena = spi_ps_inputs; - ctx->program->config->spi_ps_input_addr = spi_ps_inputs; - - ctx->cf_info.nir_to_aco.reset(nir_to_aco.release()); + ctx->cf_info.nir_to_aco = std::move(nir_to_aco); /* align and copy constant data */ while (ctx->program->constant_data.size() % 4u) @@ -946,7 +819,9 @@ cleanup_context(isel_context* ctx) isel_context setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* const* shaders, - ac_shader_config* config, struct radv_shader_args* args, bool is_gs_copy_shader) + ac_shader_config* config, const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args, bool is_gs_copy_shader) { SWStage sw_stage = SWStage::None; for (unsigned i = 0; i < shader_count; i++) { @@ -962,12 +837,12 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c default: unreachable("Shader stage not implemented"); } } - bool gfx9_plus = args->options->chip_class >= GFX9; - bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10; + bool gfx9_plus = options->chip_class >= GFX9; + bool ngg = info->is_ngg && options->chip_class >= GFX10; HWStage hw_stage{}; - if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg) + if (sw_stage == SWStage::VS && info->vs.as_es && !ngg) hw_stage = HWStage::ES; - else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg) + else if (sw_stage == SWStage::VS && !info->vs.as_ls && !ngg) hw_stage = HWStage::VS; else if (sw_stage == SWStage::VS && ngg) hw_stage = HWStage::NGG; /* GFX10/NGG: VS without GS uses the HW GS stage */ @@ -983,17 +858,17 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c hw_stage = HWStage::GS; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */ else if (sw_stage == SWStage::VS_GS && ngg) hw_stage = HWStage::NGG; /* GFX10+: VS+GS merged into an NGG GS */ - else if (sw_stage == SWStage::VS && args->shader_info->vs.as_ls) + else if (sw_stage == SWStage::VS && info->vs.as_ls) hw_stage = HWStage::LS; /* GFX6-8: VS is a Local Shader, when tessellation is used */ else if (sw_stage == SWStage::TCS) hw_stage = HWStage::HS; /* GFX6-8: TCS is a Hull Shader */ else if (sw_stage == SWStage::VS_TCS) hw_stage = HWStage::HS; /* GFX9-10: VS+TCS merged into a Hull Shader */ - else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && !ngg) + else if (sw_stage == SWStage::TES && !info->tes.as_es && !ngg) hw_stage = HWStage::VS; /* GFX6-9: TES 
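(For reference, a summary of the SW -> HW mapping this chain builds, where
"ngg" means info->is_ngg on a GFX10+ chip:
   VS              -> ES before a legacy GS, LS before TCS, NGG, else VS
   TES             -> ES before a legacy GS, NGG, else VS
   VS+GS / TES+GS  -> GS when merged on GFX9+ or legacy, NGG on GFX10+
   TCS / VS+TCS    -> HS
   FS -> FS, CS -> CS, and the GS copy shader runs as a HW VS.)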
without GS uses the HW VS stage (and GFX10/legacy) */ - else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && ngg) + else if (sw_stage == SWStage::TES && !info->tes.as_es && ngg) hw_stage = HWStage::NGG; /* GFX10/NGG: TES without GS */ - else if (sw_stage == SWStage::TES && args->shader_info->tes.as_es && !ngg) + else if (sw_stage == SWStage::TES && info->tes.as_es && !ngg) hw_stage = HWStage::ES; /* GFX6-8: TES is an Export Shader */ else if (sw_stage == SWStage::TES_GS && gfx9_plus && !ngg) hw_stage = HWStage::GS; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */ @@ -1002,77 +877,29 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c else unreachable("Shader stage not implemented"); - init_program(program, Stage{hw_stage, sw_stage}, args->shader_info, args->options->chip_class, - args->options->family, args->options->wgp_mode, config); + init_program(program, Stage{hw_stage, sw_stage}, info, options->chip_class, + options->family, options->wgp_mode, config); isel_context ctx = {}; ctx.program = program; ctx.args = args; - ctx.options = args->options; + ctx.options = options; ctx.stage = program->stage; - /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */ - if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS) { - /* PS and legacy VS have separate waves, no workgroups */ - program->workgroup_size = program->wave_size; - } else if (program->stage == compute_cs) { - /* CS sets the workgroup size explicitly */ - program->workgroup_size = shaders[0]->info.workgroup_size[0] * - shaders[0]->info.workgroup_size[1] * - shaders[0]->info.workgroup_size[2]; - } else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) { - /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 - * (not implemented in Mesa) */ - program->workgroup_size = program->wave_size; - } else if (program->stage.hw == HWStage::GS) { - /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */ - assert(program->chip_class >= GFX9); - uint32_t es_verts_per_subgrp = - G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); - uint32_t gs_instr_prims_in_subgrp = - G_028A44_GS_INST_PRIMS_IN_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl); - uint32_t workgroup_size = MAX2(es_verts_per_subgrp, gs_instr_prims_in_subgrp); - program->workgroup_size = MAX2(MIN2(workgroup_size, 256), 1); - } else if (program->stage == vertex_ls) { - /* Unmerged LS operates in workgroups */ - program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but - those are not plumbed to ACO for LS */ - } else if (program->stage == tess_control_hs) { - /* Unmerged HS operates in workgroups, size is determined by the output vertices */ + program->workgroup_size = program->info->workgroup_size; + assert(program->workgroup_size); + + if (ctx.stage == tess_control_hs) setup_tcs_info(&ctx, shaders[0], NULL); - program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out; - } else if (program->stage == vertex_tess_control_hs) { - /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS - * invocations */ + else if (ctx.stage == vertex_tess_control_hs) setup_tcs_info(&ctx, shaders[1], shaders[0]); - program->workgroup_size = - ctx.tcs_num_patches * - MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices); - } else if (program->stage.hw == 
HWStage::NGG) { - gfx10_ngg_info& ngg_info = args->shader_info->ngg_info; - unsigned num_gs_invocations = - (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1; - - /* Max ES (SW VS/TES) threads */ - uint32_t max_esverts = ngg_info.hw_max_esverts; - /* Max GS input primitives = max GS threads */ - uint32_t max_gs_input_prims = ngg_info.max_gsprims * num_gs_invocations; - /* Maximum output vertices -- each thread can export only 1 vertex */ - uint32_t max_out_vtx = ngg_info.max_out_verts; - /* Maximum output primitives -- each thread can export only 1 or 0 primitive */ - uint32_t max_out_prm = ngg_info.max_gsprims * num_gs_invocations * ngg_info.prim_amp_factor; - - program->workgroup_size = MAX4(max_esverts, max_gs_input_prims, max_out_vtx, max_out_prm); - } else { - unreachable("Unsupported shader stage."); - } calc_min_waves(program); unsigned scratch_size = 0; if (program->stage == gs_copy_vs) { assert(shader_count == 1); - setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo); + setup_vs_output_info(&ctx, shaders[0], &program->info->vs.outinfo); } else { for (unsigned i = 0; i < shader_count; i++) { nir_shader* nir = shaders[i]; diff --git a/mesa 3D driver/src/amd/compiler/aco_interface.cpp b/mesa 3D driver/src/amd/compiler/aco_interface.cpp index 0c06f2a3ca..6cdd74d968 100644 --- a/mesa 3D driver/src/amd/compiler/aco_interface.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_interface.cpp @@ -74,29 +74,32 @@ validate(aco::Program* program) } void -aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, - struct radv_shader_binary** binary, struct radv_shader_args* args) +aco_compile_shader(const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + unsigned shader_count, struct nir_shader* const* shaders, + const struct radv_shader_args *args, + struct radv_shader_binary** binary) { aco::init(); ac_shader_config config = {0}; std::unique_ptr program{new aco::Program}; - program->collect_statistics = args->options->record_stats; + program->collect_statistics = options->record_stats; if (program->collect_statistics) memset(program->statistics, 0, sizeof(program->statistics)); - program->debug.func = args->options->debug.func; - program->debug.private_data = args->options->debug.private_data; + program->debug.func = options->debug.func; + program->debug.private_data = options->debug.private_data; /* Instruction Selection */ if (args->is_gs_copy_shader) - aco::select_gs_copy_shader(program.get(), shaders[0], &config, args); + aco::select_gs_copy_shader(program.get(), shaders[0], &config, options, info, args); else if (args->is_trap_handler_shader) - aco::select_trap_handler_shader(program.get(), shaders[0], &config, args); + aco::select_trap_handler_shader(program.get(), shaders[0], &config, options, info, args); else - aco::select_program(program.get(), shader_count, shaders, &config, args); - if (args->options->dump_preoptir) + aco::select_program(program.get(), shader_count, shaders, &config, options, info, args); + if (options->dump_preoptir) aco_print_program(program.get(), stderr); aco::live live_vars; @@ -107,7 +110,7 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, validate(program.get()); /* Optimization */ - if (!args->options->disable_optimizations) { + if (!options->key.optimisations_disabled) { if (!(aco::debug_flags & aco::DEBUG_NO_VN)) aco::value_numbering(program.get()); if (!(aco::debug_flags & aco::DEBUG_NO_OPT)) @@ -125,7 +128,7 @@ 
aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, } std::string llvm_ir; - if (args->options->record_ir) { + if (options->record_ir) { char* data = NULL; size_t size = 0; u_memstream mem; @@ -143,11 +146,11 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, if (program->collect_statistics) aco::collect_presched_stats(program.get()); - if ((aco::debug_flags & aco::DEBUG_LIVE_INFO) && args->options->dump_shader) + if ((aco::debug_flags & aco::DEBUG_LIVE_INFO) && options->dump_shader) aco_print_program(program.get(), stderr, live_vars, aco::print_live_vars | aco::print_kill); if (!args->is_trap_handler_shader) { - if (!args->options->disable_optimizations && !(aco::debug_flags & aco::DEBUG_NO_SCHED)) + if (!options->key.optimisations_disabled && !(aco::debug_flags & aco::DEBUG_NO_SCHED)) aco::schedule_program(program.get(), live_vars); validate(program.get()); @@ -157,14 +160,14 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, if (aco::validate_ra(program.get())) { aco_print_program(program.get(), stderr); abort(); - } else if (args->options->dump_shader) { + } else if (options->dump_shader) { aco_print_program(program.get(), stderr); } validate(program.get()); /* Optimization */ - if (!args->options->disable_optimizations && !(aco::debug_flags & aco::DEBUG_NO_OPT)) { + if (!options->key.optimisations_disabled && !(aco::debug_flags & aco::DEBUG_NO_OPT)) { aco::optimize_postRA(program.get()); validate(program.get()); } @@ -192,25 +195,34 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, if (program->collect_statistics) aco::collect_postasm_stats(program.get(), code); - bool get_disasm = args->options->dump_shader || args->options->record_ir; + bool get_disasm = options->dump_shader || options->record_ir; size_t size = llvm_ir.size(); std::string disasm; if (get_disasm) { - char* data = NULL; - size_t disasm_size = 0; - struct u_memstream mem; - if (u_memstream_open(&mem, &data, &disasm_size)) { - FILE* const memf = u_memstream_get(&mem); - aco::print_asm(program.get(), code, exec_size / 4u, memf); - fputc(0, memf); - u_memstream_close(&mem); - } + if (check_print_asm_support(program.get())) { + char* data = NULL; + size_t disasm_size = 0; + struct u_memstream mem; + if (u_memstream_open(&mem, &data, &disasm_size)) { + FILE* const memf = u_memstream_get(&mem); + aco::print_asm(program.get(), code, exec_size / 4u, memf); + fputc(0, memf); + u_memstream_close(&mem); + } - disasm = std::string(data, data + disasm_size); - size += disasm_size; - free(data); + disasm = std::string(data, data + disasm_size); + size += disasm_size; + free(data); + } else { + disasm = "Shader disassembly is not supported in the current configuration" +#ifndef LLVM_AVAILABLE + " (LLVM not available)" +#endif + ".\n"; + size += disasm.length(); + } } size_t stats_size = 0; @@ -239,7 +251,7 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, legacy_binary->exec_size = exec_size; legacy_binary->code_size = code.size() * sizeof(uint32_t); - legacy_binary->config = config; + legacy_binary->base.config = config; legacy_binary->disasm_size = 0; legacy_binary->ir_size = llvm_ir.size(); @@ -255,3 +267,50 @@ aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, *binary = (radv_shader_binary*)legacy_binary; } + +void +aco_compile_vs_prolog(const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_vs_prolog_key* key, 
+ const struct radv_shader_args* args, + struct radv_prolog_binary** binary) +{ + aco::init(); + + /* create program */ + ac_shader_config config = {0}; + std::unique_ptr<aco::Program> program{new aco::Program}; + program->collect_statistics = false; + program->debug.func = NULL; + program->debug.private_data = NULL; + + /* create IR */ + unsigned num_preserved_sgprs; + aco::select_vs_prolog(program.get(), key, &config, options, info, args, &num_preserved_sgprs); + aco::insert_NOPs(program.get()); + + if (options->dump_shader) + aco_print_program(program.get(), stderr); + + /* assembly */ + std::vector<uint32_t> code; + code.reserve(align(program->blocks[0].instructions.size() * 2, 16)); + unsigned exec_size = aco::emit_program(program.get(), code); + + if (options->dump_shader) { + aco::print_asm(program.get(), code, exec_size / 4u, stderr); + fprintf(stderr, "\n"); + } + + /* copy into binary */ + size_t size = code.size() * sizeof(uint32_t) + sizeof(radv_prolog_binary); + radv_prolog_binary* prolog_binary = (radv_prolog_binary*)calloc(size, 1); + + prolog_binary->num_sgprs = config.num_sgprs; + prolog_binary->num_vgprs = config.num_vgprs; + prolog_binary->num_preserved_sgprs = num_preserved_sgprs; + prolog_binary->code_size = code.size() * sizeof(uint32_t); + memcpy(prolog_binary->data, code.data(), prolog_binary->code_size); + + *binary = prolog_binary; +} diff --git a/mesa 3D driver/src/amd/compiler/aco_interface.h b/mesa 3D driver/src/amd/compiler/aco_interface.h index a0df87827e..b0b5c74190 100644 --- a/mesa 3D driver/src/amd/compiler/aco_interface.h +++ b/mesa 3D driver/src/amd/compiler/aco_interface.h @@ -41,8 +41,17 @@ struct aco_compiler_statistic_info { extern const unsigned aco_num_statistics; extern const struct aco_compiler_statistic_info* aco_statistic_infos; -void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders, - struct radv_shader_binary** binary, struct radv_shader_args* args); +void aco_compile_shader(const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + unsigned shader_count, struct nir_shader* const* shaders, + const struct radv_shader_args *args, + struct radv_shader_binary** binary); + +void aco_compile_vs_prolog(const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_vs_prolog_key* key, + const struct radv_shader_args* args, + struct radv_prolog_binary** binary); #ifdef __cplusplus } diff --git a/mesa 3D driver/src/amd/compiler/aco_ir.cpp b/mesa 3D driver/src/amd/compiler/aco_ir.cpp index 79f9d71a79..dcc85a92e2 100644 --- a/mesa 3D driver/src/amd/compiler/aco_ir.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_ir.cpp @@ -24,6 +24,8 @@ #include "aco_ir.h" +#include "aco_builder.h" + #include "util/debug.h" #include "c11/threads.h" @@ -63,7 +65,7 @@ init() } void -init_program(Program* program, Stage stage, struct radv_shader_info* info, +init_program(Program* program, Stage stage, const struct radv_shader_info* info, enum chip_class chip_class, enum radeon_family family, bool wgp_mode, ac_shader_config* config) { @@ -195,7 +197,7 @@ can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra) VOP3_instruction& vop3 = instr->vop3(); if (instr->format == Format::VOP3) return false; - if (vop3.clamp && instr->format == asVOP3(Format::VOPC) && chip != GFX8) + if (vop3.clamp && instr->isVOPC() && chip != GFX8) return false; if (vop3.omod && chip < GFX9) return false; @@ -212,7 +214,7 @@ can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra) } } - if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4)
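Unlike aco_compile_shader, the prolog path above skips scheduling and register
allocation entirely (every register is hand-placed by select_vs_prolog), going
straight from selection to NOP insertion and assembly. A hypothetical caller,
with the radv-side setup elided and the local names assumed:

   struct radv_prolog_binary* bin = NULL;
   aco_compile_vs_prolog(&options, &info, &key, &args, &bin);
   // bin->data: raw machine code ending in s_setpc_b64;
   // bin->num_preserved_sgprs: leading SGPRs the prolog does not clobber.
   free(bin);
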
+ if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC()) return false; if (!instr->operands.empty()) { @@ -233,7 +235,7 @@ can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra) return false; // TODO: return true if we know we will use vcc - if (!pre_ra && instr->isVOPC()) + if (!pre_ra && instr->isVOPC() && chip == GFX8) return false; if (!pre_ra && instr->operands.size() >= 3 && !is_mac) return false; @@ -274,24 +276,11 @@ convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr) if (i >= 2) break; - switch (instr->operands[i].bytes()) { - case 1: sdwa.sel[i] = sdwa_ubyte; break; - case 2: sdwa.sel[i] = sdwa_uword; break; - case 4: sdwa.sel[i] = sdwa_udword; break; - } - } - switch (instr->definitions[0].bytes()) { - case 1: - sdwa.dst_sel = sdwa_ubyte; - sdwa.dst_preserve = true; - break; - case 2: - sdwa.dst_sel = sdwa_uword; - sdwa.dst_preserve = true; - break; - case 4: sdwa.dst_sel = sdwa_udword; break; + sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false); } + sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false); + if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8) instr->definitions[0].setFixed(vcc); if (instr->definitions.size() >= 2) @@ -302,6 +291,78 @@ convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr) return tmp; } +bool +can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra) +{ + assert(instr->isVALU() && !instr->operands.empty()); + + if (instr->isDPP()) + return true; + + if (instr->operands.size() && instr->operands[0].isLiteral()) + return false; + + if (instr->isSDWA()) + return false; + + if (!pre_ra && (instr->isVOPC() || instr->definitions.size() > 1) && + instr->definitions.back().physReg() != vcc) + return false; + + if (!pre_ra && instr->operands.size() >= 3 && instr->operands[2].physReg() != vcc) + return false; + + if (instr->isVOP3()) { + const VOP3_instruction* vop3 = &instr->vop3(); + if (vop3->clamp || vop3->omod || vop3->opsel) + return false; + if (instr->format == Format::VOP3) + return false; + } + + /* there are more cases but those all take 64-bit inputs */ + return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 && + instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 && + instr->opcode != aco_opcode::v_readfirstlane_b32 && + instr->opcode != aco_opcode::v_cvt_f64_i32 && + instr->opcode != aco_opcode::v_cvt_f64_f32 && instr->opcode != aco_opcode::v_cvt_f64_u32; +} + +aco_ptr<Instruction> +convert_to_DPP(aco_ptr<Instruction>& instr) +{ + if (instr->isDPP()) + return NULL; + + aco_ptr<Instruction> tmp = std::move(instr); + Format format = + (Format)(((uint32_t)tmp->format & ~(uint32_t)Format::VOP3) | (uint32_t)Format::DPP); + instr.reset(create_instruction<DPP_instruction>(tmp->opcode, format, tmp->operands.size(), + tmp->definitions.size())); + std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin()); + for (unsigned i = 0; i < instr->definitions.size(); i++) + instr->definitions[i] = tmp->definitions[i]; + + DPP_instruction* dpp = &instr->dpp(); + dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3); + dpp->row_mask = 0xf; + dpp->bank_mask = 0xf; + + if (tmp->isVOP3()) { + const VOP3_instruction* vop3 = &tmp->vop3(); + memcpy(dpp->neg, vop3->neg, sizeof(dpp->neg)); + memcpy(dpp->abs, vop3->abs, sizeof(dpp->abs)); + } + + if (instr->isVOPC() || instr->definitions.size() > 1) + instr->definitions.back().setFixed(vcc); + + if (instr->operands.size() >= 3) +
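convert_to_DPP programs the identity swizzle (dpp_quad_perm(0, 1, 2, 3) with
full row and bank masks), so the converted instruction is behavior-preserving
until a later pass retargets it. A sketch of the intended pattern; dpp_row_sr
is assumed to be one of the dpp_ctrl helpers, as used by ACO's reduction
lowering:

   if (can_use_DPP(instr, /*pre_ra=*/true)) {
      aco_ptr<Instruction> orig = convert_to_DPP(instr); // instr is now DPP
      instr->dpp().dpp_ctrl = dpp_row_sr(1); // e.g. shift data within a row by one lane
   }
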
instr->operands[2].setFixed(vcc); + + return tmp; +} + bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) { @@ -345,6 +406,65 @@ can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high) } } +bool +instr_is_16bit(chip_class chip, aco_opcode op) +{ + /* partial register writes are GFX9+, only */ + if (chip < GFX9) + return false; + + switch (op) { + /* VOP3 */ + case aco_opcode::v_mad_f16: + case aco_opcode::v_mad_u16: + case aco_opcode::v_mad_i16: + case aco_opcode::v_fma_f16: + case aco_opcode::v_div_fixup_f16: + case aco_opcode::v_interp_p2_f16: + case aco_opcode::v_fma_mixlo_f16: + /* VOP2 */ + case aco_opcode::v_mac_f16: + case aco_opcode::v_madak_f16: + case aco_opcode::v_madmk_f16: return chip >= GFX9; + case aco_opcode::v_add_f16: + case aco_opcode::v_sub_f16: + case aco_opcode::v_subrev_f16: + case aco_opcode::v_mul_f16: + case aco_opcode::v_max_f16: + case aco_opcode::v_min_f16: + case aco_opcode::v_ldexp_f16: + case aco_opcode::v_fmac_f16: + case aco_opcode::v_fmamk_f16: + case aco_opcode::v_fmaak_f16: + /* VOP1 */ + case aco_opcode::v_cvt_f16_f32: + case aco_opcode::v_cvt_f16_u16: + case aco_opcode::v_cvt_f16_i16: + case aco_opcode::v_rcp_f16: + case aco_opcode::v_sqrt_f16: + case aco_opcode::v_rsq_f16: + case aco_opcode::v_log_f16: + case aco_opcode::v_exp_f16: + case aco_opcode::v_frexp_mant_f16: + case aco_opcode::v_frexp_exp_i16_f16: + case aco_opcode::v_floor_f16: + case aco_opcode::v_ceil_f16: + case aco_opcode::v_trunc_f16: + case aco_opcode::v_rndne_f16: + case aco_opcode::v_fract_f16: + case aco_opcode::v_sin_f16: + case aco_opcode::v_cos_f16: return chip >= GFX10; + // TODO: confirm whether these write 16 or 32 bit on GFX10+ + // case aco_opcode::v_cvt_u16_f16: + // case aco_opcode::v_cvt_i16_f16: + // case aco_opcode::p_cvt_f16_f32_rtne: + // case aco_opcode::v_cvt_norm_i16_f16: + // case aco_opcode::v_cvt_norm_u16_f16: + /* on GFX10, all opsel instructions preserve the high bits */ + default: return chip >= GFX10 && can_use_opsel(chip, op, -1, false); + } +} + uint32_t get_reduction_identity(ReduceOp op, unsigned idx) { @@ -442,6 +562,171 @@ needs_exec_mask(const Instruction* instr) return true; } +struct CmpInfo { + aco_opcode ordered; + aco_opcode unordered; + aco_opcode ordered_swapped; + aco_opcode unordered_swapped; + aco_opcode inverse; + aco_opcode f32; + unsigned size; +}; + +ALWAYS_INLINE bool +get_cmp_info(aco_opcode op, CmpInfo* info) +{ + info->ordered = aco_opcode::num_opcodes; + info->unordered = aco_opcode::num_opcodes; + info->ordered_swapped = aco_opcode::num_opcodes; + info->unordered_swapped = aco_opcode::num_opcodes; + switch (op) { + // clang-format off +#define CMP2(ord, unord, ord_swap, unord_swap, sz) \ + case aco_opcode::v_cmp_##ord##_f##sz: \ + case aco_opcode::v_cmp_n##unord##_f##sz: \ + info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \ + info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \ + info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz; \ + info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz; \ + info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \ + : aco_opcode::v_cmp_n##ord##_f##sz; \ + info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? 
aco_opcode::v_cmp_##ord##_f32 \ + : aco_opcode::v_cmp_n##unord##_f32; \ + info->size = sz; \ + return true; +#define CMP(ord, unord, ord_swap, unord_swap) \ + CMP2(ord, unord, ord_swap, unord_swap, 16) \ + CMP2(ord, unord, ord_swap, unord_swap, 32) \ + CMP2(ord, unord, ord_swap, unord_swap, 64) + CMP(lt, /*n*/ge, gt, /*n*/le) + CMP(eq, /*n*/lg, eq, /*n*/lg) + CMP(le, /*n*/gt, ge, /*n*/lt) + CMP(gt, /*n*/le, lt, /*n*/le) + CMP(lg, /*n*/eq, lg, /*n*/eq) + CMP(ge, /*n*/lt, le, /*n*/gt) +#undef CMP +#undef CMP2 +#define ORD_TEST(sz) \ + case aco_opcode::v_cmp_u_f##sz: \ + info->f32 = aco_opcode::v_cmp_u_f32; \ + info->inverse = aco_opcode::v_cmp_o_f##sz; \ + info->size = sz; \ + return true; \ + case aco_opcode::v_cmp_o_f##sz: \ + info->f32 = aco_opcode::v_cmp_o_f32; \ + info->inverse = aco_opcode::v_cmp_u_f##sz; \ + info->size = sz; \ + return true; + ORD_TEST(16) + ORD_TEST(32) + ORD_TEST(64) +#undef ORD_TEST + // clang-format on + default: return false; + } +} + +aco_opcode +get_ordered(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes; +} + +aco_opcode +get_unordered(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.unordered : aco_opcode::num_opcodes; +} + +aco_opcode +get_inverse(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; +} + +aco_opcode +get_f32_cmp(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; +} + +unsigned +get_cmp_bitsize(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) ? info.size : 0; +} + +bool +is_cmp(aco_opcode op) +{ + CmpInfo info; + return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; +} + +bool +can_swap_operands(aco_ptr& instr, aco_opcode* new_op) +{ + if (instr->isDPP()) + return false; + + if (instr->operands[0].isConstant() || + (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) + return false; + + switch (instr->opcode) { + case aco_opcode::v_add_u32: + case aco_opcode::v_add_co_u32: + case aco_opcode::v_add_co_u32_e64: + case aco_opcode::v_add_i32: + case aco_opcode::v_add_f16: + case aco_opcode::v_add_f32: + case aco_opcode::v_mul_f16: + case aco_opcode::v_mul_f32: + case aco_opcode::v_or_b32: + case aco_opcode::v_and_b32: + case aco_opcode::v_xor_b32: + case aco_opcode::v_max_f16: + case aco_opcode::v_max_f32: + case aco_opcode::v_min_f16: + case aco_opcode::v_min_f32: + case aco_opcode::v_max_i32: + case aco_opcode::v_min_i32: + case aco_opcode::v_max_u32: + case aco_opcode::v_min_u32: + case aco_opcode::v_max_i16: + case aco_opcode::v_min_i16: + case aco_opcode::v_max_u16: + case aco_opcode::v_min_u16: + case aco_opcode::v_max_i16_e64: + case aco_opcode::v_min_i16_e64: + case aco_opcode::v_max_u16_e64: + case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true; + case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true; + case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true; + case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true; + case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true; + case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true; + default: { + CmpInfo info; + get_cmp_info(instr->opcode, &info); + if (info.ordered == instr->opcode) { + *new_op = info.ordered_swapped; + return true; + } + if (info.unordered == instr->opcode) { + *new_op = 
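/* Usage sketch (mirrors the optimizer call site later in this patch): on
 * success the caller installs *new_op and swaps the first two operands
 * itself:
 *
 *   aco_opcode swapped;
 *   if (!instr->isVOP3() && can_swap_operands(instr, &swapped)) {
 *      instr->opcode = swapped;
 *      std::swap(instr->operands[0], instr->operands[1]);
 *   }
 *
 * e.g. "v_cmp_lt_f32 a, b" becomes "v_cmp_gt_f32 b, a", and "v_sub_f32 a, b"
 * becomes "v_subrev_f32 b, a". */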
info.unordered_swapped; + return true; + } + return false; + } + } +} + wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {} wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) diff --git a/mesa 3D driver/src/amd/compiler/aco_ir.h b/mesa 3D driver/src/amd/compiler/aco_ir.h index c375b3857c..efab59f899 100644 --- a/mesa 3D driver/src/amd/compiler/aco_ir.h +++ b/mesa 3D driver/src/amd/compiler/aco_ir.h @@ -38,6 +38,7 @@ struct radv_shader_args; struct radv_shader_info; +struct radv_vs_prolog_key; namespace aco { @@ -290,6 +291,12 @@ asSDWA(Format format) return (Format)((uint32_t)Format::SDWA | (uint32_t)format); } +constexpr Format +withoutDPP(Format format) +{ + return (Format)((uint32_t)format & ~(uint32_t)Format::DPP); +} + enum class RegType { none = 0, sgpr, @@ -337,11 +344,12 @@ struct RegClass { explicit operator bool() = delete; constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; } + constexpr bool is_linear_vgpr() const { return rc & (1 << 6); }; constexpr bool is_subdword() const { return rc & (1 << 7); } constexpr unsigned bytes() const { return ((unsigned)rc & 0x1F) * (is_subdword() ? 1 : 4); } // TODO: use size() less in favor of bytes() constexpr unsigned size() const { return (bytes() + 3) >> 2; } - constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); } + constexpr bool is_linear() const { return rc <= RC::s16 || is_linear_vgpr(); } constexpr RegClass as_linear() const { return RegClass((RC)(rc | (1 << 6))); } constexpr RegClass as_subdword() const { return RegClass((RC)(rc | 1 << 7)); } @@ -354,6 +362,15 @@ struct RegClass { } } + constexpr RegClass resize(unsigned bytes) const + { + if (is_linear_vgpr()) { + assert(bytes % 4u == 0); + return get(RegType::vgpr, bytes).as_linear(); + } + return get(type(), bytes); + } + private: RC rc; }; @@ -1399,40 +1416,53 @@ struct DPP_instruction : public Instruction { }; static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); -enum sdwa_sel : uint8_t { - /* masks */ - sdwa_wordnum = 0x1, - sdwa_bytenum = 0x3, - sdwa_asuint = 0x7 | 0x10, - sdwa_rasize = 0x3, +struct SubdwordSel { + enum sdwa_sel : uint8_t { + ubyte = 0x4, + uword = 0x8, + dword = 0x10, + sext = 0x20, + sbyte = ubyte | sext, + sword = uword | sext, - /* flags */ - sdwa_isword = 0x4, - sdwa_sext = 0x8, - sdwa_isra = 0x10, + ubyte0 = ubyte, + ubyte1 = ubyte | 1, + ubyte2 = ubyte | 2, + ubyte3 = ubyte | 3, + sbyte0 = sbyte, + sbyte1 = sbyte | 1, + sbyte2 = sbyte | 2, + sbyte3 = sbyte | 3, + uword0 = uword, + uword1 = uword | 2, + sword0 = sword, + sword1 = sword | 2, + }; - /* specific values */ - sdwa_ubyte0 = 0, - sdwa_ubyte1 = 1, - sdwa_ubyte2 = 2, - sdwa_ubyte3 = 3, - sdwa_uword0 = sdwa_isword | 0, - sdwa_uword1 = sdwa_isword | 1, - sdwa_udword = 6, + SubdwordSel() : sel((sdwa_sel)0) {} + constexpr SubdwordSel(sdwa_sel sel_) : sel(sel_) {} + constexpr SubdwordSel(unsigned size, unsigned offset, bool sign_extend) + : sel((sdwa_sel)((sign_extend ? 
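/* Worked examples of the encoding built by this constructor (size in bytes,
 * byte offset, sign extension):
 *
 *   SubdwordSel(2, 2, false) -> (2 << 2) | 2        == 0x0a == uword1
 *   SubdwordSel(1, 3, true)  -> sext | (1 << 2) | 3 == 0x27 == sbyte3
 *   SubdwordSel(4, 0, false) -> (4 << 2)            == 0x10 == dword
 *
 * size(), offset() and sign_extend() below simply unpack these fields. */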
sext : 0) | size << 2 | offset)) + {} + constexpr operator sdwa_sel() const { return sel; } + explicit operator bool() const { return sel != 0; } - sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext, - sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext, - sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext, - sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext, - sdwa_sword0 = sdwa_uword0 | sdwa_sext, - sdwa_sword1 = sdwa_uword1 | sdwa_sext, - sdwa_sdword = sdwa_udword | sdwa_sext, + constexpr unsigned size() const { return (sel >> 2) & 0x7; } + constexpr unsigned offset() const { return sel & 0x3; } + constexpr bool sign_extend() const { return sel & sext; } + constexpr unsigned to_sdwa_sel(unsigned reg_byte_offset) const + { + reg_byte_offset += offset(); + if (size() == 1) + return reg_byte_offset; + else if (size() == 2) + return 4 + (reg_byte_offset >> 1); + else + return 6; + } - /* register-allocated */ - sdwa_ubyte = 1 | sdwa_isra, - sdwa_uword = 2 | sdwa_isra, - sdwa_sbyte = sdwa_ubyte | sdwa_sext, - sdwa_sword = sdwa_uword | sdwa_sext, +private: + sdwa_sel sel; }; /** @@ -1446,14 +1476,13 @@ enum sdwa_sel : uint8_t { struct SDWA_instruction : public Instruction { /* these destination modifiers aren't available with VOPC except for * clamp on GFX8 */ - uint8_t sel[2]; - uint8_t dst_sel; + SubdwordSel sel[2]; + SubdwordSel dst_sel; bool neg[2]; bool abs[2]; - bool dst_preserve : 1; bool clamp : 1; uint8_t omod : 2; /* GFX9+ */ - uint8_t padding : 4; + uint8_t padding : 5; }; static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); @@ -1729,11 +1758,23 @@ memory_sync_info get_sync_info(const Instruction* instr); bool is_dead(const std::vector& uses, Instruction* instr); bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high); +bool instr_is_16bit(chip_class chip, aco_opcode op); bool can_use_SDWA(chip_class chip, const aco_ptr& instr, bool pre_ra); +bool can_use_DPP(const aco_ptr& instr, bool pre_ra); /* updates "instr" and returns the old instruction (or NULL if no update was needed) */ aco_ptr convert_to_SDWA(chip_class chip, aco_ptr& instr); +aco_ptr convert_to_DPP(aco_ptr& instr); bool needs_exec_mask(const Instruction* instr); +aco_opcode get_ordered(aco_opcode op); +aco_opcode get_unordered(aco_opcode op); +aco_opcode get_inverse(aco_opcode op); +aco_opcode get_f32_cmp(aco_opcode op); +unsigned get_cmp_bitsize(aco_opcode op); +bool is_cmp(aco_opcode op); + +bool can_swap_operands(aco_ptr& instr, aco_opcode* new_op); + uint32_t get_reduction_identity(ReduceOp op, unsigned idx); unsigned get_mimg_nsa_dwords(const Instruction* instr); @@ -2000,7 +2041,7 @@ class Program final { uint16_t num_waves = 0; uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */ ac_shader_config* config; - struct radv_shader_info* info; + const struct radv_shader_info* info; enum chip_class chip_class; enum radeon_family family; DeviceInfo dev; @@ -2032,6 +2073,8 @@ class Program final { unsigned next_divergent_if_logical_depth = 0; unsigned next_uniform_if_depth = 0; + std::vector vs_inputs; + struct { FILE* output = stderr; bool shorten_messages = false; @@ -2095,16 +2138,29 @@ struct ra_test_policy { void init(); -void init_program(Program* program, Stage stage, struct radv_shader_info* info, +void init_program(Program* program, Stage stage, const struct radv_shader_info* info, enum chip_class chip_class, enum radeon_family family, bool wgp_mode, ac_shader_config* config); void select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders, - 
ac_shader_config* config, struct radv_shader_args* args); + ac_shader_config* config, const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args); void select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config, - struct radv_shader_args* args); + const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args); void select_trap_handler_shader(Program* program, struct nir_shader* shader, - ac_shader_config* config, struct radv_shader_args* args); + ac_shader_config* config, + const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args); +void select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, + ac_shader_config* config, + const struct radv_nir_compiler_options* options, + const struct radv_shader_info* info, + const struct radv_shader_args* args, + unsigned* num_preserved_sgprs); void lower_phis(Program* program); void calc_min_waves(Program* program); @@ -2128,6 +2184,11 @@ void insert_wait_states(Program* program); void insert_NOPs(Program* program); void form_hard_clauses(Program* program); unsigned emit_program(Program* program, std::vector& code); +/** + * Returns true if print_asm can disassemble the given program for the current build/runtime + * configuration + */ +bool check_print_asm_support(Program* program); bool print_asm(Program* program, std::vector& binary, unsigned exec_size, FILE* output); bool validate_ir(Program* program); bool validate_ra(Program* program); @@ -2190,7 +2251,6 @@ typedef struct { const aco::Format format[static_cast(aco_opcode::num_opcodes)]; /* sizes used for input/output modifiers and constants */ const unsigned operand_size[static_cast(aco_opcode::num_opcodes)]; - const unsigned definition_size[static_cast(aco_opcode::num_opcodes)]; const instr_class classes[static_cast(aco_opcode::num_opcodes)]; } Info; diff --git a/mesa 3D driver/src/amd/compiler/aco_live_var_analysis.cpp b/mesa 3D driver/src/amd/compiler/aco_live_var_analysis.cpp index 0e94118a14..48456fc0df 100644 --- a/mesa 3D driver/src/amd/compiler/aco_live_var_analysis.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_live_var_analysis.cpp @@ -355,6 +355,7 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size); unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit; + unsigned max_workgroups_per_cu_wgp = program->wgp_mode ? 
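/* Worked example for the limits computed below, assuming GFX9-like
 * parameters (64 KiB LDS per CU, 512-byte allocation granule): a fragment
 * shader with ps.num_interp = 8 adds 8 * 48 = 384 bytes of parameter LDS,
 * aligned up to 512 bytes. With no other LDS use, that allows
 * 65536 / 512 = 128 workgroups, which the CU-mode cap of 16 workgroups per
 * CU then clamps. */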
32 : 16; assert(program->min_waves >= 1); uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); @@ -375,14 +376,26 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) /* adjust max_waves for workgroup and LDS limits */ unsigned waves_per_workgroup = calc_waves_per_workgroup(program); unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup; - if (program->config->lds_size) { - unsigned lds = program->config->lds_size * program->dev.lds_encoding_granule; - lds = align(lds, program->dev.lds_alloc_granule); - workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds); + + unsigned lds_per_workgroup = + align(program->config->lds_size * program->dev.lds_encoding_granule, + program->dev.lds_alloc_granule); + + if (program->stage == fragment_fs) { + /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched. + * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual. + * These limit occupancy the same way as other stages' LDS usage does. + */ + unsigned lds_bytes_per_interp = 3 * 16; + unsigned lds_param_bytes = lds_bytes_per_interp * program->info->ps.num_interp; + lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule); } - if (waves_per_workgroup > 1 && program->chip_class < GFX10) - workgroups_per_cu_wgp = std::min( - workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */ + + if (lds_per_workgroup) + workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds_per_workgroup); + + if (waves_per_workgroup > 1) + workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, max_workgroups_per_cu_wgp); /* in cases like waves_per_workgroup=3 or lds=65536 and * waves_per_workgroup=1, we want the maximum possible number of waves per diff --git a/mesa 3D driver/src/amd/compiler/aco_lower_phis.cpp b/mesa 3D driver/src/amd/compiler/aco_lower_phis.cpp index cbce86eb2f..6b8f611ecc 100644 --- a/mesa 3D driver/src/amd/compiler/aco_lower_phis.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_lower_phis.cpp @@ -31,64 +31,92 @@ namespace aco { +enum class pred_defined : uint8_t { + undef = 0, + const_1 = 1, + const_0 = 2, + temp = 3, + zero = 4, /* all disabled lanes are zero'd out */ +}; +MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(pred_defined); + struct ssa_state { bool checked_preds_for_uniform; bool all_preds_uniform; - - bool needs_init; - uint64_t cur_undef_operands; - - unsigned phi_block_idx; unsigned loop_nest_depth; - std::map writes; - /* Whether there's a write in any of a block's predecessors. Indexed by the block index. 
*/ - std::vector any_pred_defined; - std::vector latest; + + std::vector any_pred_defined; std::vector visited; + std::vector outputs; /* the output per block */ }; Operand -get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool before_write) +get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool input) { - if (!before_write) { - auto it = state->writes.find(block_idx); - if (it != state->writes.end()) - return Operand(Temp(it->second, program->lane_mask)); + if (!input) { if (state->visited[block_idx]) - return state->latest[block_idx]; + return state->outputs[block_idx]; + + /* otherwise, output == input */ + Operand output = get_ssa(program, block_idx, state, true); + state->visited[block_idx] = true; + state->outputs[block_idx] = output; + return output; } - state->visited[block_idx] = true; + /* retrieve the Operand by checking the predecessors */ + if (state->any_pred_defined[block_idx] == pred_defined::undef) + return Operand(program->lane_mask); Block& block = program->blocks[block_idx]; size_t pred = block.linear_preds.size(); - if (pred == 0 || block.loop_nest_depth < state->loop_nest_depth || - !state->any_pred_defined[block_idx]) { - return Operand(program->lane_mask); - } else if (block.loop_nest_depth > state->loop_nest_depth) { - Operand op = get_ssa(program, block_idx - 1, state, false); - state->latest[block_idx] = op; - return op; - } else if (pred == 1 || block.kind & block_kind_loop_exit) { - Operand op = get_ssa(program, block.linear_preds[0], state, false); - state->latest[block_idx] = op; - return op; - } else if (block.kind & block_kind_loop_header && - !(program->blocks[state->phi_block_idx].kind & block_kind_loop_exit)) { - return Operand(program->lane_mask); + Operand op; + if (block.loop_nest_depth < state->loop_nest_depth) { + /* loop-carried value for loop exit phis */ + op = Operand::zero(program->lane_mask.bytes()); + } else if (block.loop_nest_depth > state->loop_nest_depth || pred == 1 || + block.kind & block_kind_loop_exit) { + op = get_ssa(program, block.linear_preds[0], state, false); } else { - Temp res = Temp(program->allocateTmp(program->lane_mask)); - state->latest[block_idx] = Operand(res); + assert(pred > 1); + bool previously_visited = state->visited[block_idx]; + /* potential recursion: anchor at loop header */ + if (block.kind & block_kind_loop_header) { + assert(!previously_visited); + previously_visited = true; + state->visited[block_idx] = true; + state->outputs[block_idx] = Operand(Temp(program->allocateTmp(program->lane_mask))); + } + /* collect predecessor output operands */ + std::vector ops(pred); + for (unsigned i = 0; i < pred; i++) + ops[i] = get_ssa(program, block.linear_preds[i], state, false); + + /* check triviality */ + if (std::all_of(ops.begin() + 1, ops.end(), [&](Operand same) { return same == ops[0]; })) + return ops[0]; + + /* Return if this was handled in a recursive call by a loop header phi */ + if (!previously_visited && state->visited[block_idx]) + return state->outputs[block_idx]; + + if (block.kind & block_kind_loop_header) + op = state->outputs[block_idx]; + else + op = Operand(Temp(program->allocateTmp(program->lane_mask))); + + /* create phi */ aco_ptr phi{ create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; for (unsigned i = 0; i < pred; i++) - phi->operands[i] = get_ssa(program, block.linear_preds[i], state, false); - phi->definitions[0] = Definition(res); + phi->operands[i] = ops[i]; + phi->definitions[0] = Definition(op.getTemp()); 
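/* This is on-the-fly SSA reconstruction over the linear CFG: outputs[]
 * caches each block's value, loop headers get a placeholder phi first to
 * break the recursion, and trivial phis (all operands equal) are folded
 * before anything is emitted. E.g. in a diamond A -> {B, C} -> D where only
 * B writes, get_ssa(D) recurses into B and C and emits
 *
 *   D: out_D = p_linear_phi out_B, out_C
 *
 * unless out_B == out_C, in which case that operand is returned directly. */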
block.instructions.emplace(block.instructions.begin(), std::move(phi)); - - return Operand(res); } + + assert(op.size() == program->lane_mask.size()); + return op; } void @@ -107,79 +135,117 @@ insert_before_logical_end(Block* block, aco_ptr instr) } void -build_merge_code(Program* program, Block* block, Definition dst, Operand prev, Operand cur) +build_merge_code(Program* program, ssa_state* state, Block* block, Operand cur) { - Builder bld(program); + unsigned block_idx = block->index; + Definition dst = Definition(state->outputs[block_idx].getTemp()); + Operand prev = get_ssa(program, block_idx, state, true); + if (cur.isUndefined()) + cur = Operand::zero(program->lane_mask.bytes()); + Builder bld(program); auto IsLogicalEnd = [](const aco_ptr& instr) -> bool { return instr->opcode == aco_opcode::p_logical_end; }; auto it = std::find_if(block->instructions.rbegin(), block->instructions.rend(), IsLogicalEnd); assert(it != block->instructions.rend()); bld.reset(&block->instructions, std::prev(it.base())); - if (prev.isUndefined()) { - bld.copy(dst, cur); + pred_defined defined = state->any_pred_defined[block_idx]; + if (defined == pred_defined::undef) { + return; + } else if (defined == pred_defined::const_0) { + bld.sop2(Builder::s_and, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm)); + return; + } else if (defined == pred_defined::const_1) { + bld.sop2(Builder::s_orn2, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm)); return; } - bool prev_is_constant = prev.isConstant() && prev.constantValue() + 1u < 2u; - bool cur_is_constant = cur.isConstant() && cur.constantValue() + 1u < 2u; - - if (!prev_is_constant) { - if (!cur_is_constant) { - Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm); - bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, - Operand(exec, bld.lm)); - bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), cur, Operand(exec, bld.lm)); - bld.sop2(Builder::s_or, dst, bld.def(s1, scc), tmp1, tmp2); - } else if (cur.constantValue()) { - bld.sop2(Builder::s_or, dst, bld.def(s1, scc), prev, Operand(exec, bld.lm)); + assert(prev.isTemp()); + /* simpler sequence in case prev has only zeros in disabled lanes */ + if ((defined & pred_defined::zero) == pred_defined::zero) { + if (cur.isConstant()) { + if (!cur.constantValue()) { + bld.copy(dst, prev); + return; + } + cur = Operand(exec, bld.lm); } else { - bld.sop2(Builder::s_andn2, dst, bld.def(s1, scc), prev, Operand(exec, bld.lm)); + cur = + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cur, Operand(exec, bld.lm)); } - } else if (prev.constantValue()) { - if (!cur_is_constant) - bld.sop2(Builder::s_orn2, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm)); - else if (cur.constantValue()) - bld.copy(dst, Operand::c32_or_c64(UINT32_MAX, bld.lm == s2)); - else - bld.sop1(Builder::s_not, dst, bld.def(s1, scc), Operand(exec, bld.lm)); - } else { - if (!cur_is_constant) - bld.sop2(Builder::s_and, dst, bld.def(s1, scc), cur, Operand(exec, bld.lm)); - else if (cur.constantValue()) - bld.copy(dst, Operand(exec, bld.lm)); - else - bld.copy(dst, Operand::zero(bld.lm.bytes())); + bld.sop2(Builder::s_or, dst, bld.def(s1, scc), prev, cur); + return; } + + if (cur.isConstant()) { + if (cur.constantValue()) + bld.sop2(Builder::s_or, dst, bld.def(s1, scc), prev, Operand(exec, bld.lm)); + else + bld.sop2(Builder::s_andn2, dst, bld.def(s1, scc), prev, Operand(exec, bld.lm)); + return; + } + prev = + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), prev, Operand(exec, bld.lm)); + cur = 
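/* The general case below computes the standard lane-mask select
 *
 *   dst = (prev & ~exec) | (cur & exec)
 *
 * i.e. active lanes take the new value and inactive lanes keep the old one.
 * The earlier branches are algebraic simplifications: with prev known to be
 * zero in all disabled lanes the s_andn2 term drops out, and a constant cur
 * folds into a single s_or / s_andn2 / s_orn2 against exec. */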
bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cur, Operand(exec, bld.lm)); + bld.sop2(Builder::s_or, dst, bld.def(s1, scc), prev, cur); + return; } void init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr& phi) { - std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), false); + std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), pred_defined::undef); for (unsigned i = 0; i < block->logical_preds.size(); i++) { if (phi->operands[i].isUndefined()) continue; + pred_defined defined = pred_defined::temp; + if (phi->operands[i].isConstant()) + defined = phi->operands[i].constantValue() ? pred_defined::const_1 : pred_defined::const_0; for (unsigned succ : program->blocks[block->logical_preds[i]].linear_succs) - state->any_pred_defined[succ] = true; + state->any_pred_defined[succ] |= defined; } unsigned start = block->logical_preds[0]; + unsigned end = block->index; /* for loop exit phis, start at the loop header */ - const bool loop_exit = block->kind & block_kind_loop_exit; - while (loop_exit && program->blocks[start - 1].loop_nest_depth >= state->loop_nest_depth) - start--; + if (block->kind & block_kind_loop_exit) { + while (program->blocks[start - 1].loop_nest_depth >= state->loop_nest_depth) + start--; + /* If the loop-header has a back-edge, we need to insert a phi. + * This will contain a defined value */ + if (program->blocks[start].linear_preds.size() > 1) + state->any_pred_defined[start] = pred_defined::temp; + } + /* for loop header phis, end at the loop exit */ + if (block->kind & block_kind_loop_header) { + while (program->blocks[end].loop_nest_depth >= state->loop_nest_depth) + end++; + /* don't propagate the incoming value */ + state->any_pred_defined[block->index] = pred_defined::undef; + } - for (unsigned i = 0; i < 1u + loop_exit; i++) { - for (unsigned j = start; j < block->index; j++) { - if (!state->any_pred_defined[j]) - continue; - for (unsigned succ : program->blocks[j].linear_succs) - state->any_pred_defined[succ] = true; + /* add dominating zero: this allows to emit simpler merge sequences + * if we can ensure that all disabled lanes are always zero on incoming values */ + // TODO: find more occasions where pred_defined::zero is beneficial (e.g. 
with 2+ temp merges) + if (block->kind & block_kind_loop_exit) { + /* zero the loop-carried variable */ + if (program->blocks[start].linear_preds.size() > 1) { + state->any_pred_defined[start] |= pred_defined::zero; + // TODO: emit this zero explicitly + state->any_pred_defined[start - 1] = pred_defined::const_0; } } + + for (unsigned j = start; j < end; j++) { + if (state->any_pred_defined[j] == pred_defined::undef) + continue; + for (unsigned succ : program->blocks[j].linear_succs) + state->any_pred_defined[succ] |= state->any_pred_defined[j]; + } + + state->any_pred_defined[block->index] = pred_defined::undef; } void @@ -202,62 +268,28 @@ lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, return; } - state->latest.resize(program->blocks.size()); + /* do this here to avoid resizing in case of no boolean phis */ state->visited.resize(program->blocks.size()); + state->outputs.resize(program->blocks.size()); state->any_pred_defined.resize(program->blocks.size()); - - uint64_t undef_operands = 0; - for (unsigned i = 0; i < phi->operands.size(); i++) - undef_operands |= (uint64_t)phi->operands[i].isUndefined() << i; - - if (state->needs_init || undef_operands != state->cur_undef_operands || - block->logical_preds.size() > 64) { - /* this only has to be done once per block unless the set of predecessors - * which are undefined changes */ - state->cur_undef_operands = undef_operands; - state->phi_block_idx = block->index; - state->loop_nest_depth = block->loop_nest_depth; - if (block->kind & block_kind_loop_exit) { - state->loop_nest_depth += 1; - } - state->writes.clear(); - init_any_pred_defined(program, state, block, phi); - state->needs_init = false; - } - std::fill(state->latest.begin(), state->latest.end(), Operand(program->lane_mask)); + state->loop_nest_depth = block->loop_nest_depth; + if (block->kind & block_kind_loop_exit) + state->loop_nest_depth += 1; std::fill(state->visited.begin(), state->visited.end(), false); + init_any_pred_defined(program, state, block, phi); for (unsigned i = 0; i < phi->operands.size(); i++) { - if (phi->operands[i].isUndefined()) - continue; - - state->writes[block->logical_preds[i]] = program->allocateId(program->lane_mask); + unsigned pred = block->logical_preds[i]; + if (state->any_pred_defined[pred] != pred_defined::undef) + state->outputs[pred] = Operand(bld.tmp(bld.lm)); + else + state->outputs[pred] = phi->operands[i]; + assert(state->outputs[pred].size() == bld.lm.size()); + state->visited[pred] = true; } - bool uniform_merge = block->kind & block_kind_loop_header; - - for (unsigned i = 0; i < phi->operands.size(); i++) { - Block* pred = &program->blocks[block->logical_preds[i]]; - - bool need_get_ssa = !uniform_merge; - if (block->kind & block_kind_loop_header && !(pred->kind & block_kind_uniform)) - uniform_merge = false; - - if (phi->operands[i].isUndefined()) - continue; - - Operand cur(bld.lm); - if (need_get_ssa) - cur = get_ssa(program, pred->index, state, true); - assert(cur.regClass() == bld.lm); - - Temp new_cur = {state->writes.at(pred->index), program->lane_mask}; - assert(new_cur.regClass() == bld.lm); - - if (i == 1 && (block->kind & block_kind_merge) && phi->operands[0].isConstant()) - cur = phi->operands[0]; - build_merge_code(program, pred, Definition(new_cur), cur, phi->operands[i]); - } + for (unsigned i = 0; i < phi->operands.size(); i++) + build_merge_code(program, state, &program->blocks[block->logical_preds[i]], phi->operands[i]); unsigned num_preds = block->linear_preds.size(); if 
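/* pred_defined states are joined with |=, so mixed predecessors degrade
 * gracefully: (const_1 | temp) == temp, and even (const_0 | const_1) ==
 * temp, leaving only the pure constant states for the cheap merge paths in
 * build_merge_code. pred_defined::zero is an orthogonal bit recording that
 * disabled lanes are known to contain zero. */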
(phi->operands.size() != num_preds) { @@ -310,7 +342,6 @@ lower_phis(Program* program) for (Block& block : program->blocks) { state.checked_preds_for_uniform = false; - state.needs_init = true; for (aco_ptr& phi : block.instructions) { if (phi->opcode == aco_opcode::p_phi) { assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 diff --git a/mesa 3D driver/src/amd/compiler/aco_lower_to_cssa.cpp b/mesa 3D driver/src/amd/compiler/aco_lower_to_cssa.cpp index db809867a7..753a869cc4 100644 --- a/mesa 3D driver/src/amd/compiler/aco_lower_to_cssa.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_lower_to_cssa.cpp @@ -384,7 +384,7 @@ struct ltg_node { /* emit the copies in an order that does not * create interferences within a merge-set */ void -emit_copies_block(Builder bld, std::map& ltg, RegType type) +emit_copies_block(Builder& bld, std::map& ltg, RegType type) { auto&& it = ltg.begin(); while (it != ltg.end()) { @@ -445,6 +445,9 @@ emit_parallelcopies(cssa_ctx& ctx) continue; std::map ltg; + bool has_vgpr_copy = false; + bool has_sgpr_copy = false; + /* first, try to coalesce all parallelcopies */ for (const copy& cp : ctx.parallelcopies[i]) { if (try_coalesce_copy(ctx, cp, i)) { @@ -459,6 +462,10 @@ emit_parallelcopies(cssa_ctx& ctx) uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index; assert(write_idx != -1u); ltg[write_idx] = {cp, read_idx}; + + bool is_vgpr = cp.def.regClass().type() == RegType::vgpr; + has_vgpr_copy |= is_vgpr; + has_sgpr_copy |= !is_vgpr; } } @@ -475,19 +482,24 @@ emit_parallelcopies(cssa_ctx& ctx) Builder bld(ctx.program); Block& block = ctx.program->blocks[i]; - /* emit VGPR copies */ - auto IsLogicalEnd = [](const aco_ptr& inst) -> bool - { return inst->opcode == aco_opcode::p_logical_end; }; - auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd); - bld.reset(&block.instructions, std::prev(it.base())); - emit_copies_block(bld, ltg, RegType::vgpr); + if (has_vgpr_copy) { + /* emit VGPR copies */ + auto IsLogicalEnd = [](const aco_ptr& inst) -> bool + { return inst->opcode == aco_opcode::p_logical_end; }; + auto it = + std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd); + bld.reset(&block.instructions, std::prev(it.base())); + emit_copies_block(bld, ltg, RegType::vgpr); + } - /* emit SGPR copies */ - aco_ptr branch = std::move(block.instructions.back()); - block.instructions.pop_back(); - bld.reset(&block.instructions); - emit_copies_block(bld, ltg, RegType::sgpr); - bld.insert(std::move(branch)); + if (has_sgpr_copy) { + /* emit SGPR copies */ + aco_ptr branch = std::move(block.instructions.back()); + block.instructions.pop_back(); + bld.reset(&block.instructions); + emit_copies_block(bld, ltg, RegType::sgpr); + bld.insert(std::move(branch)); + } } /* finally, rename coalesced phi operands */ diff --git a/mesa 3D driver/src/amd/compiler/aco_lower_to_hw_instr.cpp b/mesa 3D driver/src/amd/compiler/aco_lower_to_hw_instr.cpp index 8a9db76aba..df94f21db8 100644 --- a/mesa 3D driver/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -516,11 +516,9 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(PhysReg{tmp}, v1); sdwa->definitions[0] = Definition(PhysReg{tmp}, v1); - if (reduce_op == imin8 || reduce_op == imax8) - sdwa->sel[0] = sdwa_sbyte; - else - sdwa->sel[0] = sdwa_ubyte; - sdwa->dst_sel = sdwa_udword; 
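/* Rough equivalences between the removed sel enum values and the new
 * constructor calls in this hunk (new named constants shown for clarity):
 *
 *   SubdwordSel(1, 0, false) == SubdwordSel::ubyte0   (was sdwa_ubyte)
 *   SubdwordSel(1, 0, true)  == SubdwordSel::sbyte0   (was sdwa_sbyte)
 *   SubdwordSel(2, 0, true)  == SubdwordSel::sword0   (was sdwa_sword)
 *   SubdwordSel::dword                                (was sdwa_udword) */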
+ bool sext = reduce_op == imin8 || reduce_op == imax8; + sdwa->sel[0] = SubdwordSel(1, 0, sext); + sdwa->dst_sel = SubdwordSel::dword; bld.insert(std::move(sdwa)); } else { aco_opcode opcode; @@ -541,11 +539,9 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; sdwa->operands[0] = Operand(PhysReg{tmp}, v1); sdwa->definitions[0] = Definition(PhysReg{tmp}, v1); - if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16) - sdwa->sel[0] = sdwa_sword; - else - sdwa->sel[0] = sdwa_uword; - sdwa->dst_sel = sdwa_udword; + bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16; + sdwa->sel[0] = SubdwordSel(2, 0, sext); + sdwa->dst_sel = SubdwordSel::dword; bld.insert(std::move(sdwa)); } else if (ctx->program->chip_class == GFX6 || ctx->program->chip_class == GFX7) { aco_opcode opcode; @@ -998,16 +994,13 @@ split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op, break; } - RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) - : RegClass(src.def.regClass().type(), bytes).as_subdword(); - *def = Definition(src.def.tempId(), def_reg, def_cls); + *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes)); if (src.op.isConstant()) { assert(bytes >= 1 && bytes <= 8); uint64_t val = src.op.constantValue64() >> (offset * 8u); *op = Operand::get_const(ctx->program->chip_class, val, bytes); } else { - RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) - : RegClass(src.op.regClass().type(), bytes).as_subdword(); + RegClass op_cls = src.op.regClass().resize(bytes); *op = Operand(op_reg, op_cls); op->setTemp(Temp(src.op.tempId(), op_cls)); } @@ -1120,6 +1113,54 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) } } +void +copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr) +{ + if (preserve_scc) + bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1)); + + for (unsigned i = 0; i < 2; i++) { + if (def.size() == 2) + bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op); + else + bld.vop1(aco_opcode::v_mov_b32, def, op); + + bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), + Operand(exec, bld.lm)); + } + + if (preserve_scc) + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1), + Operand::zero()); +} + +void +swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr) +{ + if (preserve_scc) + bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1)); + + Operand def_as_op = Operand(def.physReg(), def.regClass()); + Definition op_as_def = Definition(op.physReg(), op.regClass()); + + for (unsigned i = 0; i < 2; i++) { + if (bld.program->chip_class >= GFX9) { + bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op); + } else { + bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op); + bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op); + bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op); + } + + bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), + Operand(exec, bld.lm)); + } + + if (preserve_scc) + bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1), + Operand::zero()); +} + bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc, PhysReg scratch_sgpr) @@ -1140,6 
+1181,8 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres *preserve_scc = true; } else if (op.isConstant()) { copy_constant(ctx, bld, def, op); + } else if (def.regClass().is_linear_vgpr()) { + copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr); } else if (def.regClass() == v1) { bld.vop1(aco_opcode::v_mov_b32, def, op); } else if (def.regClass() == v2) { @@ -1239,7 +1282,9 @@ do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool prese assert(op.regClass() == def.regClass()); Operand def_as_op = Operand(def.physReg(), def.regClass()); Definition op_as_def = Definition(op.physReg(), op.regClass()); - if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) { + if (def.regClass().is_linear_vgpr()) { + swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr); + } else if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) { bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op); } else if (def.regClass() == v1) { assert(def.physReg().byte() == 0 && op.physReg().byte() == 0); @@ -1429,18 +1474,20 @@ try_coalesce_copies(lower_context* ctx, std::map& copy_ if (copy.op.isConstant()) { uint64_t val = copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u)); - if (!Operand::is_constant_representable(val, copy.bytes + other->second.bytes, true, + if (!util_is_power_of_two_or_zero(new_size)) + return; + if (!Operand::is_constant_representable(val, new_size, true, copy.def.regClass().type() == RegType::vgpr)) return; copy.op = Operand::get_const(ctx->program->chip_class, val, new_size); } else { if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes)) return; - copy.op = Operand(copy.op.physReg(), RegClass::get(copy.op.regClass().type(), new_size)); + copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size)); } copy.bytes = new_size; - copy.def = Definition(copy.def.physReg(), RegClass::get(copy.def.regClass().type(), copy.bytes)); + copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes)); copy_map.erase(other); } @@ -1728,15 +1775,14 @@ handle_operands(std::map& copy_map, lower_context* ctx, PhysReg src = swap.op.physReg(), dst = swap.def.physReg(); if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) { unsigned offset = abs((int)src.reg_b - (int)dst.reg_b); - RegType type = swap.def.regClass().type(); copy_operation remaining; src.reg_b += offset; dst.reg_b += offset; remaining.bytes = swap.bytes - offset; memcpy(remaining.uses, swap.uses + offset, remaining.bytes); - remaining.op = Operand(src, RegClass::get(type, remaining.bytes)); - remaining.def = Definition(dst, RegClass::get(type, remaining.bytes)); + remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes)); + remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes)); copy_map[dst] = remaining; memset(swap.uses + offset, 0, swap.bytes - offset); @@ -1784,7 +1830,7 @@ handle_operands(std::map& copy_map, lower_context* ctx, copy_operation copy; copy.bytes = after_bytes; memcpy(copy.uses, target->second.uses + after_offset, copy.bytes); - RegClass rc = RegClass::get(target->second.op.regClass().type(), after_bytes); + RegClass rc = target->second.op.regClass().resize(after_bytes); copy.op = Operand(target->second.op.physReg().advance(after_offset), rc); copy.def = Definition(target->second.def.physReg().advance(after_offset), rc); copy_map[copy.def.physReg()] = copy; @@ -1794,7 +1840,7 @@ handle_operands(std::map& copy_map, 
lower_context* ctx, copy_operation copy; copy.bytes = middle_bytes; memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes); - RegClass rc = RegClass::get(target->second.op.regClass().type(), middle_bytes); + RegClass rc = target->second.op.regClass().resize(middle_bytes); copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc); copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc); copy_map[copy.def.physReg()] = copy; @@ -1803,7 +1849,7 @@ handle_operands(std::map& copy_map, lower_context* ctx, if (before_bytes) { copy_operation copy; target->second.bytes = before_bytes; - RegClass rc = RegClass::get(target->second.op.regClass().type(), before_bytes); + RegClass rc = target->second.op.regClass().resize(before_bytes); target->second.op = Operand(target->second.op.physReg(), rc); target->second.def = Definition(target->second.def.physReg(), rc); memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes); @@ -1902,7 +1948,7 @@ lower_to_hw_instr(Program* program) for (const Operand& op : instr->operands) { if (op.isConstant()) { const Definition def = Definition( - reg, RegClass(instr->definitions[0].getTemp().type(), op.size())); + reg, instr->definitions[0].getTemp().regClass().resize(op.bytes())); copy_operations[reg] = {op, def, op.bytes()}; reg.reg_b += op.bytes(); continue; @@ -1916,7 +1962,7 @@ lower_to_hw_instr(Program* program) RegClass rc_def = op.regClass().is_subdword() ? op.regClass() - : RegClass(instr->definitions[0].getTemp().type(), op.size()); + : instr->definitions[0].getTemp().regClass().resize(op.bytes()); const Definition def = Definition(reg, rc_def); copy_operations[def.physReg()] = {op, def, op.bytes()}; reg.reg_b += op.bytes(); @@ -1931,7 +1977,7 @@ lower_to_hw_instr(Program* program) for (const Definition& def : instr->definitions) { RegClass rc_op = def.regClass().is_subdword() ? def.regClass() - : RegClass(instr->operands[0].getTemp().type(), def.size()); + : instr->operands[0].getTemp().regClass().resize(def.bytes()); const Operand op = Operand(reg, rc_op); copy_operations[def.physReg()] = {op, def, def.bytes()}; reg.reg_b += def.bytes(); @@ -2083,7 +2129,8 @@ lower_to_hw_instr(Program* program) bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op, Operand::c32((bits << 16) | offset)); } - } else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) { + } else if ((dst.regClass() == v1 && op.regClass() == v1) || + ctx.program->chip_class <= GFX7) { assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0); if (offset == (32 - bits) && op.regClass() != s1) { bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst, @@ -2092,18 +2139,12 @@ lower_to_hw_instr(Program* program) bld.vop3(signext ? 
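/* p_extract dst, op, index, bits, signext selects the index-th bits-wide
 * field of op, i.e. offset = index * bits. Extracting the unsigned byte 2
 * of a VGPR lowers on this path to
 *
 *   v_bfe_u32 dst, op, 16, 8
 *
 * while the SDWA path below expresses the same selection as a v_mov_b32
 * with sel[0] = SubdwordSel(1, 2, false). */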
aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op, Operand::c32(offset), Operand::c32(bits)); } - } else if (dst.regClass() == v2b) { - aco_ptr sdwa{create_instruction( - aco_opcode::v_mov_b32, - (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; - sdwa->operands[0] = Operand(op.physReg().advance(-op.physReg().byte()), - RegClass::get(op.regClass().type(), 4)); - sdwa->definitions[0] = dst; - sdwa->sel[0] = sdwa_ubyte0 + op.physReg().byte() + index; - if (signext) - sdwa->sel[0] |= sdwa_sext; - sdwa->dst_sel = sdwa_uword; - bld.insert(std::move(sdwa)); + } else { + assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b || + op.regClass() == v1b); + SDWA_instruction& sdwa = + bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op).instr->sdwa(); + sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext); } break; } @@ -2138,14 +2179,8 @@ lower_to_hw_instr(Program* program) bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits)); } else if (program->chip_class >= GFX9 || (op.regClass() != s1 && program->chip_class >= GFX8)) { - aco_ptr sdwa{create_instruction( - aco_opcode::v_mov_b32, - (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; - sdwa->operands[0] = op; - sdwa->definitions[0] = dst; - sdwa->sel[0] = sdwa_udword; - sdwa->dst_sel = (bits == 8 ? sdwa_ubyte0 : sdwa_uword0) + (offset / bits); - bld.insert(std::move(sdwa)); + bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op).instr->sdwa().dst_sel = + SubdwordSel(bits / 8, offset / 8, false); } else { bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits)); bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), @@ -2153,16 +2188,9 @@ lower_to_hw_instr(Program* program) } } else { assert(dst.regClass() == v2b); - aco_ptr sdwa{create_instruction( - aco_opcode::v_mov_b32, - (Format)((uint16_t)Format::VOP1 | (uint16_t)Format::SDWA), 1, 1)}; - sdwa->operands[0] = op; - sdwa->definitions[0] = - Definition(dst.physReg().advance(-dst.physReg().byte()), v1); - sdwa->sel[0] = sdwa_uword; - sdwa->dst_sel = sdwa_ubyte0 + dst.physReg().byte() + index; - sdwa->dst_preserve = 1; - bld.insert(std::move(sdwa)); + bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op) + .instr->sdwa() + .sel[1] = SubdwordSel::ubyte; } break; } diff --git a/mesa 3D driver/src/amd/compiler/aco_opcodes.py b/mesa 3D driver/src/amd/compiler/aco_opcodes.py index d9ab6a435e..bb02718045 100644 --- a/mesa 3D driver/src/amd/compiler/aco_opcodes.py +++ b/mesa 3D driver/src/amd/compiler/aco_opcodes.py @@ -178,9 +178,8 @@ class Format(Enum): res = '' if self == Format.SDWA: for i in range(min(num_operands, 2)): - res += 'instr->sel[{0}] = op{0}.op.bytes() == 2 ? sdwa_uword : (op{0}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'.format(i) - res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? 
sdwa_ubyte : sdwa_udword);\n' - res += 'if (def0.bytes() < 4) instr->dst_preserve = true;' + res += 'instr->sel[{0}] = SubdwordSel(op{0}.op.bytes(), 0, false);'.format(i) + res += 'instr->dst_sel = SubdwordSel(def0.bytes(), 0, false);\n' return res @@ -219,20 +218,15 @@ class Opcode(object): parts = name.replace('_e64', '').rsplit('_', 2) op_dtype = parts[-1] - def_dtype = parts[-2] if len(parts) > 1 else parts[-1] - def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]} - op_dtype_sizes = {k:v for k, v in def_dtype_sizes.items()} + op_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]} # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841 op_dtype_sizes['b16'] = 32 op_dtype_sizes['i16'] = 32 op_dtype_sizes['u16'] = 32 - # If we can't tell the definition size and the operand size, default to - # 32. Some opcodes can have a larger definition size, but - # get_subdword_definition_info() handles that. + # If we can't tell the operand size, default to 32. self.operand_size = op_dtype_sizes.get(op_dtype, 32) - self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size) # exceptions for operands: if 'qsad_' in name: @@ -249,15 +243,6 @@ class Opcode(object): 'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']: self.operand_size = 32 - # exceptions for definitions: - if 'qsad_' in name: - self.definition_size = 0 - elif 'sad_' in name: - self.definition_size = 32 - elif '_pk' in name: - self.definition_size = 32 - - # global dictionary of opcodes opcodes = {} @@ -695,6 +680,7 @@ VOP2 = { (0x0a, 0x0a, 0x07, 0x07, 0x0a, "v_mul_hi_i32_i24", False), (0x0b, 0x0b, 0x08, 0x08, 0x0b, "v_mul_u32_u24", False), (0x0c, 0x0c, 0x09, 0x09, 0x0c, "v_mul_hi_u32_u24", False), + ( -1, -1, -1, 0x39, 0x0d, "v_dot4c_i32_i8", False), (0x0d, 0x0d, -1, -1, -1, "v_min_legacy_f32", True), (0x0e, 0x0e, -1, -1, -1, "v_max_legacy_f32", True), (0x0f, 0x0f, 0x0a, 0x0a, 0x0f, "v_min_f32", True), @@ -978,6 +964,10 @@ VOPP = { # (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name) for (code, name, modifiers) in VOPP: opcode(name, -1, code, code, Format.VOP3P, InstrClass.Valu32, modifiers, modifiers) +opcode("v_dot2_i32_i16", -1, 0x26, 0x14, Format.VOP3P, InstrClass.Valu32) +opcode("v_dot2_u32_u16", -1, 0x27, 0x15, Format.VOP3P, InstrClass.Valu32) +opcode("v_dot4_i32_i8", -1, 0x28, 0x16, Format.VOP3P, InstrClass.Valu32) +opcode("v_dot4_u32_u8", -1, 0x29, 0x17, Format.VOP3P, InstrClass.Valu32) # VINTERP instructions: @@ -1701,7 +1691,3 @@ for ver in ['gfx9', 'gfx10']: else: op_to_name[key] = op.name -# These instructions write the entire 32-bit VGPR, but it's not clear in Opcode's constructor that -# it should be 32, since it works accidentally. 
-assert(opcodes['ds_read_u8'].definition_size == 32) -assert(opcodes['ds_read_u16'].definition_size == 32) diff --git a/mesa 3D driver/src/amd/compiler/aco_opcodes_cpp.py b/mesa 3D driver/src/amd/compiler/aco_opcodes_cpp.py index a70bec934f..da6cb8ad3d 100644 --- a/mesa 3D driver/src/amd/compiler/aco_opcodes_cpp.py +++ b/mesa 3D driver/src/amd/compiler/aco_opcodes_cpp.py @@ -70,11 +70,6 @@ extern const aco::Info instr_info = { ${opcodes[name].operand_size}, % endfor }, - .definition_size = { - % for name in opcode_names: - ${opcodes[name].definition_size}, - % endfor - }, .classes = { % for name in opcode_names: (instr_class)${opcodes[name].cls.value}, diff --git a/mesa 3D driver/src/amd/compiler/aco_opt_value_numbering.cpp b/mesa 3D driver/src/amd/compiler/aco_opt_value_numbering.cpp index 32c0bd8a12..d650665e0d 100644 --- a/mesa 3D driver/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_opt_value_numbering.cpp @@ -187,8 +187,8 @@ struct InstrPred { return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] && aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] && aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] && - aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.dst_preserve == bSDWA.dst_preserve && - aSDWA.clamp == bSDWA.clamp && aSDWA.omod == bSDWA.omod; + aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.clamp == bSDWA.clamp && + aSDWA.omod == bSDWA.omod; } switch (a->format) { diff --git a/mesa 3D driver/src/amd/compiler/aco_optimizer.cpp b/mesa 3D driver/src/amd/compiler/aco_optimizer.cpp index a4d84a3a4e..8d88d08e2d 100644 --- a/mesa 3D driver/src/amd/compiler/aco_optimizer.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_optimizer.cpp @@ -22,6 +22,7 @@ * */ +#include "aco_builder.h" #include "aco_ir.h" #include "util/half_float.h" @@ -119,11 +120,12 @@ enum Label { label_canonicalized = 1ull << 32, label_extract = 1ull << 33, label_insert = 1ull << 34, + label_dpp = 1ull << 35, }; static constexpr uint64_t instr_usedef_labels = label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise | - label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract; + label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp; static constexpr uint64_t instr_mod_labels = label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert; @@ -452,6 +454,14 @@ struct ssa_info { } bool is_insert() { return label & label_insert; } + + void set_dpp(Instruction* mov) + { + add_label(label_dpp); + instr = mov; + } + + bool is_dpp() { return label & label_dpp; } }; struct opt_ctx { @@ -464,74 +474,6 @@ struct opt_ctx { std::vector uses; }; -struct CmpInfo { - aco_opcode ordered; - aco_opcode unordered; - aco_opcode ordered_swapped; - aco_opcode unordered_swapped; - aco_opcode inverse; - aco_opcode f32; - unsigned size; -}; - -ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo* info); - -bool -can_swap_operands(aco_ptr& instr) -{ - if (instr->operands[0].isConstant() || - (instr->operands[0].isTemp() && instr->operands[0].getTemp().type() == RegType::sgpr)) - return false; - - switch (instr->opcode) { - case aco_opcode::v_add_u32: - case aco_opcode::v_add_co_u32: - case aco_opcode::v_add_co_u32_e64: - case aco_opcode::v_add_i32: - case aco_opcode::v_add_f16: - case aco_opcode::v_add_f32: - case aco_opcode::v_mul_f16: - case aco_opcode::v_mul_f32: - case aco_opcode::v_or_b32: - case aco_opcode::v_and_b32: - case aco_opcode::v_xor_b32: - case aco_opcode::v_max_f16: - case 
aco_opcode::v_max_f32: - case aco_opcode::v_min_f16: - case aco_opcode::v_min_f32: - case aco_opcode::v_max_i32: - case aco_opcode::v_min_i32: - case aco_opcode::v_max_u32: - case aco_opcode::v_min_u32: - case aco_opcode::v_max_i16: - case aco_opcode::v_min_i16: - case aco_opcode::v_max_u16: - case aco_opcode::v_min_u16: - case aco_opcode::v_max_i16_e64: - case aco_opcode::v_min_i16_e64: - case aco_opcode::v_max_u16_e64: - case aco_opcode::v_min_u16_e64: return true; - case aco_opcode::v_sub_f16: instr->opcode = aco_opcode::v_subrev_f16; return true; - case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true; - case aco_opcode::v_sub_co_u32: instr->opcode = aco_opcode::v_subrev_co_u32; return true; - case aco_opcode::v_sub_u16: instr->opcode = aco_opcode::v_subrev_u16; return true; - case aco_opcode::v_sub_u32: instr->opcode = aco_opcode::v_subrev_u32; return true; - default: { - CmpInfo info; - get_cmp_info(instr->opcode, &info); - if (info.ordered == instr->opcode) { - instr->opcode = info.ordered_swapped; - return true; - } - if (info.unordered == instr->opcode) { - instr->opcode = info.unordered_swapped; - return true; - } - return false; - } - } -} - bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) { @@ -618,10 +560,11 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr& instr, Temp temp, unsi return true; } +/* This expects the DPP modifier to be removed. */ bool can_apply_sgprs(opt_ctx& ctx, aco_ptr& instr) { - if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP()) + if (instr->isSDWA() && ctx.program->chip_class < GFX9) return false; return instr->opcode != aco_opcode::v_readfirstlane_b32 && instr->opcode != aco_opcode::v_readlane_b32 && @@ -842,36 +785,33 @@ fixed_to_exec(Operand op) return op.isFixed() && op.physReg() == exec; } -int +SubdwordSel parse_extract(Instruction* instr) { if (instr->opcode == aco_opcode::p_extract) { - bool is_byte = instr->operands[2].constantEquals(8); - unsigned index = instr->operands[1].constantValue(); - unsigned sel = (is_byte ? sdwa_ubyte0 : sdwa_uword0) + index; - if (!instr->operands[3].constantEquals(0)) - sel |= sdwa_sext; - return sel; + unsigned size = instr->operands[2].constantValue() / 8; + unsigned offset = instr->operands[1].constantValue() * size; + bool sext = instr->operands[3].constantEquals(1); + return SubdwordSel(size, offset, sext); } else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) { - return instr->operands[2].constantEquals(8) ? sdwa_ubyte0 : sdwa_uword0; + return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword; } else { - return -1; + return SubdwordSel(); } } -int +SubdwordSel parse_insert(Instruction* instr) { if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) && instr->operands[1].constantEquals(0)) { - return instr->operands[2].constantEquals(8) ? sdwa_ubyte0 : sdwa_uword0; + return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword; } else if (instr->opcode == aco_opcode::p_insert) { - bool is_byte = instr->operands[2].constantEquals(8); - unsigned index = instr->operands[1].constantValue(); - unsigned sel = (is_byte ? 
sdwa_ubyte0 : sdwa_uword0) + index; - return sel; + unsigned size = instr->operands[2].constantValue() / 8; + unsigned offset = instr->operands[1].constantValue() * size; + return SubdwordSel(size, offset, false); } else { - return -1; + return SubdwordSel(); } } @@ -882,20 +822,21 @@ can_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_i return false; Temp tmp = info.instr->operands[0].getTemp(); - unsigned sel = parse_extract(info.instr); + SubdwordSel sel = parse_extract(info.instr); - if (sel == sdwa_udword || sel == sdwa_sdword) { + if (!sel) { + return false; + } else if (sel.size() == 4) { return true; - } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) { + } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) { return true; } else if (can_use_SDWA(ctx.program->chip_class, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) { - if (instr->isSDWA() && - (static_cast<SDWA_instruction*>(instr.get())->sel[idx] & sdwa_asuint) != sdwa_udword) + if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword) return false; return true; - } else if (instr->isVOP3() && (sel & sdwa_isword) && - can_use_opsel(ctx.program->chip_class, instr->opcode, idx, (sel & sdwa_wordnum)) && + } else if (instr->isVOP3() && sel.size() == 2 && + can_use_opsel(ctx.program->chip_class, instr->opcode, idx, sel.offset()) && !(instr->vop3().opsel & (1 << idx))) { return true; } else { @@ -910,26 +851,38 @@ void apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info) { Temp tmp = info.instr->operands[0].getTemp(); - unsigned sel = parse_extract(info.instr); + SubdwordSel sel = parse_extract(info.instr); + assert(sel); - if (sel == sdwa_udword || sel == sdwa_sdword) { - } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel <= sdwa_ubyte3) { - switch (sel) { - case sdwa_ubyte0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; - case sdwa_ubyte1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; - case sdwa_ubyte2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; - case sdwa_ubyte3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; + instr->operands[idx].set16bit(false); + instr->operands[idx].set24bit(false); + + ctx.info[tmp.id()].label &= ~label_insert; + + if (sel.size() == 4) { + /* full dword selection */ + } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) { + switch (sel.offset()) { + case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; + case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; + case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; + case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; } + } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() && + sel.offset() == 0 && + ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) || + (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) { + /* The undesirable upper bits are already shifted out. 
*/ + return; } else if (can_use_SDWA(ctx.program->chip_class, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) { to_SDWA(ctx, instr); static_cast(instr.get())->sel[idx] = sel; } else if (instr->isVOP3()) { - if (sel & sdwa_wordnum) + if (sel.offset()) instr->vop3().opsel |= 1 << idx; } - ctx.info[tmp.id()].label &= ~label_insert; /* label_vopc seems to be the only one worth keeping at the moment */ for (Definition& def : instr->definitions) ctx.info[def.tempId()].label &= label_vopc; @@ -938,10 +891,6 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& void check_sdwa_extract(opt_ctx& ctx, aco_ptr& instr) { - /* only VALU can use SDWA */ - if (!instr->isVALU()) - return; - for (unsigned i = 0; i < instr->operands.size(); i++) { Operand op = instr->operands[i]; if (!op.isTemp()) @@ -1069,6 +1018,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */ if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) && instr->operands.size() == 1) { + instr->format = withoutDPP(instr->format); instr->operands[i].setTemp(info.temp); info = ctx.info[info.temp.id()]; } @@ -1080,7 +1030,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode]; if (instr->isSDWA()) - can_use_mod = can_use_mod && (instr->sdwa().sel[i] & sdwa_asuint) == sdwa_udword; + can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4; else can_use_mod = can_use_mod && (instr->isDPP() || can_use_VOP3(ctx, instr)); @@ -1114,6 +1064,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) instr->vop3().abs[i] = true; continue; } + unsigned bits = get_operand_size(instr, i); if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) && (!instr->isSDWA() || ctx.program->chip_class >= GFX9)) { @@ -1123,9 +1074,10 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) if (i == 0 || instr->isSDWA() || instr->isVOP3P() || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) { + instr->format = withoutDPP(instr->format); instr->operands[i] = op; continue; - } else if (!instr->isVOP3() && can_swap_operands(instr)) { + } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) { instr->operands[i] = instr->operands[0]; instr->operands[0] = op; continue; @@ -1472,10 +1424,18 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) assert(instr->operands[0].isFixed()); } break; + case aco_opcode::v_mov_b32: + if (instr->isDPP()) { + /* anything else doesn't make sense in SSA */ + assert(instr->dpp().row_mask == 0xf && instr->dpp().bank_mask == 0xf); + ctx.info[instr->definitions[0].tempId()].set_dpp(instr.get()); + } + break; case aco_opcode::p_is_helper: if (!ctx.program->needs_wqm) ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u); break; + case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break; case aco_opcode::v_mul_f16: case aco_opcode::v_mul_f32: { /* omod */ ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); @@ -1531,11 +1491,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) break; } case aco_opcode::v_mul_lo_u16: - if (instr->definitions[0].isNUW()) { - /* Most of 16-bit mul optimizations are only valid if no overflow. 
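 */

/* Reference semantics assumed for the 24-bit multiply labeled here as a
 * usedef: v_mul_u32_u24 multiplies only the low 24 bits of each operand, so
 * it matches a full 32-bit multiply exactly when both operands are known to
 * fit in 24 bits (such as the 16-bit values these labels track).
 */
#include <cassert>
#include <cstdint>

static uint32_t v_mul_u32_u24_ref(uint32_t a, uint32_t b)
{
   return (a & 0xffffffu) * (b & 0xffffffu); /* result truncated to 32 bits */
}

int main()
{
   assert(v_mul_u32_u24_ref(0x1234u, 0xabcdu) == 0x1234u * 0xabcdu); /* both fit */
   assert(v_mul_u32_u24_ref(0x1000000u, 2u) != 0x1000000u * 2u);     /* 25-bit operand */
}

/*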
*/ - ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); - } - break; + case aco_opcode::v_mul_lo_u16_e64: case aco_opcode::v_mul_u32_u24: ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); break; @@ -1724,7 +1680,7 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) case aco_opcode::p_extract: { if (instr->definitions[0].bytes() == 4) { ctx.info[instr->definitions[0].tempId()].set_extract(instr.get()); - if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()) >= 0) + if (instr->operands[0].regClass() == v1 && parse_insert(instr.get())) ctx.info[instr->operands[0].tempId()].set_insert(instr.get()); } break; @@ -1733,12 +1689,19 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) if (instr->operands[0].bytes() == 4) { if (instr->operands[0].regClass() == v1) ctx.info[instr->operands[0].tempId()].set_insert(instr.get()); - if (parse_extract(instr.get()) >= 0) + if (parse_extract(instr.get())) ctx.info[instr->definitions[0].tempId()].set_extract(instr.get()); ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get()); } break; } + case aco_opcode::ds_read_u8: + case aco_opcode::ds_read_u8_d16: + case aco_opcode::ds_read_u16: + case aco_opcode::ds_read_u16_d16: { + ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); + break; + } default: break; } @@ -1748,102 +1711,6 @@ label_instruction(opt_ctx& ctx, aco_ptr& instr) check_sdwa_extract(ctx, instr); } -ALWAYS_INLINE bool -get_cmp_info(aco_opcode op, CmpInfo* info) -{ - info->ordered = aco_opcode::num_opcodes; - info->unordered = aco_opcode::num_opcodes; - info->ordered_swapped = aco_opcode::num_opcodes; - info->unordered_swapped = aco_opcode::num_opcodes; - switch (op) { - // clang-format off -#define CMP2(ord, unord, ord_swap, unord_swap, sz) \ - case aco_opcode::v_cmp_##ord##_f##sz: \ - case aco_opcode::v_cmp_n##unord##_f##sz: \ - info->ordered = aco_opcode::v_cmp_##ord##_f##sz; \ - info->unordered = aco_opcode::v_cmp_n##unord##_f##sz; \ - info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz; \ - info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz; \ - info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \ - : aco_opcode::v_cmp_n##ord##_f##sz; \ - info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 \ - : aco_opcode::v_cmp_n##unord##_f32; \ - info->size = sz; \ - return true; -#define CMP(ord, unord, ord_swap, unord_swap) \ - CMP2(ord, unord, ord_swap, unord_swap, 16) \ - CMP2(ord, unord, ord_swap, unord_swap, 32) \ - CMP2(ord, unord, ord_swap, unord_swap, 64) - CMP(lt, /*n*/ge, gt, /*n*/le) - CMP(eq, /*n*/lg, eq, /*n*/lg) - CMP(le, /*n*/gt, ge, /*n*/lt) - CMP(gt, /*n*/le, lt, /*n*/le) - CMP(lg, /*n*/eq, lg, /*n*/eq) - CMP(ge, /*n*/lt, le, /*n*/gt) -#undef CMP -#undef CMP2 -#define ORD_TEST(sz) \ - case aco_opcode::v_cmp_u_f##sz: \ - info->f32 = aco_opcode::v_cmp_u_f32; \ - info->inverse = aco_opcode::v_cmp_o_f##sz; \ - info->size = sz; \ - return true; \ - case aco_opcode::v_cmp_o_f##sz: \ - info->f32 = aco_opcode::v_cmp_o_f32; \ - info->inverse = aco_opcode::v_cmp_u_f##sz; \ - info->size = sz; \ - return true; - ORD_TEST(16) - ORD_TEST(32) - ORD_TEST(64) -#undef ORD_TEST - // clang-format on - default: return false; - } -} - -aco_opcode -get_ordered(aco_opcode op) -{ - CmpInfo info; - return get_cmp_info(op, &info) ? info.ordered : aco_opcode::num_opcodes; -} - -aco_opcode -get_unordered(aco_opcode op) -{ - CmpInfo info; - return get_cmp_info(op, &info) ? 
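/* (What the removed table encoded, now provided by shared helpers: every
 * ordered float comparison pairs with an unordered one that also accepts NaN,
 * e.g. v_cmp_lt with v_cmp_nge:
 *
 *   v_cmp_lt(a, b)  =  a < b      -- false when either input is NaN
 *   v_cmp_nge(a, b) = !(a >= b)   -- true  when either input is NaN
 *
 * On NaN-free inputs the pair agrees, and the inverse entry maps an opcode to
 * the opposite family, e.g. inverse(v_cmp_lt) = v_cmp_nlt.)
 */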
info.unordered : aco_opcode::num_opcodes; -} - -aco_opcode -get_inverse(aco_opcode op) -{ - CmpInfo info; - return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes; -} - -aco_opcode -get_f32_cmp(aco_opcode op) -{ - CmpInfo info; - return get_cmp_info(op, &info) ? info.f32 : aco_opcode::num_opcodes; -} - -unsigned -get_cmp_bitsize(aco_opcode op) -{ - CmpInfo info; - return get_cmp_info(op, &info) ? info.size : 0; -} - -bool -is_cmp(aco_opcode op) -{ - CmpInfo info; - return get_cmp_info(op, &info) && info.ordered != aco_opcode::num_opcodes; -} - unsigned original_temp_id(opt_ctx& ctx, Temp tmp) { @@ -2226,10 +2093,20 @@ combine_inverse_comparison(opt_ctx& ctx, aco_ptr& instr) memcpy(new_sdwa->sel, cmp_sdwa.sel, sizeof(new_sdwa->sel)); memcpy(new_sdwa->neg, cmp_sdwa.neg, sizeof(new_sdwa->neg)); new_sdwa->dst_sel = cmp_sdwa.dst_sel; - new_sdwa->dst_preserve = cmp_sdwa.dst_preserve; new_sdwa->clamp = cmp_sdwa.clamp; new_sdwa->omod = cmp_sdwa.omod; new_instr = new_sdwa; + } else if (cmp->isDPP()) { + DPP_instruction* new_dpp = create_instruction( + new_opcode, (Format)((uint16_t)Format::DPP | (uint16_t)Format::VOPC), 2, 1); + DPP_instruction& cmp_dpp = cmp->dpp(); + memcpy(new_dpp->abs, cmp_dpp.abs, sizeof(new_dpp->abs)); + memcpy(new_dpp->neg, cmp_dpp.neg, sizeof(new_dpp->neg)); + new_dpp->dpp_ctrl = cmp_dpp.dpp_ctrl; + new_dpp->row_mask = cmp_dpp.row_mask; + new_dpp->bank_mask = cmp_dpp.bank_mask; + new_dpp->bound_ctrl = cmp_dpp.bound_ctrl; + new_instr = new_dpp; } else { new_instr = create_instruction(new_opcode, Format::VOPC, 2, 1); instr->definitions[0].setHint(vcc); @@ -2269,6 +2146,8 @@ match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op if (op1_instr->isSDWA() || op2_instr->isSDWA()) return false; + if (op1_instr->isDPP() || op2_instr->isDPP()) + return false; /* don't support inbetween clamp/omod */ if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod)) @@ -2380,7 +2259,7 @@ combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr& instr) if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2)) return true; - if (instr->isSDWA()) + if (instr->isSDWA() || instr->isDPP()) return false; /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b) @@ -2640,7 +2519,7 @@ combine_add_bcnt(opt_ctx& ctx, aco_ptr& instr) for (unsigned i = 0; i < 2; i++) { Instruction* op_instr = follow_operand(ctx, instr->operands[i]); if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 && - op_instr->operands[0].isTemp() && + !op_instr->usesModifiers() && op_instr->operands[0].isTemp() && op_instr->operands[0].getTemp().type() == RegType::vgpr && op_instr->operands[1].constantEquals(0)) { aco_ptr new_instr{ @@ -2876,6 +2755,9 @@ apply_sgprs(opt_ctx& ctx, aco_ptr& instr) if (new_sgpr && num_sgprs >= max_sgprs) continue; + if (sgpr_idx == 0) + instr->format = withoutDPP(instr->format); + if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() || info.is_extract()) { /* can_apply_extract() checks SGPR encoding restrictions */ @@ -2884,7 +2766,7 @@ apply_sgprs(opt_ctx& ctx, aco_ptr& instr) else if (info.is_extract()) continue; instr->operands[sgpr_idx] = Operand(sgpr); - } else if (can_swap_operands(instr)) { + } else if (can_swap_operands(instr, &instr->opcode)) { instr->operands[sgpr_idx] = instr->operands[0]; instr->operands[0] = Operand(sgpr); /* swap bits using a 4-entry LUT */ @@ -2998,20 +2880,21 @@ apply_insert(opt_ctx& ctx, aco_ptr& instr) /* MADs/FMAs are created later, so we don't have 
to update the original add */
    assert(!ctx.info[instr->definitions[0].tempId()].is_mad());

-   unsigned sel = parse_insert(def_info.instr);
+   SubdwordSel sel = parse_insert(def_info.instr);
+   assert(sel);

-   if (instr->isVOP3() && (sel & sdwa_isword) && !(sel & sdwa_sext) &&
-       can_use_opsel(ctx.program->chip_class, instr->opcode, 3, (sel & sdwa_wordnum))) {
+   if (instr->isVOP3() && sel.size() == 2 && !sel.sign_extend() &&
+       can_use_opsel(ctx.program->chip_class, instr->opcode, 3, sel.offset())) {
       if (instr->vop3().opsel & (1 << 3))
          return false;
-      if (sel & sdwa_wordnum)
+      if (sel.offset())
          instr->vop3().opsel |= 1 << 3;
    } else {
       if (!can_use_SDWA(ctx.program->chip_class, instr, true))
          return false;

       to_SDWA(ctx, instr);
-      if ((static_cast<SDWA_instruction*>(instr.get())->dst_sel & sdwa_asuint) != sdwa_udword)
+      if (instr->sdwa().dst_sel.size() != 4)
          return false;
       static_cast<SDWA_instruction*>(instr.get())->dst_sel = sel;
    }
@@ -3023,6 +2906,57 @@
    return true;
 }

+/* Remove superfluous extract after ds_read like so:
+ * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
+ */
+bool
+apply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
+{
+   /* Check if p_extract has a usedef operand and is the only user. */
+   if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
+       ctx.uses[extract->operands[0].tempId()] > 1)
+      return false;
+
+   /* Check if the usedef is a DS instruction. */
+   Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
+   if (ds->format != Format::DS)
+      return false;
+
+   unsigned extract_idx = extract->operands[1].constantValue();
+   unsigned bits_extracted = extract->operands[2].constantValue();
+   unsigned sign_ext = extract->operands[3].constantValue();
+   unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
+
+   /* TODO: These are doable, but probably don't occur too often. */
+   if (extract_idx || sign_ext || dst_bitsize != 32)
+      return false;
+
+   unsigned bits_loaded = 0;
+   if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
+      bits_loaded = 8;
+   else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
+      bits_loaded = 16;
+   else
+      return false;
+
+   /* Shrink the DS load if the extracted bit size is smaller. */
+   bits_loaded = MIN2(bits_loaded, bits_extracted);
+
+   /* Change the DS opcode so it writes the full register. */
+   if (bits_loaded == 8)
+      ds->opcode = aco_opcode::ds_read_u8;
+   else if (bits_loaded == 16)
+      ds->opcode = aco_opcode::ds_read_u16;
+   else
+      unreachable("Forgot to add DS opcode above.");
+
+   /* The DS now produces the exact same thing as the extract, remove the extract.
+    */
+   std::swap(ds->definitions[0], extract->definitions[0]);
+   ctx.uses[extract->definitions[0].tempId()] = 0;
+   ctx.info[ds->definitions[0].tempId()].label = 0;
+   return true;
+}
+
 /* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
 bool
 combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
@@ -3067,14 +3001,27 @@ combine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 }

 /* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
- * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c) */
+ * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c)
+ * v_sub(c, s_lshl(a, b)) -> v_mad_i32_i24(a, -(1<<b), c)
+ * v_sub(c, v_lshlrev(a, b)) -> v_mad_i32_i24(b, -(1<<a), c)
+ */
 bool
-combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
+combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
 {
    if (instr->usesModifiers())
       return false;

-   for (unsigned i = 0; i < 2; i++) {
+   /* Subtractions: start at operand 1 to avoid mixup such as
+    * turning v_sub(v_lshlrev(a, b), c) into v_mad_i32_i24(b, -(1<<a), c)
+    */
+   bool allow_24bit = !is_sub;
+   for (unsigned i = is_sub ? 1 : 0; i < 2; i++) {
       Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
       if (!op_instr)
          continue;
@@ -3083,25 +3030,32 @@ combine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
           op_instr->opcode != aco_opcode::v_lshlrev_b32)
          continue;

-      if (op_instr->opcode == aco_opcode::v_lshlrev_b32 && op_instr->operands[1].isTemp() &&
-          op_instr->operands[1].getTemp().type() == RegType::sgpr && instr->operands[!i].isTemp() &&
-          instr->operands[!i].getTemp().type() == RegType::sgpr)
-         return false;
-
       int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
+
       if (op_instr->operands[shift_op_idx].isConstant() &&
-          op_instr->operands[shift_op_idx].constantValue() <= 6 && /* no literals */
-          (op_instr->operands[!shift_op_idx].is24bit() ||
+          ((allow_24bit && op_instr->operands[!shift_op_idx].is24bit()) ||
           op_instr->operands[!shift_op_idx].is16bit())) {
-         uint32_t multiplier = 1 << op_instr->operands[shift_op_idx].constantValue();
+         uint32_t multiplier = 1 << (op_instr->operands[shift_op_idx].constantValue() % 32u);
+         if (is_sub)
+            multiplier = -multiplier;
+         if (is_sub ? (multiplier < 0xff800000) : (multiplier > 0xffffff))
+            continue;
+
+         Operand ops[3] = {
+            op_instr->operands[!shift_op_idx],
+            Operand::c32(multiplier),
+            instr->operands[!i],
+         };
+         if (!check_vop3_operands(ctx, 3, ops))
+            return false;

         ctx.uses[instr->operands[i].tempId()]--;
+         aco_opcode mad_op = is_sub ?
aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24; aco_ptr new_instr{ - create_instruction(aco_opcode::v_mad_u32_u24, Format::VOP3, 3, 1)}; - new_instr->operands[0] = op_instr->operands[!shift_op_idx]; - new_instr->operands[1] = Operand::c32(multiplier); - new_instr->operands[2] = instr->operands[!i]; + create_instruction(mad_op, Format::VOP3, 3, 1)}; + for (unsigned op_idx = 0; op_idx < 3; ++op_idx) + new_instr->operands[op_idx] = ops[op_idx]; new_instr->definitions[0] = instr->definitions[0]; instr = std::move(new_instr); ctx.info[instr->definitions[0].tempId()].label = 0; @@ -3194,8 +3148,9 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) } } - if (instr->opcode == aco_opcode::v_pk_add_f16) { - if (instr->definitions[0].isPrecise()) + if (instr->opcode == aco_opcode::v_pk_add_f16 || instr->opcode == aco_opcode::v_pk_add_u16) { + bool fadd = instr->opcode == aco_opcode::v_pk_add_f16; + if (fadd && instr->definitions[0].isPrecise()) return; Instruction* mul_instr = nullptr; @@ -3208,9 +3163,14 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p()) continue; ssa_info& info = ctx.info[instr->operands[i].tempId()]; - if (info.instr->opcode != aco_opcode::v_pk_mul_f16 || - info.instr->definitions[0].isPrecise()) - continue; + if (fadd) { + if (info.instr->opcode != aco_opcode::v_pk_mul_f16 || + info.instr->definitions[0].isPrecise()) + continue; + } else { + if (info.instr->opcode != aco_opcode::v_pk_mul_lo_u16) + continue; + } Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]}; if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op)) @@ -3242,8 +3202,9 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) /* turn packed mul+add into v_pk_fma_f16 */ assert(mul_instr->isVOP3P()); + aco_opcode mad = fadd ? 
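/* (For the integer variant chosen here: v_pk_mad_u16 computes, per 16-bit
 * half, dst.half = (a.half * b.half + c.half) & 0xffff, which is exactly
 * v_pk_add_u16(v_pk_mul_lo_u16(a, b), c). Wrap-around integer arithmetic has
 * no rounding or denormal behaviour, hence no isPrecise() check on this
 * path.)
 */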
aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16; aco_ptr fma{ - create_instruction(aco_opcode::v_pk_fma_f16, Format::VOP3P, 3, 1)}; + create_instruction(mad, Format::VOP3P, 3, 1)}; VOP3P_instruction* mul = &mul_instr->vop3p(); for (unsigned i = 0; i < 2; i++) { fma->operands[i] = op[i]; @@ -3262,7 +3223,7 @@ combine_vop3p(opt_ctx& ctx, aco_ptr& instr) fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx]; fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx]; fma->definitions[0] = instr->definitions[0]; - instr.reset(fma.release()); + instr = std::move(fma); ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get()); return; } @@ -3285,6 +3246,14 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) if (!op.isTemp()) continue; ssa_info& info = ctx.info[op.tempId()]; + if (!info.is_extract()) + continue; + /* if there are that many uses, there are likely better combinations */ + // TODO: delay applying extract to a point where we know better + if (ctx.uses[op.tempId()] > 4) { + info.label &= ~label_extract; + continue; + } if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr || instr->operands[i].getTemp().type() == RegType::sgpr) && @@ -3309,9 +3278,12 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) instr->definitions[0].setHint(vcc); } - if (instr->isSDWA()) + if (instr->isSDWA() || instr->isDPP()) return; + if (instr->opcode == aco_opcode::p_extract) + apply_ds_extract(ctx, instr); + /* TODO: There are still some peephole optimizations that could be done: * - abs(a - b) -> s_absdiff_i32 * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32 @@ -3336,7 +3308,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) return; if (mul_instr->isVOP3() && mul_instr->vop3().clamp) return; - if (mul_instr->isSDWA()) + if (mul_instr->isSDWA() || mul_instr->isDPP()) return; /* convert to mul(neg(a), b) */ @@ -3370,9 +3342,11 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) instr->opcode == aco_opcode::v_subrev_f32; bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 || instr->opcode == aco_opcode::v_subrev_f16; - if (mad16 || mad32) { - bool need_fma = mad32 ? (ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) - : (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10); + bool mad64 = instr->opcode == aco_opcode::v_add_f64; + if (mad16 || mad32 || mad64) { + bool need_fma = + mad32 ? (ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3) + : (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10 || mad64); if (need_fma && instr->definitions[0].isPrecise()) return; if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32) @@ -3395,7 +3369,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) continue; Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]}; - if (info.instr->isSDWA() || !check_vop3_operands(ctx, 3, op) || + if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) || ctx.uses[instr->operands[i].tempId()] >= uses) continue; @@ -3457,6 +3431,8 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) : aco_opcode::v_fma_f16) : (ctx.program->chip_class == GFX8 ? 
aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16); + if (mad64) + mad_op = aco_opcode::v_fma_f64; aco_ptr mad{ create_instruction(mad_op, Format::VOP3, 3, 1)}; @@ -3472,12 +3448,13 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) /* mark this ssa_def to be re-checked for profitability and literals */ ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId()); ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1); - instr.reset(mad.release()); + instr = std::move(mad); return; } } /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */ - else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) { + else if (instr->opcode == aco_opcode::v_mul_f32 && !ctx.fp_mode.preserve_signed_zero_inf_nan32 && + !instr->usesModifiers() && !ctx.fp_mode.must_flush_denorms32) { for (unsigned i = 0; i < 2; i++) { if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() && ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() && @@ -3491,7 +3468,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) new_instr->operands[1] = instr->operands[!i]; new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp); new_instr->definitions[0] = instr->definitions[0]; - instr.reset(new_instr.release()); + instr = std::move(new_instr); ctx.info[instr->definitions[0].tempId()].label = 0; return; } @@ -3509,6 +3486,14 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2)) { } + } else if (instr->opcode == aco_opcode::v_add_u16) { + combine_three_valu_op( + ctx, instr, aco_opcode::v_mul_lo_u16, + ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_u16 : aco_opcode::v_mad_u16, + "120", 1 | 2); + } else if (instr->opcode == aco_opcode::v_add_u16_e64) { + combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16_e64, aco_opcode::v_mad_u16, "120", + 1 | 2); } else if (instr->opcode == aco_opcode::v_add_u32) { if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) { } else if (combine_add_bcnt(ctx, instr)) { @@ -3525,8 +3510,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) "012", 1 | 2)) { } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) { - } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16, - aco_opcode::v_mad_u32_u16, "120", 1 | 2)) { } else if (combine_add_or_then_and_lshl(ctx, instr)) { } } @@ -3537,11 +3520,15 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) } else if (!carry_out && combine_add_bcnt(ctx, instr)) { } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24, aco_opcode::v_mad_u32_u24, "120", 1 | 2)) { - } else if (!carry_out && combine_add_lshl(ctx, instr)) { + } else if (!carry_out && combine_add_lshl(ctx, instr, false)) { } } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 || instr->opcode == aco_opcode::v_sub_co_u32_e64) { - combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2); + bool carry_out = + instr->opcode != aco_opcode::v_sub_u32 && ctx.uses[instr->definitions[1].tempId()] > 0; + if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2)) { + } else if (!carry_out && combine_add_lshl(ctx, instr, true)) { + } } else if (instr->opcode == aco_opcode::v_subrev_u32 || instr->opcode == aco_opcode::v_subrev_co_u32 || instr->opcode == 
aco_opcode::v_subrev_co_u32_e64) { @@ -3584,6 +3571,14 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) bool to_uniform_bool_instr(opt_ctx& ctx, aco_ptr& instr) { + /* Check every operand to make sure they are suitable. */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) + return false; + if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise()) + return false; + } + switch (instr->opcode) { case aco_opcode::s_and_b32: case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break; @@ -3626,52 +3621,6 @@ to_uniform_bool_instr(opt_ctx& ctx, aco_ptr& instr) return true; } -void -select_mul_u32_u24(opt_ctx& ctx, aco_ptr& instr) -{ - if (instr->usesModifiers()) - return; - - /* Only valid if the accumulator is zero (this is selected by isel to - * combine more v_add_u32+v_mad_u32_u16 together), but the optimizer - * fallbacks here when not possible. - */ - if (!instr->operands[2].constantEquals(0)) - return; - - /* Only valid if the upper 16-bits of both operands are zero (because - * v_mul_u32_u24 doesn't mask them). - */ - for (unsigned i = 0; i < 2; i++) { - if (instr->operands[i].isTemp() && !instr->operands[i].is16bit()) - return; - } - - bool swap = false; - - /* VOP2 instructions can only take constants/sgprs in operand 0. */ - if ((instr->operands[1].isConstant() || - (instr->operands[1].hasRegClass() && - instr->operands[1].regClass().type() == RegType::sgpr))) { - swap = true; - if ((instr->operands[0].isConstant() || - (instr->operands[0].hasRegClass() && - instr->operands[0].regClass().type() == RegType::sgpr))) { - /* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because - * v_mul_u32_u24 has no advantages. - */ - return; - } - } - - VOP2_instruction* new_instr = - create_instruction(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1); - new_instr->operands[0] = instr->operands[swap]; - new_instr->operands[1] = instr->operands[!swap]; - new_instr->definitions[0] = instr->definitions[0]; - instr.reset(new_instr); -} - void select_instruction(opt_ctx& ctx, aco_ptr& instr) { @@ -3722,7 +3671,7 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)}; extract->operands[0] = op; extract->definitions[0] = instr->definitions[idx]; - instr.reset(extract.release()); + instr = std::move(extract); done = true; } @@ -3737,7 +3686,7 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) extract->operands[1] = Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes()); extract->definitions[0] = instr->definitions[idx]; - instr.reset(extract.release()); + instr = std::move(extract); } } @@ -3755,7 +3704,7 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) mad_info = NULL; } /* check literals */ - else if (!instr->usesModifiers()) { + else if (!instr->usesModifiers() && instr->opcode != aco_opcode::v_fma_f64) { /* FMA can only take literals on GFX10+ */ if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) && ctx.program->chip_class < GFX10) @@ -3852,10 +3801,38 @@ select_instruction(opt_ctx& ctx, aco_ptr& instr) return; } - if (instr->opcode == aco_opcode::v_mad_u32_u16) - select_mul_u32_u24(ctx, instr); + /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. 
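 */

/* The transform modeled on a toy wavefront (assumed DPP semantics: a lane
 * permutation applied to the first source operand of a VALU instruction).
 * Folding the copy into the second operand of a commutative instruction
 * requires swapping operands first, which is why can_swap_operands() is
 * checked below.
 */
#include <array>
#include <cassert>
#include <cstdint>

using Wave = std::array<uint32_t, 4>; /* toy 4-lane wavefront */

static Wave dpp_perm(const Wave& src) /* e.g. row_shr:1 with bound_ctrl */
{
   return {0u, src[0], src[1], src[2]};
}

static Wave v_add(const Wave& a, const Wave& b)
{
   Wave d{};
   for (size_t i = 0; i < d.size(); i++)
      d[i] = a[i] + b[i];
   return d;
}

int main()
{
   const Wave a = {1, 2, 3, 4}, b = {10, 20, 30, 40};
   Wave with_copy = v_add(b, dpp_perm(a)); /* v_mov_b32_dpp a; v_add b, tmp */
   Wave combined = v_add(dpp_perm(a), b);  /* v_add_dpp a, b (operands swapped) */
   assert(with_copy == combined);          /* valid because v_add commutes */
}

/*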
*/ + if (instr->isVALU()) { + for (unsigned i = 0; i < instr->operands.size(); i++) { + if (!instr->operands[i].isTemp()) + continue; + ssa_info info = ctx.info[instr->operands[i].tempId()]; - if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) || + aco_opcode swapped_op; + if (info.is_dpp() && info.instr->pass_flags == instr->pass_flags && + (i == 0 || can_swap_operands(instr, &swapped_op)) && can_use_DPP(instr, true) && + !instr->isDPP()) { + convert_to_DPP(instr); + DPP_instruction* dpp = static_cast(instr.get()); + if (i) { + instr->opcode = swapped_op; + std::swap(instr->operands[0], instr->operands[1]); + std::swap(dpp->neg[0], dpp->neg[1]); + std::swap(dpp->abs[0], dpp->abs[1]); + } + if (--ctx.uses[info.instr->definitions[0].tempId()]) + ctx.uses[info.instr->operands[0].tempId()]++; + instr->operands[0].setTemp(info.instr->operands[0].getTemp()); + dpp->dpp_ctrl = info.instr->dpp().dpp_ctrl; + dpp->bound_ctrl = info.instr->dpp().bound_ctrl; + dpp->neg[0] ^= info.instr->dpp().neg[0] && !dpp->abs[0]; + dpp->abs[0] |= info.instr->dpp().abs[0]; + break; + } + } + } + + if (instr->isSDWA() || (instr->isVOP3() && ctx.program->chip_class < GFX10) || (instr->isVOP3P() && ctx.program->chip_class < GFX10)) return; /* some encodings can't ever take literals */ @@ -3979,6 +3956,7 @@ apply_literals(opt_ctx& ctx, aco_ptr& instr) unsigned bits = get_operand_size(instr, i); if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) { Operand literal = Operand::c32(ctx.info[op.tempId()].val); + instr->format = withoutDPP(instr->format); if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P) to_VOP3(ctx, instr); instr->operands[i] = literal; diff --git a/mesa 3D driver/src/amd/compiler/aco_optimizer_postRA.cpp b/mesa 3D driver/src/amd/compiler/aco_optimizer_postRA.cpp index d086eff7ce..0d9eee8287 100644 --- a/mesa 3D driver/src/amd/compiler/aco_optimizer_postRA.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_optimizer_postRA.cpp @@ -22,6 +22,7 @@ * */ +#include "aco_builder.h" #include "aco_ir.h" #include @@ -34,26 +35,65 @@ namespace { constexpr const size_t max_reg_cnt = 512; -enum { - not_written_in_block = -1, - clobbered = -2, - const_or_undef = -3, - written_by_multiple_instrs = -4, +struct Idx { + bool operator==(const Idx& other) const { return block == other.block && instr == other.instr; } + bool operator!=(const Idx& other) const { return !operator==(other); } + + bool found() const { return block != UINT32_MAX; } + + uint32_t block; + uint32_t instr; }; +Idx not_written_in_block{UINT32_MAX, 0}; +Idx clobbered{UINT32_MAX, 1}; +Idx const_or_undef{UINT32_MAX, 2}; +Idx written_by_multiple_instrs{UINT32_MAX, 3}; + +bool +is_instr_after(Idx second, Idx first) +{ + if (first == not_written_in_block && second != not_written_in_block) + return true; + + if (!first.found() || !second.found()) + return false; + + return second.block > first.block || (second.block == first.block && second.instr > first.instr); +} + struct pr_opt_ctx { Program* program; Block* current_block; - int current_instr_idx; + uint32_t current_instr_idx; std::vector uses; - std::array instr_idx_by_regs; + std::vector> instr_idx_by_regs; void reset_block(Block* block) { current_block = block; - current_instr_idx = -1; - std::fill(instr_idx_by_regs.begin(), instr_idx_by_regs.end(), not_written_in_block); + current_instr_idx = 0; + + if ((block->kind & block_kind_loop_header) || block->linear_preds.empty()) { + 
std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(), + not_written_in_block); + } else { + unsigned first_pred = block->linear_preds[0]; + for (unsigned i = 0; i < max_reg_cnt; i++) { + bool all_same = std::all_of( + std::next(block->linear_preds.begin()), block->linear_preds.end(), + [&](unsigned pred) + { return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_pred][i]; }); + + if (all_same) + instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_pred][i]; + else + instr_idx_by_regs[block->index][i] = not_written_in_block; + } + } } + + Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); } }; void @@ -65,36 +105,42 @@ save_reg_writes(pr_opt_ctx& ctx, aco_ptr& instr) unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u); unsigned r = def.physReg().reg(); - int idx = ctx.current_instr_idx; + Idx idx{ctx.current_block->index, ctx.current_instr_idx}; if (def.regClass().is_subdword()) idx = clobbered; + assert((r + dw_size) <= max_reg_cnt); assert(def.size() == dw_size || def.regClass().is_subdword()); - std::fill(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], idx); + std::fill(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r, + ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, idx); } } -int +Idx last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc) { /* Verify that all of the operand's registers are written by the same instruction. */ - int instr_idx = ctx.instr_idx_by_regs[physReg.reg()]; + assert(physReg.reg() < max_reg_cnt); + Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()]; unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u); unsigned r = physReg.reg(); - bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], - [instr_idx](int i) { return i == instr_idx; }); + bool all_same = + std::all_of(ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r, + ctx.instr_idx_by_regs[ctx.current_block->index].begin() + r + dw_size, + [instr_idx](Idx i) { return i == instr_idx; }); return all_same ? 
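/* (The predecessor scan in reset_block() above is a simple forward dataflow
 * meet: a register keeps its last-writer Idx only if every linear predecessor
 * agrees on it, e.g. both preds recording {block 1, instr 5} for a register
 * keeps that Idx, while any disagreement degrades to not_written_in_block.
 * Loop headers start from scratch because their back-edge predecessors have
 * not been visited yet.)
 */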
instr_idx : written_by_multiple_instrs; } -int +Idx last_writer_idx(pr_opt_ctx& ctx, const Operand& op) { if (op.isConstant() || op.isUndefined()) return const_or_undef; - int instr_idx = ctx.instr_idx_by_regs[op.physReg().reg()]; + assert(op.physReg().reg() < max_reg_cnt); + Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][op.physReg().reg()]; #ifndef NDEBUG /* Debug mode: */ @@ -129,21 +175,22 @@ try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr& instr) instr->operands[0].physReg() != scc) return; - int op0_instr_idx = last_writer_idx(ctx, instr->operands[0]); - int last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask); - int last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask); + Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]); + Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask); + Idx last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask); /* We need to make sure: * - the operand register used by the branch, and VCC were both written in the current block * - VCC was NOT written after the operand register * - EXEC is sane and was NOT written after the operand register */ - if (op0_instr_idx < 0 || last_vcc_wr_idx < 0 || last_vcc_wr_idx > op0_instr_idx || - last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block) + if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() || + !is_instr_after(last_vcc_wr_idx, last_exec_wr_idx) || + !is_instr_after(op0_instr_idx, last_vcc_wr_idx)) return; - aco_ptr& op0_instr = ctx.current_block->instructions[op0_instr_idx]; - aco_ptr& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx]; + Instruction* op0_instr = ctx.get(op0_instr_idx); + Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx); if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ && op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) || @@ -192,12 +239,12 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) return; /* Make sure both SCC and Operand 0 are written by the same instruction. */ - int wr_idx = last_writer_idx(ctx, instr->operands[0]); - int sccwr_idx = last_writer_idx(ctx, scc, s1); - if (wr_idx < 0 || wr_idx != sccwr_idx) + Idx wr_idx = last_writer_idx(ctx, instr->operands[0]); + Idx sccwr_idx = last_writer_idx(ctx, scc, s1); + if (!wr_idx.found() || wr_idx != sccwr_idx) return; - aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; + Instruction* wr_instr = ctx.get(wr_idx); if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || wr_instr->definitions[1].physReg() != scc) return; @@ -259,11 +306,11 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) scc_op_idx = 2; } - int wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]); - if (wr_idx < 0) + Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]); + if (!wr_idx.found()) return; - aco_ptr& wr_instr = ctx.current_block->instructions[wr_idx]; + Instruction* wr_instr = ctx.get(wr_idx); /* Check if we found the pattern above. 
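 */

/* The identity behind this rewrite, checked host-side (assumed SALU
 * semantics: these instructions write SCC = "result is non-zero", so a
 * following s_cmp_eq_u32 with 0 merely inverts SCC and s_cmp_lg_u32 with 0
 * reproduces it).
 */
#include <cassert>
#include <cstdint>

int main()
{
   const uint64_t results[] = {0u, 1u, 0xdeadbeefu};
   for (uint64_t r : results) {
      bool scc = r != 0;        /* written by s_and_b64 and friends */
      assert((r == 0) == !scc); /* s_cmp_eq_u32 r, 0 */
      assert((r != 0) == scc);  /* s_cmp_lg_u32 r, 0 */
   }
}

/*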
*/ if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && @@ -296,17 +343,71 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr& instr) } } +void +try_combine_dpp(pr_opt_ctx& ctx, aco_ptr& instr) +{ + if (!instr->isVALU() || instr->isDPP() || !can_use_DPP(instr, false)) + return; + + for (unsigned i = 0; i < MIN2(2, instr->operands.size()); i++) { + Idx op_instr_idx = last_writer_idx(ctx, instr->operands[i]); + if (!op_instr_idx.found()) + continue; + + Instruction* mov = ctx.get(op_instr_idx); + if (mov->opcode != aco_opcode::v_mov_b32 || !mov->isDPP()) + continue; + + /* If we aren't going to remove the v_mov_b32, we have to ensure that it doesn't overwrite + * it's own operand before we use it. + */ + if (mov->definitions[0].physReg() == mov->operands[0].physReg() && + (!mov->definitions[0].tempId() || ctx.uses[mov->definitions[0].tempId()] > 1)) + continue; + + Idx mov_src_idx = last_writer_idx(ctx, mov->operands[0]); + if (is_instr_after(mov_src_idx, op_instr_idx)) + continue; + + if (i && !can_swap_operands(instr, &instr->opcode)) + continue; + + /* anything else doesn't make sense in SSA */ + assert(mov->dpp().row_mask == 0xf && mov->dpp().bank_mask == 0xf); + + if (--ctx.uses[mov->definitions[0].tempId()]) + ctx.uses[mov->operands[0].tempId()]++; + + convert_to_DPP(instr); + + DPP_instruction* dpp = &instr->dpp(); + if (i) { + std::swap(dpp->operands[0], dpp->operands[1]); + std::swap(dpp->neg[0], dpp->neg[1]); + std::swap(dpp->abs[0], dpp->abs[1]); + } + dpp->operands[0] = mov->operands[0]; + dpp->dpp_ctrl = mov->dpp().dpp_ctrl; + dpp->bound_ctrl = true; + dpp->neg[0] ^= mov->dpp().neg[0] && !dpp->abs[0]; + dpp->abs[0] |= mov->dpp().abs[0]; + return; + } +} + void process_instruction(pr_opt_ctx& ctx, aco_ptr& instr) { - ctx.current_instr_idx++; - try_apply_branch_vcc(ctx, instr); try_optimize_scc_nocompare(ctx, instr); + try_combine_dpp(ctx, instr); + if (instr) save_reg_writes(ctx, instr); + + ctx.current_instr_idx++; } } // namespace @@ -317,6 +418,7 @@ optimize_postRA(Program* program) pr_opt_ctx ctx; ctx.program = program; ctx.uses = dead_code_analysis(program); + ctx.instr_idx_by_regs.resize(program->blocks.size()); /* Forward pass * Goes through each instruction exactly once, and can transform diff --git a/mesa 3D driver/src/amd/compiler/aco_print_asm.cpp b/mesa 3D driver/src/amd/compiler/aco_print_asm.cpp index dcc7c4bc74..9f15de5aac 100644 --- a/mesa 3D driver/src/amd/compiler/aco_print_asm.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_print_asm.cpp @@ -24,11 +24,13 @@ #include "aco_ir.h" +#ifdef LLVM_AVAILABLE #include "llvm/ac_llvm_util.h" #include "llvm-c/Disassembler.h" #include #include +#endif #include #include @@ -37,21 +39,75 @@ namespace aco { namespace { -/* LLVM disassembler only supports GFX8+, try to disassemble with CLRXdisasm - * for GFX6-GFX7 if found on the system, this is better than nothing. 
+/** + * Determines the GPU type to use for CLRXdisasm */ +const char* +to_clrx_device_name(chip_class cc, radeon_family family) +{ + switch (cc) { + case GFX6: + switch (family) { + case CHIP_TAHITI: return "tahiti"; + case CHIP_PITCAIRN: return "pitcairn"; + case CHIP_VERDE: return "capeverde"; + case CHIP_OLAND: return "oland"; + case CHIP_HAINAN: return "hainan"; + default: return nullptr; + } + case GFX7: + switch (family) { + case CHIP_BONAIRE: return "bonaire"; + case CHIP_KAVERI: return "gfx700"; + case CHIP_HAWAII: return "hawaii"; + default: return nullptr; + } + case GFX8: + switch (family) { + case CHIP_TONGA: return "tonga"; + case CHIP_ICELAND: return "iceland"; + case CHIP_CARRIZO: return "carrizo"; + case CHIP_FIJI: return "fiji"; + case CHIP_STONEY: return "stoney"; + case CHIP_POLARIS10: return "polaris10"; + case CHIP_POLARIS11: return "polaris11"; + case CHIP_POLARIS12: return "polaris12"; + case CHIP_VEGAM: return "polaris11"; + default: return nullptr; + } + case GFX9: + switch (family) { + case CHIP_VEGA10: return "vega10"; + case CHIP_VEGA12: return "vega12"; + case CHIP_VEGA20: return "vega20"; + case CHIP_RAVEN: return "raven"; + default: return nullptr; + } + case GFX10: + switch (family) { + case CHIP_NAVI10: return "gfx1010"; + case CHIP_NAVI12: return "gfx1011"; + default: return nullptr; + } + case GFX10_3: + return nullptr; + default: unreachable("Invalid chip class!"); return nullptr; + } +} + bool -print_asm_gfx6_gfx7(Program* program, std::vector& binary, FILE* output) +print_asm_clrx(Program* program, std::vector& binary, FILE* output) { #ifdef _WIN32 return true; #else char path[] = "/tmp/fileXXXXXX"; char line[2048], command[128]; - const char* gpu_type; FILE* p; int fd; + const char* gpu_type = to_clrx_device_name(program->chip_class, program->family); + /* Dump the binary into a temporary file. */ fd = mkstemp(path); if (fd < 0) @@ -62,24 +118,6 @@ print_asm_gfx6_gfx7(Program* program, std::vector& binary, FILE* outpu goto fail; } - /* Determine the GPU type for CLRXdisasm. Use the family for GFX6 chips - * because it doesn't allow to use gfx600 directly. - */ - switch (program->chip_class) { - case GFX6: - switch (program->family) { - case CHIP_TAHITI: gpu_type = "tahiti"; break; - case CHIP_PITCAIRN: gpu_type = "pitcairn"; break; - case CHIP_VERDE: gpu_type = "capeverde"; break; - case CHIP_OLAND: gpu_type = "oland"; break; - case CHIP_HAINAN: gpu_type = "hainan"; break; - default: unreachable("Invalid GFX6 family!"); - } - break; - case GFX7: gpu_type = "gfx700"; break; - default: unreachable("Invalid chip class!"); - } - sprintf(command, "clrxdisasm --gpuType=%s -r %s", gpu_type, path); p = popen(command, "r"); @@ -106,6 +144,7 @@ fail: #endif } +#ifdef LLVM_AVAILABLE std::pair disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, uint32_t* binary, unsigned exec_size, size_t pos, char* outline, unsigned outline_size) @@ -152,17 +191,10 @@ disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, uint32_t* binary, uns return std::make_pair(invalid, size); } -} /* end namespace */ bool -print_asm(Program* program, std::vector& binary, unsigned exec_size, FILE* output) +print_asm_llvm(Program* program, std::vector& binary, unsigned exec_size, FILE* output) { - if (program->chip_class <= GFX7) { - /* Do not abort if clrxdisasm isn't found. 
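 */

/* Shape of the CLRXdisasm invocation used on this path, as a minimal
 * standalone sketch (assumes a POSIX system with clrxdisasm in PATH and a
 * device name obtained from to_clrx_device_name()):
 */
#include <cstdio>
#include <string>

static bool disasm_with_clrx(const char* gpu_type, const char* binary_path, FILE* output)
{
   std::string cmd = std::string("clrxdisasm --gpuType=") + gpu_type + " -r " + binary_path;
   FILE* p = popen(cmd.c_str(), "r"); /* same pipe pattern as print_asm_clrx() */
   if (!p)
      return true; /* true means failure, matching print_asm() */
   char line[2048];
   while (fgets(line, sizeof(line), p))
      fputs(line, output);
   return pclose(p) != 0;
}

/*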
*/ - print_asm_gfx6_gfx7(program, binary, output); - return false; - } - std::vector referenced_blocks(program->blocks.size()); referenced_blocks[0] = true; for (Block& block : program->blocks) { @@ -255,5 +287,40 @@ print_asm(Program* program, std::vector& binary, unsigned exec_size, F return invalid; } +#endif /* LLVM_AVAILABLE */ + +} /* end namespace */ + +bool +check_print_asm_support(Program* program) +{ +#ifdef LLVM_AVAILABLE + if (program->chip_class >= GFX8) { + /* LLVM disassembler only supports GFX8+ */ + return true; + } +#endif + +#ifndef _WIN32 + /* Check if CLRX disassembler binary is available and can disassemble the program */ + return to_clrx_device_name(program->chip_class, program->family) && + system("clrxdisasm --version") == 0; +#else + return false; +#endif +} + +/* Returns true on failure */ +bool +print_asm(Program* program, std::vector& binary, unsigned exec_size, FILE* output) +{ +#ifdef LLVM_AVAILABLE + if (program->chip_class >= GFX8) { + return print_asm_llvm(program, binary, exec_size, output); + } +#endif + + return print_asm_clrx(program, binary, output); +} } // namespace aco diff --git a/mesa 3D driver/src/amd/compiler/aco_print_ir.cpp b/mesa 3D driver/src/amd/compiler/aco_print_ir.cpp index 339b938c3e..41938fe862 100644 --- a/mesa 3D driver/src/amd/compiler/aco_print_ir.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_print_ir.cpp @@ -89,30 +89,14 @@ const std::array reduce_ops = []() static void print_reg_class(const RegClass rc, FILE* output) { - switch (rc) { - case RegClass::s1: fprintf(output, " s1: "); return; - case RegClass::s2: fprintf(output, " s2: "); return; - case RegClass::s3: fprintf(output, " s3: "); return; - case RegClass::s4: fprintf(output, " s4: "); return; - case RegClass::s6: fprintf(output, " s6: "); return; - case RegClass::s8: fprintf(output, " s8: "); return; - case RegClass::s16: fprintf(output, "s16: "); return; - case RegClass::v1: fprintf(output, " v1: "); return; - case RegClass::v2: fprintf(output, " v2: "); return; - case RegClass::v3: fprintf(output, " v3: "); return; - case RegClass::v4: fprintf(output, " v4: "); return; - case RegClass::v5: fprintf(output, " v5: "); return; - case RegClass::v6: fprintf(output, " v6: "); return; - case RegClass::v7: fprintf(output, " v7: "); return; - case RegClass::v8: fprintf(output, " v8: "); return; - case RegClass::v1b: fprintf(output, " v1b: "); return; - case RegClass::v2b: fprintf(output, " v2b: "); return; - case RegClass::v3b: fprintf(output, " v3b: "); return; - case RegClass::v4b: fprintf(output, " v4b: "); return; - case RegClass::v6b: fprintf(output, " v6b: "); return; - case RegClass::v8b: fprintf(output, " v8b: "); return; - case RegClass::v1_linear: fprintf(output, " v1: "); return; - case RegClass::v2_linear: fprintf(output, " v2: "); return; + if (rc.is_subdword()) { + fprintf(output, " v%ub: ", rc.bytes()); + } else if (rc.type() == RegType::sgpr) { + fprintf(output, " s%u: ", rc.size()); + } else if (rc.is_linear()) { + fprintf(output, " lv%u: ", rc.size()); + } else { + fprintf(output, " v%u: ", rc.size()); } } @@ -627,23 +611,32 @@ print_instr_format_specific(const Instruction* instr, FILE* output) } if (sdwa.clamp) fprintf(output, " clamp"); - switch (sdwa.dst_sel & sdwa_asuint) { - case sdwa_udword: break; - case sdwa_ubyte0: - case sdwa_ubyte1: - case sdwa_ubyte2: - case sdwa_ubyte3: - fprintf(output, " dst_sel:%sbyte%u", sdwa.dst_sel & sdwa_sext ? 
"s" : "u", - sdwa.dst_sel & sdwa_bytenum); - break; - case sdwa_uword0: - case sdwa_uword1: - fprintf(output, " dst_sel:%sword%u", sdwa.dst_sel & sdwa_sext ? "s" : "u", - sdwa.dst_sel & sdwa_wordnum); - break; + if (!instr->isVOPC()) { + char sext = sdwa.dst_sel.sign_extend() ? 's' : 'u'; + unsigned offset = sdwa.dst_sel.offset(); + if (instr->definitions[0].isFixed()) + offset += instr->definitions[0].physReg().byte(); + switch (sdwa.dst_sel.size()) { + case 1: fprintf(output, " dst_sel:%cbyte%u", sext, offset); break; + case 2: fprintf(output, " dst_sel:%cword%u", sext, offset >> 1); break; + case 4: fprintf(output, " dst_sel:dword"); break; + default: break; + } + if (instr->definitions[0].bytes() < 4) + fprintf(output, " dst_preserve"); + } + for (unsigned i = 0; i < std::min(2, instr->operands.size()); i++) { + char sext = sdwa.sel[i].sign_extend() ? 's' : 'u'; + unsigned offset = sdwa.sel[i].offset(); + if (instr->operands[i].isFixed()) + offset += instr->operands[i].physReg().byte(); + switch (sdwa.sel[i].size()) { + case 1: fprintf(output, " src%d_sel:%cbyte%u", i, sext, offset); break; + case 2: fprintf(output, " src%d_sel:%cword%u", i, sext, offset >> 1); break; + case 4: fprintf(output, " src%d_sel:dword", i); break; + default: break; + } } - if (sdwa.dst_preserve) - fprintf(output, " dst_preserve"); } } @@ -663,12 +656,10 @@ aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) bool* const abs = (bool*)alloca(instr->operands.size() * sizeof(bool)); bool* const neg = (bool*)alloca(instr->operands.size() * sizeof(bool)); bool* const opsel = (bool*)alloca(instr->operands.size() * sizeof(bool)); - uint8_t* const sel = (uint8_t*)alloca(instr->operands.size() * sizeof(uint8_t)); for (unsigned i = 0; i < instr->operands.size(); ++i) { abs[i] = false; neg[i] = false; opsel[i] = false; - sel[i] = sdwa_udword; } if (instr->isVOP3()) { const VOP3_instruction& vop3 = instr->vop3(); @@ -676,7 +667,6 @@ aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) abs[i] = vop3.abs[i]; neg[i] = vop3.neg[i]; opsel[i] = vop3.opsel & (1 << i); - sel[i] = sdwa_udword; } } else if (instr->isDPP()) { const DPP_instruction& dpp = instr->dpp(); @@ -684,7 +674,6 @@ aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) abs[i] = dpp.abs[i]; neg[i] = dpp.neg[i]; opsel[i] = false; - sel[i] = sdwa_udword; } } else if (instr->isSDWA()) { const SDWA_instruction& sdwa = instr->sdwa(); @@ -692,7 +681,6 @@ aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) abs[i] = sdwa.abs[i]; neg[i] = sdwa.neg[i]; opsel[i] = false; - sel[i] = sdwa.sel[i]; } } for (unsigned i = 0; i < instr->operands.size(); ++i) { @@ -707,22 +695,9 @@ aco_print_instr(const Instruction* instr, FILE* output, unsigned flags) fprintf(output, "|"); if (opsel[i]) fprintf(output, "hi("); - else if (sel[i] & sdwa_sext) - fprintf(output, "sext("); aco_print_operand(&instr->operands[i], output, flags); - if (opsel[i] || (sel[i] & sdwa_sext)) + if (opsel[i]) fprintf(output, ")"); - if (!(sel[i] & sdwa_isra)) { - if (sel[i] == sdwa_udword || sel[i] == sdwa_sdword) { - /* print nothing */ - } else if (sel[i] & sdwa_isword) { - unsigned index = sel[i] & sdwa_wordnum; - fprintf(output, "[%u:%u]", index * 16, index * 16 + 15); - } else { - unsigned index = sel[i] & sdwa_bytenum; - fprintf(output, "[%u:%u]", index * 8, index * 8 + 7); - } - } if (abs[i]) fprintf(output, "|"); diff --git a/mesa 3D driver/src/amd/compiler/aco_register_allocation.cpp b/mesa 3D 
driver/src/amd/compiler/aco_register_allocation.cpp index 3e7e87e4ad..a037745aaa 100644 --- a/mesa 3D driver/src/amd/compiler/aco_register_allocation.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_register_allocation.cpp @@ -43,25 +43,31 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx RegClass rc); std::pair get_subdword_definition_info(Program* program, const aco_ptr& instr, RegClass rc); -void add_subdword_definition(Program* program, aco_ptr& instr, unsigned idx, - PhysReg reg); +void add_subdword_definition(Program* program, aco_ptr& instr, PhysReg reg); struct assignment { PhysReg reg; RegClass rc; - uint8_t assigned = 0; + bool assigned = false; + uint32_t affinity = 0; assignment() = default; assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {} + void set(const Definition& def) + { + assigned = true; + reg = def.physReg(); + rc = def.regClass(); + } }; struct ra_ctx { Program* program; + Block* block = NULL; std::vector assignments; std::vector> renames; std::vector loop_header; std::unordered_map orig_names; - std::unordered_map affinities; std::unordered_map vectors; std::unordered_map split_vectors; aco_ptr pseudo_dummy; @@ -153,7 +159,7 @@ struct PhysRegInterval { bool intersects(const PhysRegInterval& a, const PhysRegInterval& b) { - return ((a.lo() >= b.lo() && a.lo() < b.hi()) || (a.hi() > b.lo() && a.hi() <= b.hi())); + return a.hi() > b.lo() && b.hi() > a.lo(); } /* Gets the stride for full (non-subdword) registers */ @@ -463,25 +469,30 @@ unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, RegClass rc) { - /* v_readfirstlane_b32 cannot use SDWA */ - if (instr->opcode == aco_opcode::p_as_uniform) - return 4; - if (instr->isPseudo() && chip >= GFX8) - return rc.bytes() % 2 == 0 ? 2 : 1; + if (instr->isPseudo()) { + /* v_readfirstlane_b32 cannot use SDWA */ + if (instr->opcode == aco_opcode::p_as_uniform) + return 4; + else if (chip >= GFX8) + return rc.bytes() % 2 == 0 ? 2 : 1; + else + return 4; + } - if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { - return 1; - } else if (can_use_SDWA(chip, instr, false)) { - return rc.bytes() % 2 == 0 ? 2 : 1; - } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, 1)) { - return 2; - } else if (instr->isVOP3P()) { - return 2; + assert(rc.bytes() <= 2); + if (instr->isVALU()) { + if (can_use_SDWA(chip, instr, false)) + return rc.bytes(); + if (can_use_opsel(chip, instr->opcode, idx, true)) + return 2; + if (instr->format == Format::VOP3P) + return 2; } switch (instr->opcode) { + case aco_opcode::v_cvt_f32_ubyte0: return 1; case aco_opcode::ds_write_b8: - case aco_opcode::ds_write_b16: return chip >= GFX8 ? 2 : 4; + case aco_opcode::ds_write_b16: return chip >= GFX9 ? 2 : 4; case aco_opcode::buffer_store_byte: case aco_opcode::buffer_store_short: case aco_opcode::flat_store_byte: @@ -490,10 +501,8 @@ get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, case aco_opcode::scratch_store_short: case aco_opcode::global_store_byte: case aco_opcode::global_store_short: return chip >= GFX9 ? 
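/* (On the PhysRegInterval::intersects() rewrite above: for half-open
 * intervals the complete overlap test is a.hi > b.lo && b.hi > a.lo. The old
 * form only asked whether one of a's endpoints lies inside b, which misses b
 * strictly contained in a, e.g. a = [0, 10) and b = [2, 4).)
 */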
2 : 4; - default: break; + default: return 4; } - - return 4; } void @@ -505,58 +514,59 @@ add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx, uns return; assert(rc.bytes() <= 2); - - if (!instr->usesModifiers() && instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { - switch (byte) { - case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; - case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; - case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; - case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; + if (instr->isVALU()) { + /* check if we can use opsel */ + if (instr->format == Format::VOP3) { + assert(byte == 2); + instr->vop3().opsel |= 1 << idx; + return; } - return; - } else if (can_use_SDWA(chip, instr, false)) { - aco_ptr tmp = convert_to_SDWA(chip, instr); - return; - } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, byte / 2)) { - instr->vop3().opsel |= (byte / 2) << idx; - return; - } else if (instr->isVOP3P() && byte == 2) { - VOP3P_instruction& vop3p = instr->vop3p(); - assert(!(vop3p.opsel_lo & (1 << idx))); - vop3p.opsel_lo |= 1 << idx; - vop3p.opsel_hi |= 1 << idx; + if (instr->isVOP3P()) { + assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx))); + instr->vop3p().opsel_lo |= 1 << idx; + instr->vop3p().opsel_hi |= 1 << idx; + return; + } + if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { + switch (byte) { + case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; + case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; + case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; + case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; + } + return; + } + + /* use SDWA */ + assert(can_use_SDWA(chip, instr, false)); + convert_to_SDWA(chip, instr); return; } - if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b8 && byte == 2) { + assert(byte == 2); + if (instr->opcode == aco_opcode::ds_write_b8) instr->opcode = aco_opcode::ds_write_b8_d16_hi; - return; - } - if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b16 && byte == 2) { + else if (instr->opcode == aco_opcode::ds_write_b16) instr->opcode = aco_opcode::ds_write_b16_d16_hi; - return; - } - - if (chip >= GFX9 && byte == 2) { - if (instr->opcode == aco_opcode::buffer_store_byte) - instr->opcode = aco_opcode::buffer_store_byte_d16_hi; - else if (instr->opcode == aco_opcode::buffer_store_short) - instr->opcode = aco_opcode::buffer_store_short_d16_hi; - else if (instr->opcode == aco_opcode::flat_store_byte) - instr->opcode = aco_opcode::flat_store_byte_d16_hi; - else if (instr->opcode == aco_opcode::flat_store_short) - instr->opcode = aco_opcode::flat_store_short_d16_hi; - else if (instr->opcode == aco_opcode::scratch_store_byte) - instr->opcode = aco_opcode::scratch_store_byte_d16_hi; - else if (instr->opcode == aco_opcode::scratch_store_short) - instr->opcode = aco_opcode::scratch_store_short_d16_hi; - else if (instr->opcode == aco_opcode::global_store_byte) - instr->opcode = aco_opcode::global_store_byte_d16_hi; - else if (instr->opcode == aco_opcode::global_store_short) - instr->opcode = aco_opcode::global_store_short_d16_hi; - else - unreachable("Something went wrong: Impossible register assignment."); - } + else if (instr->opcode == aco_opcode::buffer_store_byte) + instr->opcode = aco_opcode::buffer_store_byte_d16_hi; + else if (instr->opcode == aco_opcode::buffer_store_short) + instr->opcode = aco_opcode::buffer_store_short_d16_hi; + else if (instr->opcode == aco_opcode::flat_store_byte) + instr->opcode = 
aco_opcode::flat_store_byte_d16_hi; + else if (instr->opcode == aco_opcode::flat_store_short) + instr->opcode = aco_opcode::flat_store_short_d16_hi; + else if (instr->opcode == aco_opcode::scratch_store_byte) + instr->opcode = aco_opcode::scratch_store_byte_d16_hi; + else if (instr->opcode == aco_opcode::scratch_store_short) + instr->opcode = aco_opcode::scratch_store_short_d16_hi; + else if (instr->opcode == aco_opcode::global_store_byte) + instr->opcode = aco_opcode::global_store_byte_d16_hi; + else if (instr->opcode == aco_opcode::global_store_short) + instr->opcode = aco_opcode::global_store_short_d16_hi; + else + unreachable("Something went wrong: Impossible register assignment."); + return; } /* minimum_stride, bytes_written */ @@ -565,99 +575,124 @@ get_subdword_definition_info(Program* program, const aco_ptr& instr { chip_class chip = program->chip_class; - if (instr->isPseudo() && chip >= GFX8) - return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes()); - else if (instr->isPseudo()) - return std::make_pair(4, rc.size() * 4u); - - unsigned bytes_written = chip >= GFX10 ? rc.bytes() : 4u; - switch (instr->opcode) { - case aco_opcode::v_mad_f16: - case aco_opcode::v_mad_u16: - case aco_opcode::v_mad_i16: - case aco_opcode::v_fma_f16: - case aco_opcode::v_div_fixup_f16: - case aco_opcode::v_interp_p2_f16: bytes_written = chip >= GFX9 ? rc.bytes() : 4u; break; - default: break; + if (instr->isPseudo()) { + if (chip >= GFX8) + return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes()); + else + return std::make_pair(4, rc.size() * 4u); } - bytes_written = bytes_written > 4 ? align(bytes_written, 4) : bytes_written; - bytes_written = MAX2(bytes_written, instr_info.definition_size[(int)instr->opcode] / 8u); - if (can_use_SDWA(chip, instr, false)) { - return std::make_pair(rc.bytes(), rc.bytes()); - } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) { - return std::make_pair(2u, bytes_written); + if (instr->isVALU() || instr->isVINTRP()) { + assert(rc.bytes() <= 2); + + if (can_use_SDWA(chip, instr, false)) + return std::make_pair(rc.bytes(), rc.bytes()); + + unsigned bytes_written = 4u; + if (instr_is_16bit(chip, instr->opcode)) + bytes_written = 2u; + + unsigned stride = 4u; + if (instr->opcode == aco_opcode::v_fma_mixlo_f16 || + can_use_opsel(chip, instr->opcode, -1, true)) + stride = 2u; + + return std::make_pair(stride, bytes_written); } switch (instr->opcode) { - case aco_opcode::buffer_load_ubyte_d16: - case aco_opcode::buffer_load_short_d16: - case aco_opcode::flat_load_ubyte_d16: - case aco_opcode::flat_load_short_d16: - case aco_opcode::scratch_load_ubyte_d16: - case aco_opcode::scratch_load_short_d16: - case aco_opcode::global_load_ubyte_d16: - case aco_opcode::global_load_short_d16: case aco_opcode::ds_read_u8_d16: + case aco_opcode::ds_read_i8_d16: case aco_opcode::ds_read_u16_d16: - if (chip >= GFX9 && !program->dev.sram_ecc_enabled) + case aco_opcode::flat_load_ubyte_d16: + case aco_opcode::flat_load_sbyte_d16: + case aco_opcode::flat_load_short_d16: + case aco_opcode::global_load_ubyte_d16: + case aco_opcode::global_load_sbyte_d16: + case aco_opcode::global_load_short_d16: + case aco_opcode::scratch_load_ubyte_d16: + case aco_opcode::scratch_load_sbyte_d16: + case aco_opcode::scratch_load_short_d16: + case aco_opcode::buffer_load_ubyte_d16: + case aco_opcode::buffer_load_sbyte_d16: + case aco_opcode::buffer_load_short_d16: { + assert(chip >= GFX9); + if (!program->dev.sram_ecc_enabled) return std::make_pair(2u, 2u); else return 
std::make_pair(2u, 4u); - case aco_opcode::v_fma_mixlo_f16: return std::make_pair(2u, 2u); - default: break; } - return std::make_pair(4u, bytes_written); + default: return std::make_pair(4, rc.size() * 4u); + } } void -add_subdword_definition(Program* program, aco_ptr& instr, unsigned idx, PhysReg reg) +add_subdword_definition(Program* program, aco_ptr& instr, PhysReg reg) { - RegClass rc = instr->definitions[idx].regClass(); - chip_class chip = program->chip_class; + if (instr->isPseudo()) + return; - if (instr->isPseudo()) { - return; - } else if (can_use_SDWA(chip, instr, false)) { - unsigned def_size = instr_info.definition_size[(int)instr->opcode]; - if (reg.byte() || chip < GFX10 || def_size > rc.bytes() * 8u) - convert_to_SDWA(chip, instr); - return; - } else if (reg.byte() && rc.bytes() == 2 && - can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) { - VOP3_instruction& vop3 = instr->vop3(); - if (reg.byte() == 2) - vop3.opsel |= (1 << 3); /* dst in high half */ - return; - } + if (instr->isVALU()) { + chip_class chip = program->chip_class; + assert(instr->definitions[0].bytes() <= 2); - if (reg.byte() == 2) { - if (instr->opcode == aco_opcode::v_fma_mixlo_f16) + if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode)) + return; + + /* check if we can use opsel */ + if (instr->format == Format::VOP3) { + assert(reg.byte() == 2); + assert(can_use_opsel(chip, instr->opcode, -1, true)); + instr->vop3().opsel |= (1 << 3); /* dst in high half */ + return; + } + + if (instr->opcode == aco_opcode::v_fma_mixlo_f16) { instr->opcode = aco_opcode::v_fma_mixhi_f16; - else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16) - instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi; - else if (instr->opcode == aco_opcode::buffer_load_short_d16) - instr->opcode = aco_opcode::buffer_load_short_d16_hi; - else if (instr->opcode == aco_opcode::flat_load_ubyte_d16) - instr->opcode = aco_opcode::flat_load_ubyte_d16_hi; - else if (instr->opcode == aco_opcode::flat_load_short_d16) - instr->opcode = aco_opcode::flat_load_short_d16_hi; - else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16) - instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi; - else if (instr->opcode == aco_opcode::scratch_load_short_d16) - instr->opcode = aco_opcode::scratch_load_short_d16_hi; - else if (instr->opcode == aco_opcode::global_load_ubyte_d16) - instr->opcode = aco_opcode::global_load_ubyte_d16_hi; - else if (instr->opcode == aco_opcode::global_load_short_d16) - instr->opcode = aco_opcode::global_load_short_d16_hi; - else if (instr->opcode == aco_opcode::ds_read_u8_d16) - instr->opcode = aco_opcode::ds_read_u8_d16_hi; - else if (instr->opcode == aco_opcode::ds_read_u16_d16) - instr->opcode = aco_opcode::ds_read_u16_d16_hi; - else - unreachable("Something went wrong: Impossible register assignment."); + return; + } + + /* use SDWA */ + assert(can_use_SDWA(chip, instr, false)); + convert_to_SDWA(chip, instr); + return; } + + if (reg.byte() == 0) + return; + else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16) + instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi; + else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16) + instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi; + else if (instr->opcode == aco_opcode::buffer_load_short_d16) + instr->opcode = aco_opcode::buffer_load_short_d16_hi; + else if (instr->opcode == aco_opcode::flat_load_ubyte_d16) + instr->opcode = aco_opcode::flat_load_ubyte_d16_hi; + else if (instr->opcode == aco_opcode::flat_load_sbyte_d16) + instr->opcode = 
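/* Sketch, not part of this patch: when a 16-bit definition is assigned to
 * the high half of a VGPR, the code below renames the memory opcode to its
 * _d16_hi variant instead of emitting an extra copy; the hardware then
 * leaves the low half of the destination register untouched. A toy model of
 * the renaming (the enum is hypothetical, the _hi convention is real):
 *
 *   enum class load_op { ubyte_d16, ubyte_d16_hi, short_d16, short_d16_hi };
 *
 *   inline load_op to_high_half(load_op op)
 *   {
 *      switch (op) {
 *      case load_op::ubyte_d16: return load_op::ubyte_d16_hi;
 *      case load_op::short_d16: return load_op::short_d16_hi;
 *      default: return op;   // already a _hi variant
 *      }
 *   }
 */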
aco_opcode::flat_load_sbyte_d16_hi; + else if (instr->opcode == aco_opcode::flat_load_short_d16) + instr->opcode = aco_opcode::flat_load_short_d16_hi; + else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16) + instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi; + else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16) + instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi; + else if (instr->opcode == aco_opcode::scratch_load_short_d16) + instr->opcode = aco_opcode::scratch_load_short_d16_hi; + else if (instr->opcode == aco_opcode::global_load_ubyte_d16) + instr->opcode = aco_opcode::global_load_ubyte_d16_hi; + else if (instr->opcode == aco_opcode::global_load_sbyte_d16) + instr->opcode = aco_opcode::global_load_sbyte_d16_hi; + else if (instr->opcode == aco_opcode::global_load_short_d16) + instr->opcode = aco_opcode::global_load_short_d16_hi; + else if (instr->opcode == aco_opcode::ds_read_u8_d16) + instr->opcode = aco_opcode::ds_read_u8_d16_hi; + else if (instr->opcode == aco_opcode::ds_read_i8_d16) + instr->opcode = aco_opcode::ds_read_i8_d16_hi; + else if (instr->opcode == aco_opcode::ds_read_u16_d16) + instr->opcode = aco_opcode::ds_read_u16_d16_hi; + else + unreachable("Something went wrong: Impossible register assignment."); } void @@ -882,7 +917,7 @@ get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) if (rc.is_subdword()) { for (std::pair>& entry : reg_file.subdword_regs) { assert(reg_file[PhysReg{entry.first}] == 0xF0000000); - if (!bounds.contains(PhysReg{entry.first})) + if (!bounds.contains({PhysReg{entry.first}, rc.size()})) continue; for (unsigned i = 0; i < 4; i += info.stride) { @@ -1046,8 +1081,9 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, n++; continue; } - /* we cannot split live ranges of linear vgprs */ - if (ctx.assignments[reg_file[j]].rc & (1 << 6)) { + /* we cannot split live ranges of linear vgprs inside control flow */ + if (!(ctx.block->kind & block_kind_top_level) && + ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { found = false; break; } @@ -1193,8 +1229,10 @@ get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, break; } - /* we cannot split live ranges of linear vgprs */ - if (ctx.assignments[reg_file[j]].rc & (1 << 6)) { + /* we cannot split live ranges of linear vgprs inside control flow */ + // TODO: ensure that live range splits inside control flow are never necessary + if (!(ctx.block->kind & block_kind_top_level) && + ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { found = false; break; } @@ -1406,19 +1444,21 @@ is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr) if (ctx.assignments[op.tempId()].assigned) { PhysReg reg = ctx.assignments[op.tempId()].reg; - if (first.reg() != 512 && reg != first.advance(i * 4)) - return false; /* not at the best position */ - - if ((reg.reg() - 256) < i) - return false; /* no space for previous operands */ - - first = reg.advance(i * -4); - } else if (first.reg() != 512) { + if (first.reg() == 512) { + PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr); + first = reg.advance(i * -4); + PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u}; + if (!bounds.contains(vec)) /* not enough space for other operands */ + return false; + } else { + if (reg != first.advance(i * 4)) /* not at the best position */ + return false; + } + } else { /* If there's an unexpected temporary, this operand is unlikely to be * placed in the best position. 
*/ - unsigned id = reg_file.get_id(first.advance(i * 4)); - if (id && id != op.tempId()) + if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4)) return false; } } @@ -1488,22 +1528,25 @@ get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, if (split_vec != ctx.split_vectors.end()) { unsigned offset = 0; for (Definition def : split_vec->second->definitions) { - auto affinity_it = ctx.affinities.find(def.tempId()); - if (affinity_it != ctx.affinities.end() && ctx.assignments[affinity_it->second].assigned) { - PhysReg reg = ctx.assignments[affinity_it->second].reg; - reg.reg_b -= offset; - if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) - return reg; + if (ctx.assignments[def.tempId()].affinity) { + assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity]; + if (affinity.assigned) { + PhysReg reg = affinity.reg; + reg.reg_b -= offset; + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) + return reg; + } } offset += def.bytes(); } } - if (ctx.affinities.find(temp.id()) != ctx.affinities.end() && - ctx.assignments[ctx.affinities[temp.id()]].assigned) { - PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg; - if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) - return reg; + if (ctx.assignments[temp.id()].affinity) { + assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity]; + if (affinity.assigned) { + if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg)) + return affinity.reg; + } } std::pair res; @@ -1597,7 +1640,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, PhysReg best_pos{0xFFF}; unsigned num_moves = 0xFF; - bool best_war_hint = true; + bool best_avoid = true; /* test for each operand which definition placement causes the least shuffle instructions */ for (unsigned i = 0, offset = 0; i < instr->operands.size(); @@ -1631,14 +1674,10 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi())) continue; - /* count variables to be moved and check war_hint */ - bool war_hint = false; + /* count variables to be moved and check "avoid" */ + bool avoid = false; bool linear_vgpr = false; for (PhysReg j : reg_win) { - if (linear_vgpr) { - break; - } - if (reg_file[j] != 0) { if (reg_file[j] == 0xF0000000) { PhysReg reg; @@ -1648,14 +1687,21 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, k += reg_file.test(reg, 1); } else { k += 4; - /* we cannot split live ranges of linear vgprs */ - if (ctx.assignments[reg_file[j]].rc & (1 << 6)) - linear_vgpr = true; + linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr(); } } - war_hint |= ctx.war_hint[j]; + avoid |= ctx.war_hint[j]; } - if (linear_vgpr || (war_hint && !best_war_hint)) + + if (linear_vgpr) { + /* we cannot split live ranges of linear vgprs inside control flow */ + if (ctx.block->kind & block_kind_top_level) + avoid = true; + else + continue; + } + + if (avoid && !best_avoid) continue; /* count operands in wrong positions */ @@ -1673,7 +1719,7 @@ get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, best_pos = reg_win.lo(); num_moves = k; - best_war_hint = war_hint; + best_avoid = avoid; } if (num_moves >= bytes) @@ -1745,51 +1791,43 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) default: return; } - /* if all definitions are vgpr, no need to care for SCC */ - bool writes_sgpr = false; + bool writes_linear = 
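/* Sketch, not part of this patch: the scratch-SGPR search that follows.
 * When a pseudo instruction both reads and writes linear values while SCC
 * holds a live value, its lowering must save and restore SCC through a
 * spare SGPR. The scan prefers registers at or below the current high-water
 * mark so register demand does not grow; names and the file size below are
 * hypothetical:
 *
 *   #include <array>
 *
 *   // Returns a free SGPR index, scanning down from the highest SGPR in
 *   // use, then up into untouched registers; -1 if everything is occupied.
 *   inline int find_scratch_sgpr(const std::array<bool, 106>& used, int max_used)
 *   {
 *      for (int r = max_used; r >= 0; r--)
 *         if (!used[r])
 *            return r;
 *      for (int r = max_used + 1; r < (int)used.size(); r++)
 *         if (!used[r])
 *            return r;
 *      return -1;   // the real code falls back to m0 for subdword copies
 *   }
 */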
false; + /* if all definitions are logical vgpr, no need to care for SCC */ for (Definition& def : instr->definitions) { - if (def.getTemp().type() == RegType::sgpr) { - writes_sgpr = true; - break; - } + if (def.getTemp().regClass().is_linear()) + writes_linear = true; } /* if all operands are constant, no need to care either */ - bool reads_sgpr = false; + bool reads_linear = false; bool reads_subdword = false; for (Operand& op : instr->operands) { - if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { - reads_sgpr = true; - break; - } + if (op.isTemp() && op.getTemp().regClass().is_linear()) + reads_linear = true; if (op.isTemp() && op.regClass().is_subdword()) reads_subdword = true; } - bool needs_scratch_reg = - (writes_sgpr && reads_sgpr) || (ctx.program->chip_class <= GFX7 && reads_subdword); + bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) || + (ctx.program->chip_class <= GFX7 && reads_subdword); if (!needs_scratch_reg) return; - if (reg_file[scc]) { - instr->pseudo().tmp_in_scc = true; + instr->pseudo().tmp_in_scc = reg_file[scc]; - int reg = ctx.max_used_sgpr; - for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--) + int reg = ctx.max_used_sgpr; + for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--) + ; + if (reg < 0) { + reg = ctx.max_used_sgpr + 1; + for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++) ; - if (reg < 0) { - reg = ctx.max_used_sgpr + 1; - for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++) - ; - if (reg == ctx.program->max_reg_demand.sgpr) { - assert(reads_subdword && reg_file[m0] == 0); - reg = m0; - } + if (reg == ctx.program->max_reg_demand.sgpr) { + assert(reads_subdword && reg_file[m0] == 0); + reg = m0; } - - adjust_max_used_regs(ctx, s1, reg); - instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg}; - } else { - instr->pseudo().tmp_in_scc = false; } + + adjust_max_used_regs(ctx, s1, reg); + instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg}; } bool @@ -1873,6 +1911,145 @@ get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops); } +void +get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file, + std::vector>& instructions, IDSet& live_in) +{ + /* assign phis with all-matching registers to that register */ + for (aco_ptr& phi : block.instructions) { + if (!is_phi(phi)) + break; + Definition& definition = phi->definitions[0]; + if (definition.isKill() || definition.isFixed()) + continue; + + if (!phi->operands[0].isTemp()) + continue; + + PhysReg reg = phi->operands[0].physReg(); + auto OpsSame = [=](const Operand& op) -> bool + { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); }; + bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame); + if (!all_same) + continue; + + if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) + continue; + + definition.setFixed(reg); + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + } + + /* try to find a register that is used by at least one operand */ + for (aco_ptr& phi : block.instructions) { + if (!is_phi(phi)) + break; + Definition& definition = phi->definitions[0]; + if (definition.isKill() || definition.isFixed()) + continue; + + /* use affinity if available */ + if (ctx.assignments[definition.tempId()].affinity && + 
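/* Sketch, not part of this patch: get_regs_for_phis() tries three
 * strategies in order: (1) a register that every fixed operand already
 * agrees on, (2) the definition's affinity register, and (3) any operand's
 * register, scanned backwards to bias against copies in else-blocks.
 * Strategy (1) in miniature, over hypothetical types:
 *
 *   #include <algorithm>
 *   #include <optional>
 *   #include <vector>
 *
 *   struct phi_op { bool is_temp; bool fixed; unsigned reg; };
 *
 *   inline std::optional<unsigned> all_matching_reg(const std::vector<phi_op>& ops)
 *   {
 *      if (ops.empty() || !ops[0].is_temp || !ops[0].fixed)
 *         return std::nullopt;
 *      unsigned reg = ops[0].reg;
 *      bool same = std::all_of(ops.begin() + 1, ops.end(), [=](const phi_op& op)
 *                              { return op.is_temp && (!op.fixed || op.reg == reg); });
 *      return same ? std::optional<unsigned>(reg) : std::nullopt;
 *   }
 */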
ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) { + assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity]; + assert(affinity.rc == definition.regClass()); + if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) { + definition.setFixed(affinity.reg); + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + continue; + } + } + + /* by going backwards, we aim to avoid copies in else-blocks */ + for (int i = phi->operands.size() - 1; i >= 0; i--) { + const Operand& op = phi->operands[i]; + if (!op.isTemp() || !op.isFixed()) + continue; + + PhysReg reg = op.physReg(); + if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) { + definition.setFixed(reg); + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + break; + } + } + } + + /* find registers for phis where the register was blocked or no operand was assigned */ + for (aco_ptr& phi : block.instructions) { + if (!is_phi(phi)) + break; + + Definition& definition = phi->definitions[0]; + if (definition.isKill()) + continue; + + if (definition.isFixed()) { + instructions.emplace_back(std::move(phi)); + continue; + } + + std::vector> parallelcopy; + definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); + update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); + + /* process parallelcopy */ + for (std::pair pc : parallelcopy) { + /* see if it's a copy from a different phi */ + // TODO: prefer moving some previous phis over live-ins + // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a + // problem in practice since they can only be fixed to exec) + Instruction* prev_phi = NULL; + std::vector>::iterator phi_it; + for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { + if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) + prev_phi = phi_it->get(); + } + if (prev_phi) { + /* if so, just update that phi's register */ + prev_phi->definitions[0].setFixed(pc.second.physReg()); + ctx.assignments[prev_phi->definitions[0].tempId()].set(pc.second); + continue; + } + + /* rename */ + std::unordered_map::iterator orig_it = + ctx.orig_names.find(pc.first.tempId()); + Temp orig = pc.first.getTemp(); + if (orig_it != ctx.orig_names.end()) + orig = orig_it->second; + else + ctx.orig_names[pc.second.tempId()] = orig; + ctx.renames[block.index][orig.id()] = pc.second.getTemp(); + + /* otherwise, this is a live-in and we need to create a new phi + * to move it in this block's predecessors */ + aco_opcode opcode = + pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + std::vector& preds = + pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + aco_ptr new_phi{ + create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; + new_phi->definitions[0] = pc.second; + for (unsigned i = 0; i < preds.size(); i++) + new_phi->operands[i] = Operand(pc.first); + instructions.emplace_back(std::move(new_phi)); + + /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis() + * would re-create this phi later if this is a loop header. 
+ */ + live_in.erase(orig.id()); + } + + register_file.fill(definition); + ctx.assignments[definition.tempId()].set(definition); + instructions.emplace_back(std::move(phi)); + } +} + Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) { @@ -1887,7 +2064,7 @@ Temp handle_live_in(ra_ctx& ctx, Temp val, Block* block) { std::vector& preds = val.is_linear() ? block->linear_preds : block->logical_preds; - if (preds.size() == 0 || val.regClass() == val.regClass().as_linear()) + if (preds.size() == 0) return val; if (preds.size() == 1) { @@ -1910,22 +2087,23 @@ handle_live_in(ra_ctx& ctx, Temp val, Block* block) } if (needs_phi) { + assert(!val.regClass().is_linear_vgpr()); + /* the variable has been renamed differently in the predecessors: we need to insert a phi */ aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; aco_ptr phi{ create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; new_val = ctx.program->allocateTmp(val.regClass()); phi->definitions[0] = Definition(new_val); + ctx.assignments.emplace_back(); + assert(ctx.assignments.size() == ctx.program->peekAllocationId()); for (unsigned i = 0; i < preds.size(); i++) { /* update the operands so that it uses the new affinity */ phi->operands[i] = Operand(ops[i]); assert(ctx.assignments[ops[i].id()].assigned); + assert(ops[i].regClass() == new_val.regClass()); phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg); - if (ops[i].regClass() == new_val.regClass()) - ctx.affinities[new_val.id()] = ops[i].id(); } - ctx.assignments.emplace_back(); - assert(ctx.assignments.size() == ctx.program->peekAllocationId()); block->instructions.insert(block->instructions.begin(), std::move(phi)); } @@ -2111,46 +2289,29 @@ get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) std::vector>::reverse_iterator rit; for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { aco_ptr& instr = *rit; - if (is_phi(instr)) { - if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) { - live.erase(instr->definitions[0].tempId()); - continue; - } - /* collect information about affinity-related temporaries */ - std::vector affinity_related; - /* affinity_related[0] is the last seen affinity-related temp */ - affinity_related.emplace_back(instr->definitions[0].getTemp()); - affinity_related.emplace_back(instr->definitions[0].getTemp()); - for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isKill() && - op.regClass() == instr->definitions[0].regClass()) { - affinity_related.emplace_back(op.getTemp()); - temp_to_phi_ressources[op.tempId()] = phi_ressources.size(); - } - } - phi_ressources.emplace_back(std::move(affinity_related)); - } else { - /* add vector affinities */ - if (instr->opcode == aco_opcode::p_create_vector) { - for (const Operand& op : instr->operands) { - if (op.isTemp() && op.isFirstKill() && - op.getTemp().type() == instr->definitions[0].getTemp().type()) - ctx.vectors[op.tempId()] = instr.get(); - } - } else if (instr->format == Format::MIMG && instr->operands.size() > 4) { - for (unsigned i = 3; i < instr->operands.size(); i++) - ctx.vectors[instr->operands[i].tempId()] = instr.get(); - } + if (is_phi(instr)) + break; - if (instr->opcode == aco_opcode::p_split_vector && - instr->operands[0].isFirstKillBeforeDef()) - ctx.split_vectors[instr->operands[0].tempId()] = instr.get(); - - /* add operands to live variables */ + /* add vector affinities */ + if (instr->opcode == aco_opcode::p_create_vector) { for (const Operand& op : instr->operands) { 
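/* Sketch, not part of this patch: the ctx.vectors map filled above lets the
 * allocator try to place each killed operand of a p_create_vector at the
 * byte offset it will occupy in the result, so the vector can be assembled
 * in place without copies. The offset is just the sum of the preceding
 * operand sizes; types here are hypothetical:
 *
 *   #include <vector>
 *
 *   struct op_info { unsigned temp_id; unsigned bytes; };
 *
 *   inline unsigned vector_offset(const std::vector<op_info>& ops, unsigned idx)
 *   {
 *      unsigned off = 0;
 *      for (unsigned i = 0; i < idx; i++)
 *         off += ops[i].bytes;
 *      return off;
 *   }
 */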
- if (op.isTemp()) - live.insert(op.tempId()); + if (op.isTemp() && op.isFirstKill() && + op.getTemp().type() == instr->definitions[0].getTemp().type()) + ctx.vectors[op.tempId()] = instr.get(); } + } else if (instr->format == Format::MIMG && instr->operands.size() > 4) { + for (unsigned i = 3; i < instr->operands.size(); i++) + ctx.vectors[instr->operands[i].tempId()] = instr.get(); + } + + if (instr->opcode == aco_opcode::p_split_vector && + instr->operands[0].isFirstKillBeforeDef()) + ctx.split_vectors[instr->operands[0].tempId()] = instr.get(); + + /* add operands to live variables */ + for (const Operand& op : instr->operands) { + if (op.isTemp()) + live.insert(op.tempId()); } /* erase definitions from live */ @@ -2197,13 +2358,76 @@ get_affinities(ra_ctx& ctx, std::vector& live_out_per_block) } } } + + /* collect phi affinities */ + for (; rit != block.instructions.rend(); ++rit) { + aco_ptr& instr = *rit; + assert(is_phi(instr)); + + live.erase(instr->definitions[0].tempId()); + if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) + continue; + + assert(instr->definitions[0].isTemp()); + std::unordered_map::iterator it = + temp_to_phi_ressources.find(instr->definitions[0].tempId()); + unsigned index = phi_ressources.size(); + std::vector* affinity_related; + if (it != temp_to_phi_ressources.end()) { + index = it->second; + phi_ressources[index][0] = instr->definitions[0].getTemp(); + affinity_related = &phi_ressources[index]; + } else { + phi_ressources.emplace_back(std::vector{instr->definitions[0].getTemp()}); + affinity_related = &phi_ressources.back(); + } + + for (const Operand& op : instr->operands) { + if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) { + affinity_related->emplace_back(op.getTemp()); + if (block.kind & block_kind_loop_header) + continue; + temp_to_phi_ressources[op.tempId()] = index; + } + } + } + + /* visit the loop header phis first in order to create nested affinities */ + if (block.kind & block_kind_loop_exit) { + /* find loop header */ + auto header_rit = block_rit; + while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth) + header_rit++; + + for (aco_ptr& phi : header_rit->instructions) { + if (!is_phi(phi)) + break; + if (phi->definitions[0].isKill() || phi->definitions[0].isFixed()) + continue; + + /* create an (empty) merge-set for the phi-related variables */ + auto it = temp_to_phi_ressources.find(phi->definitions[0].tempId()); + unsigned index = phi_ressources.size(); + if (it == temp_to_phi_ressources.end()) { + temp_to_phi_ressources[phi->definitions[0].tempId()] = index; + phi_ressources.emplace_back(std::vector{phi->definitions[0].getTemp()}); + } else { + index = it->second; + } + for (unsigned i = 1; i < phi->operands.size(); i++) { + const Operand& op = phi->operands[i]; + if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) { + temp_to_phi_ressources[op.tempId()] = index; + } + } + } + } } /* create affinities */ for (std::vector& vec : phi_ressources) { - assert(vec.size() > 1); for (unsigned i = 1; i < vec.size(); i++) if (vec[i].id() != vec[0].id()) - ctx.affinities[vec[i].id()] = vec[0].id(); + ctx.assignments[vec[i].id()].affinity = vec[0].id(); } } @@ -2219,152 +2443,17 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra std::vector> sgpr_live_in(program->blocks.size()); for (Block& block : program->blocks) { + ctx.block = █ + /* initialize register file */ RegisterFile register_file = init_reg_file(ctx, 
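/* Sketch, not part of this patch: how the merge-sets built above become
 * affinity hints. Element 0 of each phi_ressources vector acts as the
 * representative; every other member points its affinity at it, and
 * get_reg() later tries the representative's register first. In miniature,
 * with the affinity store simplified to a map:
 *
 *   #include <unordered_map>
 *   #include <vector>
 *
 *   using temp_id = unsigned;
 *
 *   inline void create_affinities(const std::vector<std::vector<temp_id>>& merge_sets,
 *                                 std::unordered_map<temp_id, temp_id>& affinity)
 *   {
 *      for (const std::vector<temp_id>& set : merge_sets)
 *         for (size_t i = 1; i < set.size(); i++)
 *            if (set[i] != set[0])
 *               affinity[set[i]] = set[0];   // hint: co-locate with set[0]
 *   }
 */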
live_out_per_block, block); ctx.war_hint.reset(); std::vector> instructions; - std::vector>::iterator instr_it; /* this is a slight adjustment from the paper as we already have phi nodes: * We consider them incomplete phis and only handle the definition. */ - - /* look up the affinities */ - for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); - ++instr_it) { - aco_ptr& phi = *instr_it; - if (!is_phi(phi)) - break; - Definition& definition = phi->definitions[0]; - if (definition.isKill() || definition.isFixed()) - continue; - - if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() && - ctx.assignments[ctx.affinities[definition.tempId()]].assigned) { - assert(ctx.assignments[ctx.affinities[definition.tempId()]].rc == - definition.regClass()); - PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg; - if (reg == scc) { - /* only use scc if all operands are already placed there */ - bool use_scc = - std::all_of(phi->operands.begin(), phi->operands.end(), - [](const Operand& op) - { return op.isTemp() && op.isFixed() && op.physReg() == scc; }); - if (!use_scc) - continue; - } - - /* only assign if register is still free */ - if (!register_file.test(reg, definition.bytes())) { - definition.setFixed(reg); - register_file.fill(definition); - ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; - } - } - } - - /* find registers for phis without affinity or where the register was blocked */ - for (instr_it = block.instructions.begin(); instr_it != block.instructions.end(); - ++instr_it) { - aco_ptr& phi = *instr_it; - if (!is_phi(phi)) - break; - - Definition& definition = phi->definitions[0]; - if (definition.isKill()) - continue; - - if (!definition.isFixed()) { - std::vector> parallelcopy; - /* try to find a register that is used by at least one operand */ - for (int i = phi->operands.size() - 1; i >= 0; i--) { - /* by going backwards, we aim to avoid copies in else-blocks */ - const Operand& op = phi->operands[i]; - if (!op.isTemp() || !op.isFixed()) - continue; - PhysReg reg = op.physReg(); - /* we tried this already on the previous loop */ - if (reg == scc) - continue; - if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) { - definition.setFixed(reg); - break; - } - } - if (!definition.isFixed()) { - definition.setFixed( - get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); - update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); - } - - /* process parallelcopy */ - for (std::pair pc : parallelcopy) { - /* see if it's a copy from a different phi */ - // TODO: prefer moving some previous phis over live-ins - // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a - // problem in practice since they can only be fixed to exec) - Instruction* prev_phi = NULL; - std::vector>::iterator phi_it; - for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { - if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) - prev_phi = phi_it->get(); - } - phi_it = instr_it; - while (!prev_phi && is_phi(*++phi_it)) { - if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) - prev_phi = phi_it->get(); - } - if (prev_phi) { - /* if so, just update that phi's register */ - register_file.clear(prev_phi->definitions[0]); - prev_phi->definitions[0].setFixed(pc.second.physReg()); - ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), - pc.second.regClass()}; - 
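/* Sketch, not part of this patch: the two-level rename bookkeeping used in
 * the parallelcopy handling above. orig_names maps a copy's new temp back
 * to the original it descends from; renames[block] maps an original temp to
 * its current name in that block. Resolving a value is therefore a backward
 * then forward lookup; map types are simplified here:
 *
 *   #include <unordered_map>
 *
 *   using temp_id = unsigned;
 *   using name_map = std::unordered_map<temp_id, temp_id>;
 *
 *   inline temp_id resolve(temp_id t, const name_map& orig_names, const name_map& renames)
 *   {
 *      auto o = orig_names.find(t);       // back to the original name
 *      temp_id orig = o == orig_names.end() ? t : o->second;
 *      auto r = renames.find(orig);       // forward to this block's name
 *      return r == renames.end() ? orig : r->second;
 *   }
 */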
register_file.fill(prev_phi->definitions[0]); - continue; - } - - /* rename */ - std::unordered_map::iterator orig_it = - ctx.orig_names.find(pc.first.tempId()); - Temp orig = pc.first.getTemp(); - if (orig_it != ctx.orig_names.end()) - orig = orig_it->second; - else - ctx.orig_names[pc.second.tempId()] = orig; - ctx.renames[block.index][orig.id()] = pc.second.getTemp(); - - /* otherwise, this is a live-in and we need to create a new phi - * to move it in this block's predecessors */ - aco_opcode opcode = - pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - std::vector& preds = - pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; - aco_ptr new_phi{ - create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; - new_phi->definitions[0] = pc.second; - for (unsigned i = 0; i < preds.size(); i++) - new_phi->operands[i] = Operand(pc.first); - instructions.emplace_back(std::move(new_phi)); - - /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis() - * would re-create this phi later if this is a loop header. - */ - live_out_per_block[block.index].erase(orig.id()); - } - - register_file.fill(definition); - ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; - } - - /* update phi affinities */ - for (const Operand& op : phi->operands) { - if (op.isTemp() && op.regClass() == phi->definitions[0].regClass()) - ctx.affinities[op.tempId()] = definition.tempId(); - } - - instructions.emplace_back(std::move(*instr_it)); - } + get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]); /* fill in sgpr_live_in */ for (unsigned i = 0; i <= ctx.max_used_sgpr; i++) @@ -2372,6 +2461,9 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra sgpr_live_in[block.index][127] = register_file[scc]; /* Handle all other instructions of the block */ + auto NonPhi = [](aco_ptr& instr) -> bool { return instr && !is_phi(instr); }; + std::vector>::iterator instr_it = + std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); for (; instr_it != block.instructions.end(); ++instr_it) { aco_ptr& instr = *instr_it; @@ -2449,17 +2541,22 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16 || (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) || - (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10)) && + (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) || + (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) && instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() && instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 && instr->operands[2].physReg().byte() == 0) { unsigned def_id = instr->definitions[0].tempId(); - auto it = ctx.affinities.find(def_id); - if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned || - instr->operands[2].physReg() == ctx.assignments[it->second].reg || - register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) { + bool use_vop2 = true; + if (ctx.assignments[def_id].affinity) { + assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity]; + 
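/* Sketch, not part of this patch: the demotion guard being built here.
 * Turning v_mad_f32 into v_mac_f32 (and friends) forces the destination to
 * share src2's register, so the demotion is skipped when the definition's
 * affinity points at a different register that is still free. The
 * predicate, boiled down to hypothetical inputs:
 *
 *   inline bool should_use_vop2(bool affinity_assigned, unsigned affinity_reg,
 *                               unsigned src2_reg, bool affinity_reg_free)
 *   {
 *      // Demote unless the affinity names some other register we could
 *      // still get; in that case the VOP3 form avoids a later copy.
 *      return !affinity_assigned || affinity_reg == src2_reg || !affinity_reg_free;
 *   }
 */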
if (affinity.assigned && affinity.reg != instr->operands[2].physReg() && + !register_file.test(affinity.reg, instr->operands[2].bytes())) + use_vop2 = false; + } + if (use_vop2) { instr->format = Format::VOP2; switch (instr->opcode) { case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break; @@ -2468,6 +2565,7 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break; case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break; case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break; + case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break; default: break; } } @@ -2479,7 +2577,8 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 || - instr->opcode == aco_opcode::v_writelane_b32_e64) { + instr->opcode == aco_opcode::v_writelane_b32_e64 || + instr->opcode == aco_opcode::v_dot4c_i32_i8) { instr->definitions[0].setFixed(instr->operands[2].physReg()); } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32) { @@ -2529,7 +2628,7 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra if (!definition.isTemp()) continue; - ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()}; + ctx.assignments[definition.tempId()].set(definition); register_file.fill(definition); } @@ -2576,7 +2675,7 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr); definition->setFixed(reg); if (reg.byte() || register_file.test(reg, 4)) { - add_subdword_definition(program, instr, i, reg); + add_subdword_definition(program, instr, reg); definition = &instr->definitions[i]; /* add_subdword_definition can invalidate the reference */ } @@ -2593,7 +2692,7 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); ctx.defs_done.set(i); - ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()}; + ctx.assignments[definition->tempId()].set(*definition); register_file.fill(*definition); } @@ -2619,9 +2718,12 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(), parallelcopy.size())); + bool linear_vgpr = false; bool sgpr_operands_alias_defs = false; uint64_t sgpr_operands[4] = {0, 0, 0, 0}; for (unsigned i = 0; i < parallelcopy.size(); i++) { + linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); + if (temp_in_scc && parallelcopy[i].first.isTemp() && parallelcopy[i].first.getTemp().type() == RegType::sgpr) { if (!sgpr_operands_alias_defs) { @@ -2649,7 +2751,7 @@ register_allocation(Program* program, std::vector& live_out_per_block, ra ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp(); } - if (temp_in_scc && sgpr_operands_alias_defs) { + if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { /* disable definitions and re-enable operands */ RegisterFile tmp_file(register_file); for (const Definition& def : 
instr->definitions) { diff --git a/mesa 3D driver/src/amd/compiler/aco_scheduler.cpp b/mesa 3D driver/src/amd/compiler/aco_scheduler.cpp index 9b4c9ffa48..bfa08f5b72 100644 --- a/mesa 3D driver/src/amd/compiler/aco_scheduler.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_scheduler.cpp @@ -37,7 +37,7 @@ #define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) #define VMEM_MAX_MOVES (256 - ctx.num_waves * 16) /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */ -#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8) +#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2) #define POS_EXP_MAX_MOVES 512 namespace aco { @@ -788,6 +788,7 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& registe int window_size = VMEM_WINDOW_SIZE; int max_moves = VMEM_MAX_MOVES; int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST; + bool only_clauses = false; int16_t k = 0; /* first, check if we have instructions before current to move down */ @@ -822,12 +823,28 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& registe /* We can't easily tell how much this will decrease the def-to-use * distances, so just use how far it will be moved as a heuristic. */ part_of_clause = - grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get()); + grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get()); } /* if current depends on candidate, add additional dependencies and continue */ - bool can_move_down = !is_vmem || part_of_clause; - + bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty(); + if (only_clauses) { + /* In case of high register pressure, only try to form clauses, + * and only if the previous clause is not larger + * than the current one will be. + */ + if (part_of_clause) { + int clause_size = cursor.insert_idx - cursor.insert_idx_clause; + int prev_clause_size = 1; + while (should_form_clause(current, + block->instructions[candidate_idx - prev_clause_size].get())) + prev_clause_size++; + if (prev_clause_size > clause_size + 1) + break; + } else { + can_move_down = false; + } + } HazardResult haz = perform_hazard_query(part_of_clause ? 
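/* Sketch, not part of this patch: the "only_clauses" pressure fallback
 * added below. Once a downward move fails on register pressure, the
 * scheduler stops general code motion but keeps forming memory clauses, and
 * it will not grow the current clause by cannibalizing an earlier clause
 * that is already larger. The size comparison, isolated:
 *
 *   // Grow the current clause only if the clause we would steal from is
 *   // not larger than the current one will become.
 *   inline bool may_grow_clause(int current_clause_size, int prev_clause_size)
 *   {
 *      return prev_clause_size <= current_clause_size + 1;
 *   }
 */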
&clause_hq : &indep_hq, candidate.get(), false); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || @@ -838,6 +855,8 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& registe break; if (!can_move_down) { + if (part_of_clause) + break; add_to_hazard_query(&indep_hq, candidate.get()); add_to_hazard_query(&clause_hq, candidate.get()); ctx.mv.downwards_skip(cursor); @@ -847,12 +866,20 @@ schedule_VMEM(sched_ctx& ctx, Block* block, std::vector& registe Instruction* candidate_ptr = candidate.get(); MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause); if (res == move_fail_ssa || res == move_fail_rar) { + if (part_of_clause) + break; add_to_hazard_query(&indep_hq, candidate.get()); add_to_hazard_query(&clause_hq, candidate.get()); ctx.mv.downwards_skip(cursor); continue; } else if (res == move_fail_pressure) { - break; + only_clauses = true; + if (part_of_clause) + break; + add_to_hazard_query(&indep_hq, candidate.get()); + add_to_hazard_query(&clause_hq, candidate.get()); + ctx.mv.downwards_skip(cursor); + continue; } if (part_of_clause) add_to_hazard_query(&indep_hq, candidate_ptr); diff --git a/mesa 3D driver/src/amd/compiler/aco_spill.cpp b/mesa 3D driver/src/amd/compiler/aco_spill.cpp index 40a4d8c0fc..96f3bb8506 100644 --- a/mesa 3D driver/src/amd/compiler/aco_spill.cpp +++ b/mesa 3D driver/src/amd/compiler/aco_spill.cpp @@ -28,12 +28,26 @@ #include "common/sid.h" +#include +#include #include #include #include +#include #include #include +namespace std { +template <> struct hash { + size_t operator()(aco::Temp temp) const noexcept + { + uint32_t v; + std::memcpy(&v, &temp, sizeof(temp)); + return std::hash{}(v); + } +}; +} // namespace std + /* * Implements the spilling algorithm on SSA-form from * "Register Spilling and Live-Range Splitting for SSA-Form Programs" @@ -53,17 +67,19 @@ struct spill_ctx { Program* program; std::vector> register_demand; std::vector> renames; - std::vector> spills_entry; - std::vector> spills_exit; + std::vector> spills_entry; + std::vector> spills_exit; + std::vector processed; - std::stack loop_header; - std::vector>> next_use_distances_start; - std::vector>> next_use_distances_end; + std::stack> loop_header; + std::vector>> next_use_distances_start; + std::vector>> next_use_distances_end; + std::vector>> local_next_use_distance; /* Working buffer */ std::vector>> interferences; std::vector> affinities; std::vector is_reloaded; - std::map remat; - std::map remat_used; + std::unordered_map remat; + std::set unused_remats; unsigned wave_size; spill_ctx(const RegisterDemand target_pressure_, Program* program_, @@ -152,15 +168,17 @@ get_dominator(int idx_a, int idx_b, Program* program, bool is_linear) } void -next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& worklist) +next_uses_per_block(spill_ctx& ctx, unsigned block_idx, uint32_t& worklist) { Block* block = &ctx.program->blocks[block_idx]; - std::map> next_uses = ctx.next_use_distances_end[block_idx]; + ctx.next_use_distances_start[block_idx] = ctx.next_use_distances_end[block_idx]; + auto& next_use_distances_start = ctx.next_use_distances_start[block_idx]; /* to compute the next use distance at the beginning of the block, we have to add the block's * size */ - for (std::map>::iterator it = next_uses.begin(); - it != next_uses.end(); ++it) + for (std::unordered_map>::iterator it = + next_use_distances_start.begin(); + it != next_use_distances_start.end(); ++it) it->second.second = it->second.second + block->instructions.size(); int idx = 
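/* Sketch, not part of this patch: the std::hash<aco::Temp> specialization
 * above, generalized to any trivially copyable 4-byte key. std::memcpy is
 * the aliasing-safe way to reinterpret the object's bytes; a pointer cast
 * would invoke undefined behaviour:
 *
 *   #include <cstdint>
 *   #include <cstring>
 *   #include <functional>
 *
 *   template <typename T>
 *   size_t hash_pod32(const T& key) noexcept
 *   {
 *      static_assert(sizeof(T) == sizeof(uint32_t), "expects a 4-byte key");
 *      uint32_t v;
 *      std::memcpy(&v, &key, sizeof(v));   // bit-copy, optimized to a load
 *      return std::hash<uint32_t>{}(v);
 *   }
 */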
block->instructions.size() - 1; @@ -172,7 +190,7 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& work for (const Definition& def : instr->definitions) { if (def.isTemp()) - next_uses.erase(def.getTemp()); + next_use_distances_start.erase(def.getTemp()); } for (const Operand& op : instr->operands) { @@ -182,59 +200,67 @@ next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& work if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear()) continue; if (op.isTemp()) - next_uses[op.getTemp()] = {block_idx, idx}; + next_use_distances_start[op.getTemp()] = {block_idx, idx}; } idx--; } - assert(block_idx != 0 || next_uses.empty()); - ctx.next_use_distances_start[block_idx] = next_uses; + assert(block_idx != 0 || next_use_distances_start.empty()); + std::unordered_set phi_defs; while (idx >= 0) { aco_ptr& instr = block->instructions[idx]; assert(instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi); - if (!instr->definitions[0].isTemp()) { - idx--; - continue; + std::pair distance{block_idx, 0}; + + auto it = instr->definitions[0].isTemp() ? next_use_distances_start.find(instr->definitions[0].getTemp()) + : next_use_distances_start.end(); + if (it != next_use_distances_start.end() && + phi_defs.insert(instr->definitions[0].getTemp()).second) { + distance = it->second; } - auto it = next_uses.find(instr->definitions[0].getTemp()); - std::pair distance = - it == next_uses.end() ? std::make_pair(block_idx, 0u) : it->second; for (unsigned i = 0; i < instr->operands.size(); i++) { unsigned pred_idx = instr->opcode == aco_opcode::p_phi ? block->logical_preds[i] : block->linear_preds[i]; if (instr->operands[i].isTemp()) { - if (ctx.next_use_distances_end[pred_idx].find(instr->operands[i].getTemp()) == - ctx.next_use_distances_end[pred_idx].end() || - ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] != distance) - worklist.insert(pred_idx); - ctx.next_use_distances_end[pred_idx][instr->operands[i].getTemp()] = distance; + auto insert_result = ctx.next_use_distances_end[pred_idx].insert( + std::make_pair(instr->operands[i].getTemp(), distance)); + const bool inserted = insert_result.second; + std::pair& entry_distance = insert_result.first->second; + if (inserted || entry_distance != distance) + worklist = std::max(worklist, pred_idx + 1); + entry_distance = distance; } } - next_uses.erase(instr->definitions[0].getTemp()); idx--; } /* all remaining live vars must be live-out at the predecessors */ - for (std::pair> pair : next_uses) { + for (std::pair>& pair : next_use_distances_start) { Temp temp = pair.first; + if (phi_defs.count(temp)) { + continue; + } uint32_t distance = pair.second.second; uint32_t dom = pair.second.first; std::vector& preds = temp.is_linear() ? 
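/* Sketch, not part of this patch: the fixpoint driver used by
 * compute_global_next_uses() above replaces a std::set worklist with a
 * single uint32_t high-water mark. Blocks are swept from the highest index
 * down; when a block changes a predecessor's live-out distances, the mark
 * is raised past that predecessor so it gets (re)visited. The generic
 * shape:
 *
 *   #include <cstdint>
 *
 *   template <typename F>   // F: void(uint32_t idx, uint32_t& worklist)
 *   void fixpoint_sweep(uint32_t count, F process)
 *   {
 *      uint32_t worklist = count;   // (highest dirty index + 1); 0 means done
 *      while (worklist) {
 *         uint32_t idx = --worklist;
 *         process(idx, worklist);   // may raise worklist to trigger a re-sweep
 *      }
 *   }
 */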
block->linear_preds : block->logical_preds; for (unsigned pred_idx : preds) { if (ctx.program->blocks[pred_idx].loop_nest_depth > block->loop_nest_depth) distance += 0xFFFF; - if (ctx.next_use_distances_end[pred_idx].find(temp) != - ctx.next_use_distances_end[pred_idx].end()) { - dom = get_dominator(dom, ctx.next_use_distances_end[pred_idx][temp].first, ctx.program, - temp.is_linear()); - distance = std::min(ctx.next_use_distances_end[pred_idx][temp].second, distance); + auto insert_result = ctx.next_use_distances_end[pred_idx].insert( + std::make_pair(temp, std::pair{})); + const bool inserted = insert_result.second; + std::pair& entry_distance = insert_result.first->second; + + if (!inserted) { + dom = get_dominator(dom, entry_distance.first, ctx.program, temp.is_linear()); + distance = std::min(entry_distance.second, distance); + } + if (entry_distance != std::pair{dom, distance}) { + worklist = std::max(worklist, pred_idx + 1); + entry_distance = {dom, distance}; } - if (ctx.next_use_distances_end[pred_idx][temp] != - std::pair{dom, distance}) - worklist.insert(pred_idx); - ctx.next_use_distances_end[pred_idx][temp] = {dom, distance}; } } } @@ -244,14 +270,10 @@ compute_global_next_uses(spill_ctx& ctx) { ctx.next_use_distances_start.resize(ctx.program->blocks.size()); ctx.next_use_distances_end.resize(ctx.program->blocks.size()); - std::set worklist; - for (Block& block : ctx.program->blocks) - worklist.insert(block.index); - while (!worklist.empty()) { - std::set::reverse_iterator b_it = worklist.rbegin(); - unsigned block_idx = *b_it; - worklist.erase(block_idx); + uint32_t worklist = ctx.program->blocks.size(); + while (worklist) { + unsigned block_idx = --worklist; next_uses_per_block(ctx, block_idx, worklist); } } @@ -287,7 +309,7 @@ should_rematerialize(aco_ptr& instr) aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) { - std::map::iterator remat = ctx.remat.find(tmp); + std::unordered_map::iterator remat = ctx.remat.find(tmp); if (remat != ctx.remat.end()) { Instruction* instr = remat->second.instr; assert((instr->isVOP1() || instr->isSOP1() || instr->isPseudo() || instr->isSOPK()) && @@ -317,7 +339,7 @@ do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t spill_id) if (instr->operands[i].isTemp()) { assert(false && "unsupported"); if (ctx.remat.count(instr->operands[i].getTemp())) - ctx.remat_used[ctx.remat[instr->operands[i].getTemp()].instr] = true; + ctx.unused_remats.erase(ctx.remat[instr->operands[i].getTemp()].instr); } } res->definitions[0] = Definition(new_name); @@ -346,7 +368,7 @@ get_rematerialize_info(spill_ctx& ctx) for (const Definition& def : instr->definitions) { if (def.isTemp()) { ctx.remat[def.getTemp()] = remat_info{instr.get()}; - ctx.remat_used[instr.get()] = false; + ctx.unused_remats.insert(instr.get()); } } } @@ -354,15 +376,22 @@ get_rematerialize_info(spill_ctx& ctx) } } -std::vector> -local_next_uses(spill_ctx& ctx, Block* block) +void +update_local_next_uses(spill_ctx& ctx, Block* block, + std::vector>>& local_next_uses) { - std::vector> local_next_uses(block->instructions.size()); + if (local_next_uses.size() < block->instructions.size()) { + /* Allocate more next-use-maps. Note that by never reducing the vector size, we enable + * future calls to this function to re-use already allocated map memory. 
*/ + local_next_uses.resize(block->instructions.size()); + } - std::map next_uses; - for (std::pair> pair : - ctx.next_use_distances_end[block->index]) - next_uses[pair.first] = pair.second.second + block->instructions.size(); + local_next_uses[block->instructions.size() - 1].clear(); + for (std::pair>& pair : + ctx.next_use_distances_end[block->index]) { + local_next_uses[block->instructions.size() - 1].push_back(std::make_pair( + (Temp)pair.first, pair.second.second + block->instructions.size())); + } for (int idx = block->instructions.size() - 1; idx >= 0; idx--) { aco_ptr& instr = block->instructions[idx]; @@ -371,21 +400,35 @@ local_next_uses(spill_ctx& ctx, Block* block) if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) break; + if (idx != (int)block->instructions.size() - 1) { + local_next_uses[idx] = local_next_uses[idx + 1]; + } + for (const Operand& op : instr->operands) { if (op.isFixed() && op.physReg() == exec) continue; if (op.regClass().type() == RegType::vgpr && op.regClass().is_linear()) continue; - if (op.isTemp()) - next_uses[op.getTemp()] = idx; + if (op.isTemp()) { + auto it = std::find_if(local_next_uses[idx].begin(), local_next_uses[idx].end(), + [op](auto& pair) { return pair.first == op.getTemp(); }); + if (it == local_next_uses[idx].end()) { + local_next_uses[idx].push_back(std::make_pair(op.getTemp(), idx)); + } else { + it->second = idx; + } + } } for (const Definition& def : instr->definitions) { - if (def.isTemp()) - next_uses.erase(def.getTemp()); + if (def.isTemp()) { + auto it = std::find_if(local_next_uses[idx].begin(), local_next_uses[idx].end(), + [def](auto& pair) { return pair.first == def.getTemp(); }); + if (it != local_next_uses[idx].end()) { + local_next_uses[idx].erase(it); + } + } } - local_next_uses[idx] = next_uses; } - return local_next_uses; } RegisterDemand @@ -442,7 +485,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) return {0, 0}; /* next use distances at the beginning of the current block */ - auto& next_use_distances = ctx.next_use_distances_start[block_idx]; + const auto& next_use_distances = ctx.next_use_distances_start[block_idx]; /* loop header block */ if (block->loop_nest_depth > ctx.program->blocks[block_idx - 1].loop_nest_depth) { @@ -489,12 +532,12 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) unsigned distance = 0; Temp to_spill; - for (std::pair> pair : next_use_distances) { + for (const std::pair>& pair : + next_use_distances) { if (pair.first.type() == type && (pair.second.first >= loop_end || (ctx.remat.count(pair.first) && type == RegType::sgpr)) && - pair.second.second > distance && - ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + pair.second.second > distance && !ctx.spills_entry[block_idx].count(pair.first)) { to_spill = pair.first; distance = pair.second.second; } @@ -509,8 +552,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) } uint32_t spill_id; - if (ctx.spills_exit[block_idx - 1].find(to_spill) == - ctx.spills_exit[block_idx - 1].end()) { + if (!ctx.spills_exit[block_idx - 1].count(to_spill)) { spill_id = ctx.allocate_spill_id(to_spill.regClass()); } else { spill_id = ctx.spills_exit[block_idx - 1][to_spill]; @@ -533,9 +575,10 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) Temp to_spill; type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? 
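/* Sketch, not part of this patch: the flat-vector next-use table above.
 * With only a handful of live temps per instruction, a linear scan over a
 * std::vector of (temp, index) pairs beats a tree map on both allocation
 * and lookup, and the vector's storage is reused across blocks. Its
 * lookup-or-insert idiom, generalized:
 *
 *   #include <algorithm>
 *   #include <utility>
 *   #include <vector>
 *
 *   template <typename K, typename V>
 *   void upsert(std::vector<std::pair<K, V>>& table, const K& key, const V& value)
 *   {
 *      auto it = std::find_if(table.begin(), table.end(),
 *                             [&](const std::pair<K, V>& p) { return p.first == key; });
 *      if (it == table.end())
 *         table.emplace_back(key, value);
 *      else
 *         it->second = value;
 *   }
 */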
RegType::vgpr : RegType::sgpr; - for (std::pair> pair : next_use_distances) { + for (const std::pair>& pair : + next_use_distances) { if (pair.first.type() == type && pair.second.second > distance && - ctx.spills_entry[block_idx].find(pair.first) == ctx.spills_entry[block_idx].end()) { + !ctx.spills_entry[block_idx].count(pair.first)) { to_spill = pair.first; distance = pair.second.second; } @@ -554,9 +597,12 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) /* keep variables spilled if they are alive and not used in the current block */ unsigned pred_idx = block->linear_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::sgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && - next_use_distances[pair.first].first != block_idx) { + if (pair.first.type() != RegType::sgpr) { + continue; + } + auto next_use_distance_it = next_use_distances.find(pair.first); + if (next_use_distance_it != next_use_distances.end() && + next_use_distance_it->second.first != block_idx) { ctx.spills_entry[block_idx].insert(pair); spilled_registers.sgpr += pair.first.size(); } @@ -564,9 +610,12 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) if (block->logical_preds.size() == 1) { pred_idx = block->logical_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::vgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && - next_use_distances[pair.first].first != block_idx) { + if (pair.first.type() != RegType::vgpr) { + continue; + } + auto next_use_distance_it = next_use_distances.find(pair.first); + if (next_use_distance_it != next_use_distances.end() && + next_use_distance_it->second.first != block_idx) { ctx.spills_entry[block_idx].insert(pair); spilled_registers.vgpr += pair.first.size(); } @@ -578,8 +627,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) if (block->register_demand.sgpr - spilled_registers.sgpr > ctx.target_pressure.sgpr) { pred_idx = block->linear_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::sgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && + if (pair.first.type() == RegType::sgpr && next_use_distances.count(pair.first) && ctx.spills_entry[block_idx].insert(pair).second) { spilled_registers.sgpr += pair.first.size(); } @@ -589,8 +637,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) block->logical_preds.size() == 1) { pred_idx = block->logical_preds[0]; for (std::pair pair : ctx.spills_exit[pred_idx]) { - if (pair.first.type() == RegType::vgpr && - next_use_distances.find(pair.first) != next_use_distances.end() && + if (pair.first.type() == RegType::vgpr && next_use_distances.count(pair.first) && ctx.spills_entry[block_idx].insert(pair).second) { spilled_registers.vgpr += pair.first.size(); } @@ -604,7 +651,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) std::set partial_spills; /* keep variables spilled on all incoming paths */ - for (std::pair> pair : next_use_distances) { + for (const std::pair>& pair : next_use_distances) { std::vector& preds = pair.first.is_linear() ? 
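/* Sketch, not part of this patch: the candidate loops above implement
 * Belady's heuristic, i.e. among live temps of the over-subscribed register
 * type that are not yet spilled, pick the one whose next use is furthest
 * away. Isolated, over hypothetical data:
 *
 *   #include <vector>
 *
 *   struct live_var { int id; bool right_type; bool already_spilled; unsigned next_use; };
 *
 *   // Returns the id of the temp to spill, or -1 if there is no candidate.
 *   inline int pick_spill(const std::vector<live_var>& live)
 *   {
 *      int best = -1;
 *      unsigned best_dist = 0;
 *      for (const live_var& v : live) {
 *         if (v.right_type && !v.already_spilled && v.next_use > best_dist) {
 *            best = v.id;
 *            best_dist = v.next_use;
 *         }
 *      }
 *      return best;
 *   }
 */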
block->linear_preds : block->logical_preds; /* If it can be rematerialized, keep the variable spilled if all predecessors do not reload @@ -618,12 +665,11 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) uint32_t spill_id = 0; for (unsigned pred_idx : preds) { /* variable is not even live at the predecessor: probably from a phi */ - if (ctx.next_use_distances_end[pred_idx].find(pair.first) == - ctx.next_use_distances_end[pred_idx].end()) { + if (!ctx.next_use_distances_end[pred_idx].count(pair.first)) { spill = false; break; } - if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end()) { + if (!ctx.spills_exit[pred_idx].count(pair.first)) { if (!remat) spill = false; } else { @@ -660,8 +706,7 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) continue; } - if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == - ctx.spills_exit[preds[i]].end()) + if (!ctx.spills_exit[preds[i]].count(phi->operands[i].getTemp())) spill = false; else partial_spills.insert(phi->definitions[0].getTemp()); @@ -686,10 +731,10 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) RegType type = reg_pressure.vgpr > ctx.target_pressure.vgpr ? RegType::vgpr : RegType::sgpr; while (it != partial_spills.end()) { - assert(ctx.spills_entry[block_idx].find(*it) == ctx.spills_entry[block_idx].end()); + assert(!ctx.spills_entry[block_idx].count(*it)); - if (it->type() == type && next_use_distances[*it].second > distance) { - distance = next_use_distances[*it].second; + if (it->type() == type && next_use_distances.at(*it).second > distance) { + distance = next_use_distances.at(*it).second; to_spill = *it; } ++it; @@ -722,18 +767,19 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) unsigned insert_idx = 0; RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0); - for (std::pair> live : + for (std::pair>& live : ctx.next_use_distances_start[block_idx]) { const unsigned pred_idx = block->linear_preds[0]; if (!live.first.is_linear()) continue; /* still spilled */ - if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end()) + if (ctx.spills_entry[block_idx].count(live.first)) continue; /* in register at end of predecessor */ - if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) { + auto spills_exit_it = ctx.spills_exit[pred_idx].find(live.first); + if (spills_exit_it == ctx.spills_exit[pred_idx].end()) { std::map::iterator it = ctx.renames[pred_idx].find(live.first); if (it != ctx.renames[pred_idx].end()) ctx.renames[block_idx].insert(*it); @@ -742,8 +788,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) /* variable is spilled at predecessor and live at current block: create reload instruction */ Temp new_name = ctx.program->allocateTmp(live.first.regClass()); - aco_ptr reload = - do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); + aco_ptr reload = do_reload(ctx, live.first, new_name, spills_exit_it->second); instructions.emplace_back(std::move(reload)); reg_demand.push_back(demand_before); ctx.renames[block_idx][live.first] = new_name; @@ -758,16 +803,17 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) } while (instructions.back()->opcode != aco_opcode::p_logical_start); unsigned pred_idx = block->logical_preds[0]; - for (std::pair> live : + for (std::pair>& live : ctx.next_use_distances_start[block_idx]) { if (live.first.is_linear()) continue; /* still spilled */ - if 
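/* Sketch, not part of this patch: the remat-tracking inversion visible in
 * this hunk. Instead of a map from remat instruction to a "was used" flag,
 * the spiller keeps a set of still-unused remat sources, seeded with every
 * rematerializable definition; the first reload that depends on one erases
 * it, and whatever remains in the set can be DCE'd afterwards. Modelled
 * here with an assumed container type:
 *
 *   #include <unordered_set>
 *
 *   struct Instruction;   // opaque for this sketch
 *
 *   inline void mark_remat_used(std::unordered_set<Instruction*>& unused_remats,
 *                               Instruction* instr)
 *   {
 *      unused_remats.erase(instr);   // erasing an absent element is a no-op
 *   }
 */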
-       if (ctx.spills_entry[block_idx].find(live.first) != ctx.spills_entry[block_idx].end())
+       if (ctx.spills_entry[block_idx].count(live.first))
          continue;

        /* in register at end of predecessor */
-       if (ctx.spills_exit[pred_idx].find(live.first) == ctx.spills_exit[pred_idx].end()) {
+       auto spills_exit_it = ctx.spills_exit[pred_idx].find(live.first);
+       if (spills_exit_it == ctx.spills_exit[pred_idx].end()) {
          std::map<Temp, Temp>::iterator it = ctx.renames[pred_idx].find(live.first);
          if (it != ctx.renames[pred_idx].end())
             ctx.renames[block_idx].insert(*it);
@@ -778,7 +824,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
         * create reload instruction */
        Temp new_name = ctx.program->allocateTmp(live.first.regClass());
        aco_ptr<Instruction> reload =
-          do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]);
+          do_reload(ctx, live.first, new_name, spills_exit_it->second);
        instructions.emplace_back(std::move(reload));
        reg_demand.emplace_back(reg_demand.back());
        ctx.renames[block_idx][live.first] = new_name;
@@ -812,8 +858,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)

       /* if the phi is not spilled, add to instructions */
       if (!phi->definitions[0].isTemp() ||
-          ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==
-             ctx.spills_entry[block_idx].end()) {
+          !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp())) {
          instructions.emplace_back(std::move(phi));
          continue;
       }
@@ -836,10 +881,10 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
         std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
         /* prevent the defining instruction from being DCE'd if it could be rematerialized */
         if (rename_it == ctx.renames[preds[i]].end() && ctx.remat.count(var))
-           ctx.remat_used[ctx.remat[var].instr] = true;
+           ctx.unused_remats.erase(ctx.remat[var].instr);

         /* check if variable is already spilled at predecessor */
-        std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
+        auto spilled = ctx.spills_exit[pred_idx].find(var);
         if (spilled != ctx.spills_exit[pred_idx].end()) {
           if (spilled->second != def_spill_id)
              ctx.add_affinity(def_spill_id, spilled->second);
@@ -889,7 +934,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)

      for (unsigned pred_idx : preds) {
         /* variable is already spilled at predecessor */
-        std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(pair.first);
+        auto spilled = ctx.spills_exit[pred_idx].find(pair.first);
         if (spilled != ctx.spills_exit[pred_idx].end()) {
           if (spilled->second != pair.second)
              ctx.add_affinity(pair.second, spilled->second);
@@ -897,8 +942,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
        }

        /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
-       if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
-           ctx.next_use_distances_end[pred_idx].end())
+       if (!ctx.next_use_distances_end[pred_idx].count(pair.first))
          continue;

        /* add interferences between spilled variable and predecessors exit spills */
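These hunks also flip the rematerialization bookkeeping from a `remat_used` map of instruction-to-bool into an `unused_remats` set: presumably every rematerializable instruction starts out in the set and is erased on its first real use, leaving dead-code elimination with a plain membership test. A small sketch of that erase-on-first-use pattern (the containers and names here are illustrative, not the ACO types):

#include <cstdio>
#include <set>
#include <string>
#include <vector>

int main()
{
   std::vector<std::string> instrs{"v_mov_b32", "s_mov_b32"};

   /* start by assuming every rematerializable instruction is unused */
   std::set<const std::string*> unused_remats;
   for (const std::string& i : instrs)
      unused_remats.insert(&i);

   /* first use: erase from the set instead of setting a bool to true */
   unused_remats.erase(&instrs[0]);

   /* a later DCE-style pass keeps an instruction iff it is not still in the set */
   for (const std::string& i : instrs)
      printf("%s: %s\n", i.c_str(), unused_remats.count(&i) ? "dead" : "kept");
   return 0;
}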
@@ -938,8 +982,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
    for (aco_ptr<Instruction>& phi : instructions) {
       assert(phi->opcode == aco_opcode::p_phi || phi->opcode == aco_opcode::p_linear_phi);
       assert(!phi->definitions[0].isTemp() ||
-             ctx.spills_entry[block_idx].find(phi->definitions[0].getTemp()) ==
-                ctx.spills_entry[block_idx].end());
+             !ctx.spills_entry[block_idx].count(phi->definitions[0].getTemp()));
       std::vector<unsigned>& preds =
          phi->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
@@ -949,15 +992,18 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
         unsigned pred_idx = preds[i];

         /* if the operand was reloaded, rename */
-        if (ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp()) ==
-            ctx.spills_exit[pred_idx].end()) {
+        if (!ctx.spills_exit[pred_idx].count(phi->operands[i].getTemp())) {
           std::map<Temp, Temp>::iterator it =
              ctx.renames[pred_idx].find(phi->operands[i].getTemp());
-          if (it != ctx.renames[pred_idx].end())
+          if (it != ctx.renames[pred_idx].end()) {
             phi->operands[i].setTemp(it->second);
           /* prevent the defining instruction from being DCE'd if it could be rematerialized */
-          else if (ctx.remat.count(phi->operands[i].getTemp()))
-             ctx.remat_used[ctx.remat[phi->operands[i].getTemp()].instr] = true;
+          } else {
+             auto remat_it = ctx.remat.find(phi->operands[i].getTemp());
+             if (remat_it != ctx.remat.end()) {
+                ctx.unused_remats.erase(remat_it->second.instr);
+             }
+          }
           continue;
        }
@@ -993,10 +1039,10 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)

    /* iterate live variables for which to reload */
    // TODO: reload at current block if variable is spilled on all predecessors
-   for (std::pair<Temp, std::pair<uint32_t, uint32_t>> pair :
+   for (std::pair<const Temp, std::pair<uint32_t, uint32_t>>& pair :
        ctx.next_use_distances_start[block_idx]) {
      /* skip spilled variables */
-     if (ctx.spills_entry[block_idx].find(pair.first) != ctx.spills_entry[block_idx].end())
+     if (ctx.spills_entry[block_idx].count(pair.first))
        continue;
      std::vector<unsigned> preds =
         pair.first.is_linear() ? block->linear_preds : block->logical_preds;
@@ -1004,15 +1050,14 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
      /* variable is dead at predecessor, it must be from a phi */
      bool is_dead = false;
      for (unsigned pred_idx : preds) {
-        if (ctx.next_use_distances_end[pred_idx].find(pair.first) ==
-            ctx.next_use_distances_end[pred_idx].end())
+        if (!ctx.next_use_distances_end[pred_idx].count(pair.first))
          is_dead = true;
      }
      if (is_dead)
        continue;
      for (unsigned pred_idx : preds) {
        /* the variable is not spilled at the predecessor */
-       if (ctx.spills_exit[pred_idx].find(pair.first) == ctx.spills_exit[pred_idx].end())
+       if (!ctx.spills_exit[pred_idx].count(pair.first))
          continue;

        /* variable is spilled at predecessor and has to be reloaded */
@@ -1038,7 +1083,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
      Temp rename = Temp();
      bool is_same = true;
      for (unsigned pred_idx : preds) {
-        if (ctx.renames[pred_idx].find(pair.first) == ctx.renames[pred_idx].end()) {
+        if (!ctx.renames[pred_idx].count(pair.first)) {
          if (rename == Temp())
             rename = pair.first;
          else
@@ -1062,7 +1107,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
        rename = ctx.program->allocateTmp(pair.first.regClass());
        for (unsigned i = 0; i < phi->operands.size(); i++) {
          Temp tmp;
-         if (ctx.renames[preds[i]].find(pair.first) != ctx.renames[preds[i]].end()) {
+         if (ctx.renames[preds[i]].count(pair.first)) {
            tmp = ctx.renames[preds[i]][pair.first];
          } else if (preds[i] >= block_idx) {
            tmp = rename;
@@ -1070,7 +1115,7 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
            tmp = pair.first;
            /* prevent the defining instruction from being DCE'd if it could be rematerialized */
            if (ctx.remat.count(tmp))
-              ctx.remat_used[ctx.remat[tmp].instr] = true;
+              ctx.unused_remats.erase(ctx.remat[tmp].instr);
         }
         phi->operands[i] = Operand(tmp);
       }
@@ -1106,12 +1151,10 @@ add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
 }

 void
-process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
-              std::map<Temp, uint32_t>& current_spills, RegisterDemand spilled_registers)
+process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand spilled_registers)
 {
    assert(!ctx.processed[block_idx]);

-   std::vector<std::map<Temp, uint32_t>> local_next_use_distance;
    std::vector<aco_ptr<Instruction>> instructions;
    unsigned idx = 0;
@@ -1121,25 +1164,35 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
       instructions.emplace_back(std::move(block->instructions[idx++]));
    }

-   if (block->register_demand.exceeds(ctx.target_pressure))
-      local_next_use_distance = local_next_uses(ctx, block);
+   if (block->register_demand.exceeds(ctx.target_pressure)) {
+      update_local_next_uses(ctx, block, ctx.local_next_use_distance);
+   } else {
+      /* We won't use local_next_use_distance, so no initialization needed */
+   }
+
+   auto& current_spills = ctx.spills_exit[block_idx];

    while (idx < block->instructions.size()) {
       aco_ptr<Instruction>& instr = block->instructions[idx];

       std::map<Temp, std::pair<Temp, uint32_t>> reloads;
-      std::map<Temp, uint32_t> spills;
+
       /* rename and reload operands */
       for (Operand& op : instr->operands) {
          if (!op.isTemp())
             continue;
-         if (current_spills.find(op.getTemp()) == current_spills.end()) {
+         if (!current_spills.count(op.getTemp())) {
            /* the Operand is in register: check if it was renamed */
-           if (ctx.renames[block_idx].find(op.getTemp()) != ctx.renames[block_idx].end())
-              op.setTemp(ctx.renames[block_idx][op.getTemp()]);
-           /* prevent it's definining instruction from being DCE'd if it could be rematerialized */
-           else if (ctx.remat.count(op.getTemp()))
-              ctx.remat_used[ctx.remat[op.getTemp()].instr] = true;
+           auto rename_it = ctx.renames[block_idx].find(op.getTemp());
+           if (rename_it != ctx.renames[block_idx].end()) {
+              op.setTemp(rename_it->second);
+           } else {
+              /* prevent its defining instruction from being DCE'd if it could be rematerialized */
+              auto remat_it = ctx.remat.find(op.getTemp());
+              if (remat_it != ctx.remat.end()) {
+                 ctx.unused_remats.erase(remat_it->second.instr);
+              }
+           }
           continue;
        }
        /* the Operand is spilled: add it to reloads */
@@ -1157,7 +1210,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
       RegisterDemand new_demand = ctx.register_demand[block_idx][idx];
       new_demand.update(get_demand_before(ctx, block_idx, idx));

-      assert(!local_next_use_distance.empty());
+      assert(!ctx.local_next_use_distance.empty());

       /* if reg pressure is too high, spill variable with furthest next use */
       while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
@@ -1168,15 +1221,13 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
          if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)
             type = RegType::vgpr;

-         for (std::pair<Temp, uint32_t> pair : local_next_use_distance[idx]) {
+         for (std::pair<Temp, uint32_t> pair : ctx.local_next_use_distance[idx]) {
            if (pair.first.type() != type)
               continue;
            bool can_rematerialize = ctx.remat.count(pair.first);
            if (((pair.second > distance && can_rematerialize == do_rematerialize) ||
                 (can_rematerialize && !do_rematerialize && pair.second > idx)) &&
-               current_spills.find(pair.first) == current_spills.end() &&
-               ctx.spills_exit[block_idx].find(pair.first) ==
-                  ctx.spills_exit[block_idx].end()) {
+               !current_spills.count(pair.first)) {
              to_spill = pair.first;
              distance = pair.second;
              do_rematerialize = can_rematerialize;
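The spill-selection loop above implements a Belady-style heuristic: when pressure exceeds the target, evict the live temporary of the needed register type whose next use is furthest away, preferring values that can be rematerialized instead of stored. A distilled sketch of the core selection, using plain maps instead of the spill_ctx structures:

#include <cstdint>
#include <cstdio>
#include <map>

int main()
{
   /* hypothetical next-use distances for live temps; larger = used later */
   std::map<int, uint32_t> next_use{{1, 4}, {2, 17}, {3, 9}};
   std::map<int, uint32_t> current_spills{{3, 0}}; /* temp 3 is already spilled */

   int to_spill = -1;
   uint32_t distance = 0;
   for (auto& [temp, dist] : next_use) {
      /* skip already-spilled temps; pick the furthest next use */
      if (!current_spills.count(temp) && dist > distance) {
         to_spill = temp;
         distance = dist;
      }
   }
   printf("spill temp %d (next use in %u instructions)\n", to_spill, distance);
   return 0;
}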
@@ -1189,14 +1240,14 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
          /* add interferences with currently spilled variables */
          for (std::pair<Temp, uint32_t> pair : current_spills)
             ctx.add_interference(spill_id, pair.second);
-         for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads)
+         for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads)
            ctx.add_interference(spill_id, pair.second.second);

          current_spills[to_spill] = spill_id;
          spilled_registers += to_spill;

          /* rename if necessary */
-         if (ctx.renames[block_idx].find(to_spill) != ctx.renames[block_idx].end()) {
+         if (ctx.renames[block_idx].count(to_spill)) {
            to_spill = ctx.renames[block_idx][to_spill];
          }
@@ -1210,7 +1261,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
       }

       /* add reloads and instruction to new instructions */
-      for (std::pair<Temp, std::pair<Temp, uint32_t>> pair : reloads) {
+      for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads) {
         aco_ptr<Instruction> reload =
            do_reload(ctx, pair.second.first, pair.first, pair.second.second);
         instructions.emplace_back(std::move(reload));
@@ -1220,7 +1271,6 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block,
    }

    block->instructions = std::move(instructions);
-   ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
 }

 void
@@ -1244,21 +1294,22 @@ spill_block(spill_ctx& ctx, unsigned block_idx)
       add_coupling_code(ctx, block, block_idx);
    }

-   std::map<Temp, uint32_t> current_spills = ctx.spills_entry[block_idx];
+   const auto& current_spills = ctx.spills_entry[block_idx];

    /* check conditions to process this block */
    bool process = (block->register_demand - spilled_registers).exceeds(ctx.target_pressure) ||
-                  !ctx.renames[block_idx].empty() || ctx.remat_used.size();
+                  !ctx.renames[block_idx].empty() || ctx.unused_remats.size();

    for (auto it = current_spills.begin(); !process && it != current_spills.end(); ++it) {
-      if (ctx.next_use_distances_start[block_idx][it->first].first == block_idx)
+      if (ctx.next_use_distances_start[block_idx].at(it->first).first == block_idx)
         process = true;
    }

-   if (process)
-      process_block(ctx, block_idx, block, current_spills, spilled_registers);
-   else
-      ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end());
+   assert(ctx.spills_exit[block_idx].empty());
+   ctx.spills_exit[block_idx] = current_spills;
+   if (process) {
+      process_block(ctx, block_idx, block, spilled_registers);
+   }

    ctx.processed[block_idx] = true;
@@ -1754,7 +1805,7 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
             reload->definitions[0] = (*it)->definitions[0];
             instructions.emplace_back(aco_ptr<Instruction>(reload));
          }
-      } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) {
+      } else if (!ctx.unused_remats.count(it->get())) {
         instructions.emplace_back(std::move(*it));
      }
    }
diff --git a/mesa 3D driver/src/amd/compiler/aco_statistics.cpp b/mesa 3D driver/src/amd/compiler/aco_statistics.cpp
index ce114e3f87..8ccb5198b0 100644
--- a/mesa 3D driver/src/amd/compiler/aco_statistics.cpp
+++ b/mesa 3D driver/src/amd/compiler/aco_statistics.cpp
@@ -473,6 +473,15 @@ collect_preasm_stats(Program* program)
    double usage[(int)BlockCycleEstimator::resource_count] = {0};
    std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);

+   if (program->stage.has(SWStage::VS) && program->info->vs.has_prolog) {
+      unsigned vs_input_latency = 320;
+      for (Definition def : program->vs_inputs) {
+         blocks[0].vm.push_back(vs_input_latency);
+         for (unsigned i = 0; i < def.size(); i++)
+            blocks[0].reg_available[def.physReg().reg() + i] = vs_input_latency;
+      }
+   }
+
    for (Block& block : program->blocks) {
       BlockCycleEstimator& block_est = blocks[block.index];
       for (unsigned pred : block.linear_preds)
diff --git a/mesa 3D driver/src/amd/compiler/aco_validate.cpp b/mesa 3D driver/src/amd/compiler/aco_validate.cpp
index af1393ba41..0badb3ce17 100644
--- a/mesa 3D driver/src/amd/compiler/aco_validate.cpp
+++ b/mesa 3D driver/src/amd/compiler/aco_validate.cpp
@@ -163,8 +163,34 @@ validate_ir(Program* program)
                check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
                         program->chip_class >= GFX9,
                      "SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
+            } else {
+               const Definition& def = instr->definitions[0];
+               check(def.bytes() <= 4, "SDWA definitions must not be larger than 4 bytes",
+                     instr.get());
+               check(def.bytes() >= sdwa.dst_sel.size() + sdwa.dst_sel.offset(),
+                     "SDWA definition selection size must be at most definition size", instr.get());
+               check(
+                  sdwa.dst_sel.size() == 1 || sdwa.dst_sel.size() == 2 || sdwa.dst_sel.size() == 4,
+                  "SDWA definition selection size must be 1, 2 or 4 bytes", instr.get());
+               check(sdwa.dst_sel.offset() % sdwa.dst_sel.size() == 0, "Invalid selection offset",
+                     instr.get());
+               check(def.bytes() == 4 || def.bytes() == sdwa.dst_sel.size(),
+                     "SDWA dst_sel size must be definition size for subdword definitions",
+                     instr.get());
+               check(def.bytes() == 4 || sdwa.dst_sel.offset() == 0,
+                     "SDWA dst_sel offset must be 0 for subdword definitions", instr.get());
             }

+            for (unsigned i = 0; i < std::min<unsigned>(2, instr->operands.size()); i++) {
+               const Operand& op = instr->operands[i];
+               check(op.bytes() <= 4, "SDWA operands must not be larger than 4 bytes", instr.get());
+               check(op.bytes() >= sdwa.sel[i].size() + sdwa.sel[i].offset(),
+                     "SDWA operand selection size must be at most operand size", instr.get());
+               check(sdwa.sel[i].size() == 1 || sdwa.sel[i].size() == 2 || sdwa.sel[i].size() == 4,
+                     "SDWA operand selection size must be 1, 2 or 4 bytes", instr.get());
+               check(sdwa.sel[i].offset() % sdwa.sel[i].size() == 0, "Invalid selection offset",
+                     instr.get());
+            }
             if (instr->operands.size() >= 3) {
                check(instr->operands[2].isFixed() && instr->operands[2].physReg() == vcc,
                      "3rd operand must be fixed to vcc with SDWA", instr.get());
@@ -192,10 +218,6 @@ validate_ir(Program* program)
                   (instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16);
                check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
-
-               if (instr->definitions[0].regClass().is_subdword())
-                  check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()),
-                        "Unexpected SDWA sel for sub-dword definition", instr.get());
             }

             /* check opsel */
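The new `check()` calls above encode one rule: an SDWA selection is a 1-, 2- or 4-byte window, naturally aligned to its own size, that must fit inside an at-most-4-byte operand or definition. The same predicate can be stated standalone; `Sel` here is only a stand-in for the real `SubdwordSel` class and assumes nothing beyond its `offset()`/`size()` accessors:

#include <cstdio>

struct Sel { unsigned offset, size; }; /* byte window, mirroring SubdwordSel's offset()/size() */

static bool sel_is_valid(Sel sel, unsigned reg_bytes)
{
   return reg_bytes <= 4 &&                                     /* SDWA regs are at most a dword */
          (sel.size == 1 || sel.size == 2 || sel.size == 4) &&  /* byte, word or dword window */
          sel.offset % sel.size == 0 &&                         /* naturally aligned */
          sel.offset + sel.size <= reg_bytes;                   /* fits inside the register */
}

int main()
{
   printf("%d\n", sel_is_valid({2, 1}, 4)); /* ubyte2 of a dword: accepted */
   printf("%d\n", sel_is_valid({1, 2}, 4)); /* misaligned word: rejected */
   return 0;
}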
@@ -273,7 +295,7 @@ validate_ir(Program* program)
             if (instr->isSDWA())
                scalar_mask = program->chip_class >= GFX9 ? 0x7 : 0x4;
             else if (instr->isDPP())
-               scalar_mask = 0x0;
+               scalar_mask = 0x4;

             if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
                 instr->opcode == aco_opcode::v_readlane_b32 ||
@@ -400,12 +422,20 @@ validate_ir(Program* program)
             for (unsigned i = 0; i < instr->operands.size(); i++) {
                check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
                      "Operand and Definition size must match", instr.get());
-               if (instr->operands[i].isTemp())
+               if (instr->operands[i].isTemp()) {
                   check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) ||
                            (instr->definitions[i].getTemp().type() == RegType::vgpr &&
                             instr->operands[i].regClass().type() == RegType::sgpr),
                         "Operand and Definition types do not match", instr.get());
+                  check(instr->definitions[i].regClass().is_linear_vgpr() ==
+                           instr->operands[i].regClass().is_linear_vgpr(),
+                        "Operand and Definition types do not match", instr.get());
+               } else {
+                  check(!instr->definitions[i].regClass().is_linear_vgpr(),
+                        "Can only copy linear VGPRs into linear VGPRs, not constant/undef",
+                        instr.get());
+               }
             }
          } else if (instr->opcode == aco_opcode::p_phi) {
             check(instr->operands.size() == block.logical_preds.size(),
@@ -436,22 +466,29 @@ validate_ir(Program* program)
                      instr->operands[0].getTemp().type() == RegType::sgpr,
                   "Can't extract/insert VGPR to SGPR", instr.get());

-            if (instr->operands[0].getTemp().type() == RegType::vgpr)
+            if (instr->opcode == aco_opcode::p_insert)
                check(instr->operands[0].bytes() == instr->definitions[0].bytes(),
-                     "Sizes of operand and definition must match", instr.get());
+                     "Sizes of p_insert data operand and definition must match", instr.get());

             if (instr->definitions[0].getTemp().type() == RegType::sgpr)
                check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
                         instr->definitions[1].physReg() == scc,
-                     "SGPR extract/insert needs a SCC definition", instr.get());
+                     "SGPR extract/insert needs an SCC definition", instr.get());

-            check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16),
-                  "Size must be 8 or 16", instr.get());
-            check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u,
-                  "Size must be smaller than source", instr.get());
+            unsigned data_bits = instr->operands[0].getTemp().bytes() * 8u;
+            unsigned op_bits = instr->operands[2].constantValue();

-            unsigned comp =
-               instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
+            if (instr->opcode == aco_opcode::p_insert) {
+               check(op_bits == 8 || op_bits == 16, "Size must be 8 or 16", instr.get());
+               check(op_bits < data_bits, "Size must be smaller than source", instr.get());
+            } else if (instr->opcode == aco_opcode::p_extract) {
+               check(op_bits == 8 || op_bits == 16 || op_bits == 32,
+                     "Size must be 8 or 16 or 32", instr.get());
+               check(data_bits >= op_bits, "Can't extract more bits than what the data has.",
+                     instr.get());
+            }
+
+            unsigned comp = data_bits / MAX2(op_bits, 1);
             check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
                   instr.get());
          }
@@ -680,10 +717,9 @@ validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& instr, unsigned index,
       return byte == 0;
    if (instr->isPseudo() && chip >= GFX8)
       return true;
-   if (instr->isSDWA()) {
-      unsigned sel = instr->sdwa().sel[index] & sdwa_asuint;
-      return (sel & sdwa_isra) && (sel & sdwa_rasize) <= op.bytes();
-   }
+   if (instr->isSDWA())
+      return byte + instr->sdwa().sel[index].offset() + instr->sdwa().sel[index].size() <= 4 &&
+             byte % instr->sdwa().sel[index].size() == 0;
    if (byte == 2 && can_use_opsel(chip, instr->opcode, index, 1))
       return true;
@@ -733,8 +769,9 @@ validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& instr)
    if (instr->isPseudo() && chip >= GFX8)
       return true;

-   if (instr->isSDWA() && instr->sdwa().dst_sel == (sdwa_isra | def.bytes()))
-      return true;
+   if (instr->isSDWA())
+      return byte + instr->sdwa().dst_sel.offset() + instr->sdwa().dst_sel.size() <= 4 &&
+             byte % instr->sdwa().dst_sel.size() == 0;
    if (byte == 2 && can_use_opsel(chip, instr->opcode, -1, 1))
       return true;
@@ -763,8 +800,16 @@ get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr,
    if (instr->isPseudo())
       return chip >= GFX8 ? def.bytes() : def.size() * 4u;

-   if (instr->isSDWA() && instr->sdwa().dst_sel == (sdwa_isra | def.bytes()))
-      return def.bytes();
+   if (instr->isVALU()) {
+      assert(def.bytes() <= 2);
+      if (instr->isSDWA())
+         return instr->sdwa().dst_sel.size();
+
+      if (instr_is_16bit(chip, instr->opcode))
+         return 2;
+
+      return 4;
+   }

    switch (instr->opcode) {
    case aco_opcode::buffer_load_ubyte_d16:
@@ -787,20 +832,8 @@ get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr,
    case aco_opcode::global_load_short_d16_hi:
    case aco_opcode::ds_read_u8_d16_hi:
    case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
-   case aco_opcode::v_mad_f16:
-   case aco_opcode::v_mad_u16:
-   case aco_opcode::v_mad_i16:
-   case aco_opcode::v_fma_f16:
-   case aco_opcode::v_div_fixup_f16:
-   case aco_opcode::v_interp_p2_f16:
-      if (chip >= GFX9)
-         return 2;
-      break;
-   default: break;
+   default: return def.size() * 4;
    }
-
-   return MAX2(chip >= GFX10 ? def.bytes() : 4,
-               instr_info.definition_size[(int)instr->opcode] / 8u);
 }

 } /* end namespace */
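The rewritten `get_subdword_bytes_written` collapses the old opcode list into one general VALU rule: SDWA writes exactly the selected destination window, true 16-bit ALU opcodes write two bytes, and everything else clobbers the full dword. A hedged standalone restatement of that decision (the function and its flag parameters are illustrative, not the ACO signature):

#include <cstdio>

/* hypothetical mirror of the new VALU branch: how many bytes of the
 * definition a VALU instruction actually writes */
static unsigned valu_bytes_written(bool is_sdwa, unsigned dst_sel_size, bool is_16bit)
{
   if (is_sdwa)
      return dst_sel_size; /* SDWA writes exactly the selected window */
   if (is_16bit)
      return 2;            /* true 16-bit ALU ops leave the high half alone */
   return 4;               /* everything else clobbers the full dword */
}

int main()
{
   printf("sdwa:%u 16bit:%u other:%u\n", valu_bytes_written(true, 2, false),
          valu_bytes_written(false, 0, true), valu_bytes_written(false, 0, false));
   return 0;
}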
diff --git a/mesa 3D driver/src/amd/compiler/tests/helpers.cpp b/mesa 3D driver/src/amd/compiler/tests/helpers.cpp
index 7b6859c4e2..58303346a8 100644
--- a/mesa 3D driver/src/amd/compiler/tests/helpers.cpp
+++ b/mesa 3D driver/src/amd/compiler/tests/helpers.cpp
@@ -169,7 +169,7 @@ void finish_opt_test()
    aco_print_program(program.get(), output);
 }

-void finish_ra_test(ra_test_policy policy)
+void finish_ra_test(ra_test_policy policy, bool lower)
 {
    finish_program(program.get());
    if (!aco::validate_ir(program.get())) {
@@ -186,8 +186,12 @@ void finish_ra_test(ra_test_policy policy)
       return;
    }

-   finish_program(program.get());
-   aco::optimize_postRA(program.get());
+   if (lower) {
+      aco::ssa_elimination(program.get());
+      aco::lower_to_hw_instr(program.get());
+   }
+
+   aco_print_program(program.get(), output);
 }

 void finish_optimizer_postRA_test()
@@ -382,28 +386,31 @@ void print_pipeline_ir(VkDevice device, VkPipeline pipeline, VkShaderStageFlagBits stages,
    result = GetPipelineExecutableInternalRepresentationsKHR(device, &exec_info, &ir_count, ir);
    assert(result == VK_SUCCESS);

-   for (unsigned i = 0; i < ir_count; i++) {
-      if (strcmp(ir[i].name, name))
-         continue;
+   VkPipelineExecutableInternalRepresentationKHR* requested_ir = nullptr;
+   for (unsigned i = 0; i < ir_count; ++i) {
+      if (strcmp(ir[i].name, name) == 0) {
+         requested_ir = &ir[i];
+         break;
+      }
+   }
+   assert(requested_ir && "Could not find requested IR");

-      char *data = (char*)malloc(ir[i].dataSize);
-      ir[i].pData = data;
-      result = GetPipelineExecutableInternalRepresentationsKHR(device, &exec_info, &ir_count, ir);
-      assert(result == VK_SUCCESS);
+   char *data = (char*)malloc(requested_ir->dataSize);
+   requested_ir->pData = data;
+   result = GetPipelineExecutableInternalRepresentationsKHR(device, &exec_info, &ir_count, ir);
+   assert(result == VK_SUCCESS);

-      if (remove_encoding) {
-         for (char *c = data; *c; c++) {
-            if (*c == ';') {
-               for (; *c && *c != '\n'; c++)
-                  *c = ' ';
-            }
+   if (remove_encoding) {
+      for (char *c = data; *c; c++) {
+         if (*c == ';') {
+            for (; *c && *c != '\n'; c++)
+               *c = ' ';
          }
       }
-
-      fprintf(output, "%s", data);
-      free(data);
-      return;
    }
+
+   fprintf(output, "%s", data);
+   free(data);
 }

 VkShaderModule __qoCreateShaderModule(VkDevice dev, const QoShaderModuleCreateInfo *module_info)
diff --git a/mesa 3D driver/src/amd/compiler/tests/helpers.h b/mesa 3D driver/src/amd/compiler/tests/helpers.h
index ee3e68da38..7299939f32 100644
--- a/mesa 3D driver/src/amd/compiler/tests/helpers.h
+++ b/mesa 3D driver/src/amd/compiler/tests/helpers.h
@@ -79,7 +79,7 @@ bool setup_cs(const char *input_spec, enum chip_class chip_class,
 void finish_program(aco::Program *program);
 void finish_validator_test();
 void finish_opt_test();
-void finish_ra_test(aco::ra_test_policy);
+void finish_ra_test(aco::ra_test_policy, bool lower=false);
 void finish_optimizer_postRA_test();
 void finish_to_hw_instr_test();
 void finish_insert_nops_test();
diff --git a/mesa 3D driver/src/amd/compiler/tests/meson.build b/mesa 3D driver/src/amd/compiler/tests/meson.build
index f72556cf3e..7659df29aa 100644
--- a/mesa 3D driver/src/amd/compiler/tests/meson.build
+++ b/mesa 3D driver/src/amd/compiler/tests/meson.build
@@ -61,7 +61,7 @@ test(
     libamd_common, libamd_common_llvm, libvulkan_radeon,
   ],
   dependencies : [
-    dep_llvm, dep_thread, idep_aco, idep_nir, idep_mesautil, idep_vulkan_util_headers,
+    dep_llvm, dep_thread, idep_aco, idep_nir, idep_mesautil, idep_vulkan_util_headers, idep_amdgfxregs_h,
   ],
   gnu_symbol_visibility : 'hidden',
   build_by_default : true,
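The `print_pipeline_ir` refactor above follows the usual Vulkan two-call pattern: one call to learn each representation's `dataSize`, allocation by the caller, then a second call to fill `pData`, now performed only for the single IR that was requested by name. A self-contained sketch of that pattern with a stand-in query function (not the Vulkan API itself):

#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

/* stand-in for a Vulkan-style two-call query: the first call reports
 * dataSize, the second fills pData once the caller has allocated it */
struct Repr { const char* name; size_t dataSize; char* pData; };

static void query(std::vector<Repr>& irs)
{
   for (Repr& ir : irs) {
      ir.dataSize = 6;
      if (ir.pData)
         std::memcpy(ir.pData, "hello", 6);
   }
}

int main()
{
   std::vector<Repr> irs{{"asm", 0, nullptr}, {"nir", 0, nullptr}};
   query(irs); /* first call: sizes only */

   /* find the requested IR first, then fetch just that entry's payload */
   Repr* requested = nullptr;
   for (Repr& ir : irs)
      if (std::strcmp(ir.name, "asm") == 0) { requested = &ir; break; }
   assert(requested);

   std::vector<char> data(requested->dataSize);
   requested->pData = data.data();
   query(irs); /* second call fills the buffer */
   printf("%s\n", data.data());
   return 0;
}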
diff --git a/mesa 3D driver/src/amd/compiler/tests/test_optimizer.cpp b/mesa 3D driver/src/amd/compiler/tests/test_optimizer.cpp
index 54a11d399f..31a229f99e 100644
--- a/mesa 3D driver/src/amd/compiler/tests/test_optimizer.cpp
+++ b/mesa 3D driver/src/amd/compiler/tests/test_optimizer.cpp
@@ -395,73 +395,6 @@ BEGIN_TEST(optimize.add_lshl)
 }
 END_TEST

-Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
-{
-   a.set16bit(is16bit);
-   b.set16bit(is16bit);
-
-   return bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);
-}
-
-BEGIN_TEST(optimize.mad_u32_u16)
-   for (unsigned i = GFX9; i <= GFX10; i++) {
-      //>> v1: %a, v1: %b, s1: %c = p_startpgm
-      if (!setup_cs("v1 v1 s1", (chip_class)i))
-         continue;
-
-      //! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
-      //! p_unit_test 0, %res0
-      writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand::zero()));
-
-      //! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
-      //! p_unit_test 1, %res1
-      writeout(1, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero()));
-
-      //! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
-      //! p_unit_test 2, %res2
-      writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand::c32(42u), Operand::zero()));
-
-      //! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
-      //! p_unit_test 3, %res3
-      writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand::zero()));
-
-      //! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
-      //! p_unit_test 4, %res4
-      writeout(4, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[2]), Operand::zero()));
-
-      //! v1: %res5 = v_mad_u32_u16 42, %a, 0
-      //! p_unit_test 5, %res5
-      writeout(5,
-               create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero(), false));
-
-      //~gfx9! v1: %mul6 = v_mul_lo_u16 %a, %b
-      //~gfx9! v1: %res6 = v_add_u32 %mul6, %b
-      //~gfx10! v1: %mul6 = v_mul_lo_u16_e64 %a, %b
-      //~gfx10! v1: %res6 = v_add_u32 %mul6, %b
-      //! p_unit_test 6, %res6
-      Temp mul;
-      if (i >= GFX10) {
-         mul = bld.vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
-      } else {
-         mul = bld.vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
-      }
-      writeout(6, bld.vadd32(bld.def(v1), mul, inputs[1]));
-
-      //~gfx9! v1: %res7 = v_mad_u32_u16 %a, %b, %b
-      //~gfx10! v1: (nuw)%mul7 = v_mul_lo_u16_e64 %a, %b
-      //~gfx10! v1: %res7 = v_add_u32 %mul7, %b
-      //! p_unit_test 7, %res7
-      if (i >= GFX10) {
-         mul = bld.nuw().vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
-      } else {
-         mul = bld.nuw().vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
-      }
-      writeout(7, bld.vadd32(bld.def(v1), mul, inputs[1]));
-
-      finish_opt_test();
-   }
-END_TEST
-
 BEGIN_TEST(optimize.bcnt)
    for (unsigned i = GFX8; i <= GFX10; i++) {
       //>> v1: %a, s1: %b = p_startpgm
@@ -869,8 +802,7 @@ BEGIN_TEST(optimize.add_lshlrev)
       lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
       writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

-      //~gfx8! v1: %lshl5 = v_lshlrev_b32 4, (is24bit)%c
-      //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %c, %lshl5
+      //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
       //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
       //! p_unit_test 5, %res5
       Operand c_24bit = Operand(inputs[2]);
@@ -1020,3 +952,125 @@ BEGIN_TEST(optimize.denorm_propagation)
    }
 }
 END_TEST
+
+BEGIN_TEST(optimizer.dpp)
+   //>> v1: %a, v1: %b, s2: %c = p_startpgm
+   if (!setup_cs("v1 v1 s2", GFX10_3))
+      return;
+
+   Operand a(inputs[0]);
+   Operand b(inputs[1]);
+   Operand c(inputs[2]);
+
+   /* basic optimization */
+   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
+   //! p_unit_test 0, %res0
+   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
+   writeout(0, res0);
+
+   /* operand swapping */
+   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
+   //! p_unit_test 1, %res1
+   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
+   writeout(1, res1);
+
+   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
+   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
+   //! p_unit_test 2, %res2
+   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
+   writeout(2, res2);
+
+   /* modifiers */
+   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
+   //! p_unit_test 3, %res3
+   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   tmp3.instr->dpp().neg[0] = true;
+   Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
+   writeout(3, res3);
+
+   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
+   //! p_unit_test 4, %res4
+   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
+   res4.instr->vop3().neg[0] = true;
+   writeout(4, res4);
+
+   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
+   //! v1: %res5 = v_add_f32 %tmp5, %b clamp
+   //! p_unit_test 5, %res5
+   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
+   res5.instr->vop3().clamp = true;
+   writeout(5, res5);
+
+   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
+   //! p_unit_test 6, %res6
+   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   tmp6.instr->dpp().neg[0] = true;
+   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
+   res6.instr->vop3().abs[0] = true;
+   writeout(6, res6);
+
+   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
+   //! p_unit_test 7, %res7
+   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
+   res7.instr->vop3().abs[0] = true;
+   writeout(7, res7);
+
+   /* vcc */
+   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
+   //! p_unit_test 8, %res8
+   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
+   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
+   writeout(8, res8);
+
+   finish_opt_test();
+END_TEST
+
+BEGIN_TEST(optimize.dpp_prop)
+   //>> v1: %a, s1: %b = p_startpgm
+   if (!setup_cs("v1 s1", GFX10))
+      return;
+
+   //! v1: %one = p_parallelcopy 1
+   //! v1: %res0 = v_mul_f32 1, %a
+   //! p_unit_test 0, %res0
+   Temp one = bld.copy(bld.def(v1), Operand::c32(1));
+   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
+
+   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
+   //! p_unit_test 1, %res1
+   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
+
+   //! v1: %res2 = v_mul_f32 0x12345678, %a
+   //! p_unit_test 2, %res2
+   Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
+   writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
+
+   //! v1: %literal2 = p_parallelcopy 0x12345679
+   //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
+   //! p_unit_test 3, %res3
+   Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
+   writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
+
+   //! v1: %b_v = p_parallelcopy %b
+   //! v1: %res4 = v_mul_f32 %b, %a
+   //! p_unit_test 4, %res4
+   Temp b_v = bld.copy(bld.def(v1), inputs[1]);
+   writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
+
+   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
+   //! p_unit_test 5, %res5
+   writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
+
+   //! v1: %res6 = v_rcp_f32 %b
+   //! p_unit_test 6, %res6
+   writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
+
+   finish_opt_test();
+END_TEST
diff --git a/mesa 3D driver/src/amd/compiler/tests/test_optimizer_postRA.cpp b/mesa 3D driver/src/amd/compiler/tests/test_optimizer_postRA.cpp
index 9887ab60d2..f0345296fc 100644
--- a/mesa 3D driver/src/amd/compiler/tests/test_optimizer_postRA.cpp
+++ b/mesa 3D driver/src/amd/compiler/tests/test_optimizer_postRA.cpp
@@ -276,3 +276,121 @@ BEGIN_TEST(optimizer_postRA.scc_nocmp_opt)

    finish_optimizer_postRA_test();
 END_TEST
+
+BEGIN_TEST(optimizer_postRA.dpp)
+   //>> v1: %a:v[0], v1: %b:v[1], s2: %c:vcc, s2: %d:s[0-1] = p_startpgm
+   if (!setup_cs("v1 v1 s2 s2", GFX10_3))
+      return;
+
+   bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
+   bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
+   bld.instructions->at(0)->definitions[2].setFixed(vcc);
+   bld.instructions->at(0)->definitions[3].setFixed(PhysReg(0));
+
+   PhysReg reg_v0(256);
+   PhysReg reg_v2(258);
+   Operand a(inputs[0], PhysReg(256));
+   Operand b(inputs[1], PhysReg(257));
+   Operand c(inputs[2], vcc);
+   Operand d(inputs[3], PhysReg(0));
+
+   /* basic optimization */
+   //! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! p_unit_test 0, %res0:v[2]
+   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
+   writeout(0, Operand(res0, reg_v2));
+
+   /* operand swapping */
+   //! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! p_unit_test 1, %res1:v[2]
+   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2));
+   writeout(1, Operand(res1, reg_v2));
+
+   //! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1
+   //! p_unit_test 2, %res2:v[2]
+   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2), dpp_row_half_mirror);
+   writeout(2, Operand(res2, reg_v2));
+
+   /* modifiers */
+   //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! p_unit_test 3, %res3:v[2]
+   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   tmp3.instr->dpp().neg[0] = true;
+   Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b);
+   writeout(3, Operand(res3, reg_v2));
+
+   //! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! p_unit_test 4, %res4:v[2]
+   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b);
+   res4.instr->vop3().neg[0] = true;
+   writeout(4, Operand(res4, reg_v2));
+
+   //! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp
+   //! p_unit_test 5, %res5:v[2]
+   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp5, reg_v2), b);
+   res5.instr->vop3().clamp = true;
+   writeout(5, Operand(res5, reg_v2));
+
+   //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1
+   //! p_unit_test 6, %res6:v[2]
+   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   tmp6.instr->dpp().neg[0] = true;
+   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp6, reg_v2), b);
+   res6.instr->vop3().abs[0] = true;
+   writeout(6, Operand(res6, reg_v2));
+
+   //! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1
+   //! p_unit_test 7, %res7:v[2]
+   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2));
+   res7.instr->vop3().abs[0] = true;
+   writeout(7, Operand(res7, reg_v2));
+
+   /* vcc */
+   //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1
+   //! p_unit_test 8, %res8:v[2]
+   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c);
+   writeout(8, Operand(res8, reg_v2));
+
+   //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1]
+   //! p_unit_test 9, %res9:v[2]
+   Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   Temp res9 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp9, reg_v2), b, d);
+   writeout(9, Operand(res9, reg_v2));
+
+   /* control flow */
+   //! BB1
+   //! /* logical preds: / linear preds: BB0, / kind: uniform, */
+   //! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+   //! p_unit_test 10, %res10:v[2]
+   Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+
+   bld.reset(program->create_and_insert_block());
+   program->blocks[0].linear_succs.push_back(1);
+   program->blocks[1].linear_preds.push_back(0);
+
+   Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp10, reg_v2), b);
+   writeout(10, Operand(res10, reg_v2));
+
+   /* can't combine if the v_mov_b32's operand is modified */
+   //! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+   //! v1: %tmp11_2:v[0] = v_mov_b32 0
+   //! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1]
+   //! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0]
+   Temp tmp11_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+   Temp tmp11_2 = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1, reg_v0), Operand::c32(0));
+   Temp res11 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp11_1, reg_v2), b);
+   writeout(11, Operand(res11, reg_v2), Operand(tmp11_2, reg_v0));
+
+   finish_optimizer_postRA_test();
+END_TEST
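The DPP tests above all exercise one transform: a `v_mov_b32` carrying DPP modifiers is folded into its single VALU consumer, swapping commutative-style operands (v_sub becomes v_subrev) when the moved value sits in the second source slot, and bailing out when control flow or a clobbered source intervenes. A toy model of just the operand-swap half of that rule (opcode names match the tests; the data model is invented):

#include <cassert>
#include <cstdio>
#include <string>
#include <utility>

struct Inst { std::string opcode; int src0, src1; };

/* fold a DPP-producing mov into its consumer: the DPP value must end up
 * in src0, the only slot that can carry DPP on these encodings */
static bool fold_dpp_mov(Inst& use, int dpp_tmp)
{
   if (use.src0 == dpp_tmp)
      return true; /* already in the DPP-capable slot */
   if (use.src1 != dpp_tmp)
      return false;
   if (use.opcode == "v_add_f32") {
      std::swap(use.src0, use.src1); /* commutative: plain swap */
   } else if (use.opcode == "v_sub_f32") {
      std::swap(use.src0, use.src1);
      use.opcode = "v_subrev_f32";   /* swap plus reversed opcode */
   } else {
      return false;
   }
   return true;
}

int main()
{
   Inst sub{"v_sub_f32", /*src0=*/1, /*src1=*/2};
   assert(fold_dpp_mov(sub, 2));
   printf("%s src0=%d src1=%d\n", sub.opcode.c_str(), sub.src0, sub.src1);
   return 0;
}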
diff --git a/mesa 3D driver/src/amd/compiler/tests/test_regalloc.cpp b/mesa 3D driver/src/amd/compiler/tests/test_regalloc.cpp
index 52449a41b8..dcae5760c6 100644
--- a/mesa 3D driver/src/amd/compiler/tests/test_regalloc.cpp
+++ b/mesa 3D driver/src/amd/compiler/tests/test_regalloc.cpp
@@ -46,7 +46,7 @@ BEGIN_TEST(regalloc.subdword_alloc.reuse_16bit_operands)
    //! v2b: %_:v[#a][0:16], v2b: %res1:v[#a][16:32] = p_split_vector %_:v[#a]
    Builder::Result tmp =
       bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]);

-   //! v1: %_:v[#b] = v_cvt_f32_f16 %_:v[#a][16:32]
+   //! v1: %_:v[#b] = v_cvt_f32_f16 %_:v[#a][16:32] dst_sel:dword src0_sel:uword1
    //! v1: %_:v[#a] = v_cvt_f32_f16 %_:v[#a][0:16]
    //; success = (b != a)
    auto result1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), tmp.def(1).getTemp());
@@ -69,7 +69,7 @@ BEGIN_TEST(regalloc.32bit_partial_write)
    Temp hi = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]).def(1).getTemp();

    /* This test checks if this instruction uses SDWA. */
-   //! v2b: %_:v[0][0:16] = v_not_b32 0 dst_preserve
+   //! v2b: %_:v[0][0:16] = v_not_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword
    Temp lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v2b), Operand::zero());

    //! v1: %_:v[0] = p_create_vector %_:v[0][0:16], %_:v[0][16:32]
@@ -147,3 +147,142 @@ BEGIN_TEST(regalloc.precolor.vector.collect)

    finish_ra_test(ra_test_policy());
 END_TEST
+
+BEGIN_TEST(regalloc.scratch_sgpr.create_vector)
+   if (!setup_cs("v1 s1", GFX7))
+      return;
+
+   Temp tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), inputs[0], Operand::zero());
+
+   //>> v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
+   //! s1: %0:s[1] = s_mov_b32 0x1000001
+   //! v1: %0:v[0] = v_mul_lo_u32 %0:s[1], %_:v[0][0:8]
+   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), Operand(v3b), Operand(tmp));
+
+   //! p_unit_test %_:s[0]
+   //! s_endpgm
+   bld.pseudo(aco_opcode::p_unit_test, inputs[1]);
+
+   finish_ra_test(ra_test_policy(), true);
+END_TEST
+
+BEGIN_TEST(regalloc.scratch_sgpr.create_vector_sgpr_operand)
+   if (!setup_cs("v2 s1", GFX7))
+      return;
+
+   Temp tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), inputs[0], Operand::c32(4u));
+
+   //>> v1: %0:v[0] = v_mov_b32 %_:s[0]
+   //! v3b: %0:v[1][0:24] = v_and_b32 0xffffff, %0:v[1][0:24]
+   //! s1: %0:s[1] = s_mov_b32 0x1000001
+   //! v1: %0:v[1] = v_mul_lo_u32 %0:s[1], %_:v[1][0:8]
+   bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inputs[1], Operand(v3b), Operand(tmp));
+
+   //! p_unit_test %_:s[0]
+   //! s_endpgm
+   bld.pseudo(aco_opcode::p_unit_test, inputs[1]);
+
+   finish_ra_test(ra_test_policy(), true);
+END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.fixed_def)
+   //>> p_startpgm
+   if (!setup_cs("", GFX10))
+      return;
+
+   PhysReg reg_v0{256};
+   PhysReg reg_v1{257};
+
+   //! lv1: %tmp1:v[0] = p_unit_test
+   Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0));
+
+   //! lv1: %tmp2:v[1] = p_parallelcopy %tmp1:v[0]
+   //! v1: %_:v[0] = p_unit_test
+   bld.pseudo(aco_opcode::p_unit_test, Definition(reg_v0, v1));
+
+   //! p_unit_test %tmp2:v[1]
+   bld.pseudo(aco_opcode::p_unit_test, tmp);
+
+   finish_ra_test(ra_test_policy());
+END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_impl)
+   //>> p_startpgm
+   if (!setup_cs("", GFX10))
+      return;
+
+   program->dev.vgpr_limit = 3;
+
+   PhysReg reg_v1{257};
+
+   //! s1: %scc_tmp:scc, s1: %1:s[0] = p_unit_test
+   Temp s0_tmp = bld.tmp(s1);
+   Temp scc_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s1, scc),
+                             Definition(s0_tmp.id(), PhysReg{0}, s1));
+
+   //! lv1: %tmp1:v[1] = p_unit_test
+   Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v1));
+
+   //! lv1: %tmp2:v[2] = p_parallelcopy %tmp1:v[1]
+   //! v2: %_:v[0-1] = p_unit_test
+   bld.pseudo(aco_opcode::p_unit_test, bld.def(v2));
+
+   //! p_unit_test %tmp2:v[2], %scc_tmp:scc, %1:s[0]
+   bld.pseudo(aco_opcode::p_unit_test, tmp, scc_tmp, s0_tmp);
+
+   finish_ra_test(ra_test_policy());
+
+   //>> lv1: %5:v[2] = p_parallelcopy %3:v[1] scc:1 scratch:s1
+   Pseudo_instruction& parallelcopy = program->blocks[0].instructions[3]->pseudo();
+   aco_print_instr(&parallelcopy, output);
+   fprintf(output, " scc:%u scratch:s%u\n", parallelcopy.tmp_in_scc, parallelcopy.scratch_sgpr.reg());
+END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_regs_for_copies)
+   //>> p_startpgm
+   if (!setup_cs("", GFX10))
+      return;
+
+   program->dev.vgpr_limit = 6;
+
+   PhysReg reg_v2{258};
+   PhysReg reg_v4{260};
+
+   //! lv1: %lin_tmp1:v[4] = p_unit_test
+   Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v4));
+   //! v2: %log_tmp1:v[2-3] = p_unit_test
+   Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v2, reg_v2));
+
+   //! lv1: %lin_tmp2:v[0], v2: %log_tmp2:v[4-5] = p_parallelcopy %lin_tmp1:v[4], %log_tmp1:v[2-3]
+   //! v3: %_:v[1-3] = p_unit_test
+   bld.pseudo(aco_opcode::p_unit_test, bld.def(v3));
+
+   //! p_unit_test %log_tmp2:v[4-5], %lin_tmp2:v[0]
+   bld.pseudo(aco_opcode::p_unit_test, log_tmp, lin_tmp);
+
+   finish_ra_test(ra_test_policy());
+END_TEST
+
+BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_create_vector)
+   //>> p_startpgm
+   if (!setup_cs("", GFX10))
+      return;
+
+   program->dev.vgpr_limit = 4;
+
+   PhysReg reg_v0{256};
+   PhysReg reg_v1{257};
+
+   //! lv1: %lin_tmp1:v[0] = p_unit_test
+   Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0));
+   //! v1: %log_tmp:v[1] = p_unit_test
+   Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1, reg_v1));
+
+   //! lv1: %lin_tmp2:v[2] = p_parallelcopy %lin_tmp1:v[0]
+   //! v2: %_:v[0-1] = p_create_vector v1: undef, %log_tmp:v[1]
+   bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(v1), log_tmp);
+
+   //! p_unit_test %lin_tmp2:v[2]
+   bld.pseudo(aco_opcode::p_unit_test, lin_tmp);
+
+   finish_ra_test(ra_test_policy());
+END_TEST
diff --git a/mesa 3D driver/src/amd/compiler/tests/test_sdwa.cpp b/mesa 3D driver/src/amd/compiler/tests/test_sdwa.cpp
index 6a16700ccf..73a9a43b8b 100644
--- a/mesa 3D driver/src/amd/compiler/tests/test_sdwa.cpp
+++ b/mesa 3D driver/src/amd/compiler/tests/test_sdwa.cpp
@@ -37,13 +37,11 @@ BEGIN_TEST(validate.sdwa.allow)
       SDWA_instruction *sdwa =
          &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa();
       sdwa->neg[0] = sdwa->neg[1] = sdwa->abs[0] = sdwa->abs[1] = true;

-      sdwa = &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa();
-      sdwa->dst_preserve = true;
-      sdwa->dst_sel = sdwa_ubyte0;
+      sdwa = &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1b), inputs[0], inputs[1]).instr->sdwa();

       sdwa = &bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa();
-      sdwa->sel[0] = sdwa_sbyte2;
-      sdwa->sel[1] = sdwa_uword1;
+      sdwa->sel[0] = SubdwordSel::sbyte2;
+      sdwa->sel[1] = SubdwordSel::uword1;

       finish_validator_test();
    }
@@ -56,7 +54,7 @@ BEGIN_TEST(validate.sdwa.support)
       continue;

      //>> Validation results:
-     //~gfx7! SDWA is GFX8+ only: v1: %t0 = v_mul_f32 %a, %b
+     //~gfx7! SDWA is GFX8+ only: v1: %t0 = v_mul_f32 %a, %b dst_sel:dword src0_sel:dword src1_sel:dword
      //~gfx7! Validation failed
      //~gfx([89]|10)! Validation passed
      bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]);
@@ -72,19 +70,19 @@ BEGIN_TEST(validate.sdwa.operands)
      continue;

     //>> Validation results:
-    //~gfx8! Wrong source position for SGPR argument: v1: %_ = v_mul_f32 %sgpr0, %vgpr1
-    //~gfx8! Wrong source position for SGPR argument: v1: %_ = v_mul_f32 %vgpr0, %sgpr1
+    //~gfx8! Wrong source position for SGPR argument: v1: %_ = v_mul_f32 %sgpr0, %vgpr1 dst_sel:dword src0_sel:dword src1_sel:dword
+    //~gfx8! Wrong source position for SGPR argument: v1: %_ = v_mul_f32 %vgpr0, %sgpr1 dst_sel:dword src0_sel:dword src1_sel:dword
     bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[2], inputs[1]);
     bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[3]);

-    //~gfx8! Wrong source position for constant argument: v1: %_ = v_mul_f32 4, %vgpr1
-    //~gfx8! Wrong source position for constant argument: v1: %_ = v_mul_f32 %vgpr0, 4
+    //~gfx8! Wrong source position for constant argument: v1: %_ = v_mul_f32 4, %vgpr1 dst_sel:dword src0_sel:dword src1_sel:dword
+    //~gfx8! Wrong source position for constant argument: v1: %_ = v_mul_f32 %vgpr0, 4 dst_sel:dword src0_sel:dword src1_sel:dword
     bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(4u), inputs[1]);
     bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], Operand::c32(4u));

-    //! Literal applied on wrong instruction format: v1: %_ = v_mul_f32 0x1234, %vgpr1
-    //! Literal applied on wrong instruction format: v1: %_ = v_mul_f32 %vgpr0, 0x1234
-    //! Wrong source position for Literal argument: v1: %_ = v_mul_f32 %vgpr0, 0x1234
+    //! Literal applied on wrong instruction format: v1: %_ = v_mul_f32 0x1234, %vgpr1 dst_sel:dword src0_sel:dword src1_sel:dword
+    //! Literal applied on wrong instruction format: v1: %_ = v_mul_f32 %vgpr0, 0x1234 dst_sel:dword src0_sel:dword src1_sel:dword
+    //! Wrong source position for Literal argument: v1: %_ = v_mul_f32 %vgpr0, 0x1234 dst_sel:dword src0_sel:dword src1_sel:dword
     bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x1234u), inputs[1]);
     bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], Operand::c32(0x1234u));
@@ -103,10 +101,10 @@ BEGIN_TEST(validate.sdwa.vopc)

     bld.vopc_sdwa(aco_opcode::v_cmp_gt_f32, bld.def(bld.lm, vcc), inputs[0], inputs[1]);

-    //~gfx8! SDWA+VOPC definition must be fixed to vcc on GFX8: s2: %_ = v_cmp_lt_f32 %vgpr0, %vgpr1
+    //~gfx8! SDWA+VOPC definition must be fixed to vcc on GFX8: s2: %_ = v_cmp_lt_f32 %vgpr0, %vgpr1 src0_sel:dword src1_sel:dword
     bld.vopc_sdwa(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), inputs[0], inputs[1]);

-    //~gfx(9|10)! SDWA VOPC clamp only supported on GFX8: s2: %_:vcc = v_cmp_eq_f32 %vgpr0, %vgpr1 clamp
+    //~gfx(9|10)! SDWA VOPC clamp only supported on GFX8: s2: %_:vcc = v_cmp_eq_f32 %vgpr0, %vgpr1 clamp src0_sel:dword src1_sel:dword
     bld.vopc_sdwa(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm, vcc), inputs[0], inputs[1]).instr->sdwa().clamp = true;

     //! Validation failed
@@ -122,7 +120,7 @@ BEGIN_TEST(validate.sdwa.omod)
     continue;

    //>> Validation results:
-   //~gfx8! SDWA omod only supported on GFX9+: v1: %_ = v_mul_f32 %vgpr0, %vgpr1 *2
+   //~gfx8! SDWA omod only supported on GFX9+: v1: %_ = v_mul_f32 %vgpr0, %vgpr1 *2 dst_sel:dword src0_sel:dword src1_sel:dword
    //~gfx8! Validation failed
    //~gfx(9|10)! Validation passed
    bld.vop2_sdwa(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]).instr->sdwa().omod = 1;
@@ -138,11 +136,11 @@ BEGIN_TEST(validate.sdwa.vcc)
    continue;

   //>> Validation results:
-  //! 3rd operand must be fixed to vcc with SDWA: v1: %_ = v_cndmask_b32 %vgpr0, %vgpr1, %_
+  //! 3rd operand must be fixed to vcc with SDWA: v1: %_ = v_cndmask_b32 %vgpr0, %vgpr1, %_ dst_sel:dword src0_sel:dword src1_sel:dword
   bld.vop2_sdwa(aco_opcode::v_cndmask_b32, bld.def(v1), inputs[0], inputs[1], inputs[2]);
   bld.vop2_sdwa(aco_opcode::v_cndmask_b32, bld.def(v1), inputs[0], inputs[1], bld.vcc(inputs[2]));

-  //! 2nd definition must be fixed to vcc with SDWA: v1: %_, s2: %_ = v_add_co_u32 %vgpr0, %vgpr1
+  //! 2nd definition must be fixed to vcc with SDWA: v1: %_, s2: %_ = v_add_co_u32 %vgpr0, %vgpr1 dst_sel:dword src0_sel:dword src1_sel:dword
   bld.vop2_sdwa(aco_opcode::v_add_co_u32, bld.def(v1), bld.def(bld.lm), inputs[0], inputs[1]);
   bld.vop2_sdwa(aco_opcode::v_add_co_u32, bld.def(v1), bld.def(bld.lm, vcc), inputs[0], inputs[1]);
@@ -159,53 +157,51 @@ BEGIN_TEST(optimize.sdwa.extract)
      if (!setup_cs("v1 v1 s1 s1", (chip_class)i, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned"))
         continue;

-     //; funcs['b'] = lambda bits: ('sext(%%b)[%s]' if variant.endswith('_signed') else '%%b[%s]') % bits
-
-     //; def standard_test(index, offset, size):
-     //;    res = 'v1: %%res%d = v_mul_f32 %%a, @b(%d:%d)\n' % (index, offset % 32, offset % 32 + size % 32 - 1)
-     //;    res += 'p_unit_test %d, %%res%d' % (index, index)
+     //; def standard_test(index, sel):
+     //;    res = 'v1: %%res%s = v_mul_f32 %%a, %%b dst_sel:dword src0_sel:dword src1_sel:%c%s\n' % (index, 's' if variant.endswith('_signed') else 'u', sel)
+     //;    res += 'p_unit_test %s, %%res%s' % (index, index)
      //;    return res
-     //; funcs['standard_test'] = lambda a: standard_test(*(int(v) for v in a.split(',')))
+     //; funcs['standard_test'] = lambda a: standard_test(*(v for v in a.split(',')))

      aco_opcode ext = aco_opcode::p_extract;
      aco_opcode ins = aco_opcode::p_insert;

      {
-        //~gfx[^7].*! @standard_test(0, 0, 8)
+        //~gfx[^7].*! @standard_test(0,byte0)
         Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(),
                                       Operand::c32(8u), Operand::c32(is_signed));
         writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte0_b));

-        //~gfx[^7].*! @standard_test(1, 8, 8)
+        //~gfx[^7].*! @standard_test(1,byte1)
         Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u),
                                       Operand::c32(8u), Operand::c32(is_signed));
         writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte1_b));

-        //~gfx[^7].*! @standard_test(2, 16, 8)
+        //~gfx[^7].*! @standard_test(2,byte2)
         Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u),
                                       Operand::c32(8u), Operand::c32(is_signed));
         writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte2_b));

-        //~gfx[^7].*! @standard_test(3, 24, 8)
+        //~gfx[^7].*! @standard_test(3,byte3)
         Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u),
                                       Operand::c32(8u), Operand::c32(is_signed));
         writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_byte3_b));

-        //~gfx[^7].*! @standard_test(4, 0, 16)
+        //~gfx[^7].*! @standard_test(4,word0)
         Temp bfe_word0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(),
                                       Operand::c32(16u), Operand::c32(is_signed));
         writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word0_b));

-        //~gfx[^7].*! @standard_test(5, 16, 16)
+        //~gfx[^7].*! @standard_test(5,word1)
         Temp bfe_word1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u),
                                       Operand::c32(16u), Operand::c32(is_signed));
         writeout(5, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfe_word1_b));

-        //~gfx[^7]_unsigned! @standard_test(6, 0, 8)
+        //~gfx[^7]_unsigned! @standard_test(6,byte0)
         Temp bfi_byte0_b =
            bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u));
         writeout(6, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_byte0_b));

-        //~gfx[^7]_unsigned! @standard_test(7, 0, 16)
+        //~gfx[^7]_unsigned! @standard_test(7,word0)
         Temp bfi_word0_b =
            bld.pseudo(ins, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(16u));
         writeout(7, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], bfi_word0_b));
@@ -225,7 +221,7 @@ BEGIN_TEST(optimize.sdwa.extract)
         /* v_cvt_f32_ubyte[0-3] can be used instead of v_cvt_f32_u32+sdwa */
         //~gfx7_signed! v1: %bfe_byte0_b = p_extract %b, 0, 8, 1
         //~gfx7_signed! v1: %res9 = v_cvt_f32_u32 %bfe_byte0_b
-        //~gfx[^7]+_signed! v1: %res9 = v_cvt_f32_u32 @b(0:7)
+        //~gfx[^7]+_signed! v1: %res9 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte0
         //~gfx\d+_unsigned! v1: %res9 = v_cvt_f32_ubyte0 %b
         //! p_unit_test 9, %res9
         Temp bfe_byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u),
                                       Operand::c32(is_signed));
@@ -234,7 +230,7 @@ BEGIN_TEST(optimize.sdwa.extract)

         //~gfx7_signed! v1: %bfe_byte1_b = p_extract %b, 1, 8, 1
         //~gfx7_signed! v1: %res10 = v_cvt_f32_u32 %bfe_byte1_b
-        //~gfx[^7]+_signed! v1: %res10 = v_cvt_f32_u32 @b(8:15)
+        //~gfx[^7]+_signed! v1: %res10 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte1
         //~gfx\d+_unsigned! v1: %res10 = v_cvt_f32_ubyte1 %b
         //! p_unit_test 10, %res10
         Temp bfe_byte1_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(1u), Operand::c32(8u),
                                       Operand::c32(is_signed));
@@ -243,7 +239,7 @@ BEGIN_TEST(optimize.sdwa.extract)

         //~gfx7_signed! v1: %bfe_byte2_b = p_extract %b, 2, 8, 1
         //~gfx7_signed! v1: %res11 = v_cvt_f32_u32 %bfe_byte2_b
-        //~gfx[^7]+_signed! v1: %res11 = v_cvt_f32_u32 @b(16:23)
+        //~gfx[^7]+_signed! v1: %res11 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte2
         //~gfx\d+_unsigned! v1: %res11 = v_cvt_f32_ubyte2 %b
         //! p_unit_test 11, %res11
         Temp bfe_byte2_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(2u), Operand::c32(8u),
                                       Operand::c32(is_signed));
@@ -252,7 +248,7 @@ BEGIN_TEST(optimize.sdwa.extract)

         //~gfx7_signed! v1: %bfe_byte3_b = p_extract %b, 3, 8, 1
         //~gfx7_signed! v1: %res12 = v_cvt_f32_u32 %bfe_byte3_b
-        //~gfx[^7]+_signed! v1: %res12 = v_cvt_f32_u32 @b(24:31)
+        //~gfx[^7]+_signed! v1: %res12 = v_cvt_f32_u32 %b dst_sel:dword src0_sel:sbyte3
         //~gfx\d+_unsigned! v1: %res12 = v_cvt_f32_ubyte3 %b
         //! p_unit_test 12, %res12
         Temp bfe_byte3_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::c32(3u), Operand::c32(8u),
                                       Operand::c32(is_signed));
@@ -286,7 +282,7 @@ BEGIN_TEST(optimize.sdwa.extract_modifiers)

      aco_opcode ext = aco_opcode::p_extract;

-     //! v1: %res0 = v_mul_f32 %a, -%b[0:7]
+     //! v1: %res0 = v_mul_f32 %a, -%b dst_sel:dword src0_sel:dword src1_sel:ubyte0
      //! p_unit_test 0, %res0
      Temp byte0 = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u),
                              Operand::zero());
      Temp neg_byte0 = fneg(byte0);
      writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_byte0));

      //~gfx8! v1: %neg = v_mul_f32 -1.0, %b
-     //~gfx8! v1: %res1 = v_mul_f32 %a, %neg[0:7]
-     //~gfx(9|10)! v1: %neg_byte0 = v_mul_f32 -1.0, %b dst_sel:ubyte0
+     //~gfx8! v1: %res1 = v_mul_f32 %a, %neg dst_sel:dword src0_sel:dword src1_sel:ubyte0
+     //~gfx(9|10)! v1: %neg_byte0 = v_mul_f32 -1.0, %b dst_sel:ubyte0 src0_sel:dword src1_sel:dword
      //~gfx(9|10)! v1: %res1 = v_mul_f32 %a, %neg_byte0
p_unit_test 1, %res1 Temp neg = fneg(inputs[1]); @@ -303,27 +299,27 @@ BEGIN_TEST(optimize.sdwa.extract_modifiers) bld.pseudo(ext, bld.def(v1), neg, Operand::zero(), Operand::c32(8u), Operand::zero()); writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_neg)); - //! v1: %res2 = v_mul_f32 %a, |%b[0:7]| + //! v1: %res2 = v_mul_f32 %a, |%b| dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 2, %res2 Temp abs_byte0 = fabs(byte0); writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], abs_byte0)); //! v1: %abs = v_mul_f32 1.0, |%b| - //! v1: %res3 = v_mul_f32 %a, %abs[0:7] + //! v1: %res3 = v_mul_f32 %a, %abs dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 3, %res3 Temp abs = fabs(inputs[1]); Temp byte0_abs = bld.pseudo(ext, bld.def(v1), abs, Operand::zero(), Operand::c32(8u), Operand::zero()); writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], byte0_abs)); - //! v1: %res4 = v_mul_f32 %1, -|%2[0:7]| + //! v1: %res4 = v_mul_f32 %1, -|%2| dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 4, %res4 Temp neg_abs_byte0 = fneg(abs_byte0); writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_abs_byte0)); //~gfx8! v1: %neg_abs = v_mul_f32 -1.0, %abs - //~gfx8! v1: %res5 = v_mul_f32 %a, %neg_abs[0:7] - //~gfx(9|10)! v1: %neg_abs_byte0 = v_mul_f32 -1.0, %abs dst_sel:ubyte0 + //~gfx8! v1: %res5 = v_mul_f32 %a, %neg_abs dst_sel:dword src0_sel:dword src1_sel:ubyte0 + //~gfx(9|10)! v1: %neg_abs_byte0 = v_mul_f32 -1.0, %abs dst_sel:ubyte0 src0_sel:dword src1_sel:dword //~gfx(9|10)! v1: %res5 = v_mul_f32 %a, %neg_abs_byte0 //! p_unit_test 5, %res5 Temp neg_abs = fneg(abs); @@ -345,7 +341,7 @@ BEGIN_TEST(optimize.sdwa.extract.sgpr) //~gfx8! v1: %byte0_b = p_extract %b, 0, 8, 0 //~gfx8! v1: %res1 = v_mul_f32 %c, %byte0_b - //~gfx(9|10)! v1: %res1 = v_mul_f32 %c, %b[0:7] + //~gfx(9|10)! v1: %res1 = v_mul_f32 %c, %b dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 1, %res1 Temp byte0_b = bld.pseudo(ext, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u), Operand::zero()); @@ -353,7 +349,7 @@ BEGIN_TEST(optimize.sdwa.extract.sgpr) //~gfx8! v1: %byte0_c = p_extract %c, 0, 8, 0 //~gfx8! v1: %res2 = v_mul_f32 %a, %byte0_c - //~gfx(9|10)! v1: %res2 = v_mul_f32 %a, %c[0:7] + //~gfx(9|10)! v1: %res2 = v_mul_f32 %a, %c dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 2, %res2 Temp byte0_c = bld.pseudo(ext, bld.def(v1), inputs[2], Operand::zero(), Operand::c32(8u), Operand::zero()); @@ -361,7 +357,7 @@ BEGIN_TEST(optimize.sdwa.extract.sgpr) //~gfx8! v1: %byte0_c_2 = p_extract %c, 0, 8, 0 //~gfx8! v1: %res3 = v_mul_f32 %c, %byte0_c_2 - //~gfx(9|10)! v1: %res3 = v_mul_f32 %c, %c[0:7] + //~gfx(9|10)! v1: %res3 = v_mul_f32 %c, %c dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 3, %res3 byte0_c = bld.pseudo(ext, bld.def(v1), inputs[2], Operand::zero(), Operand::c32(8u), Operand::zero()); @@ -369,7 +365,7 @@ BEGIN_TEST(optimize.sdwa.extract.sgpr) //~gfx(8|9)! v1: %byte0_c_3 = p_extract %c, 0, 8, 0 //~gfx(8|9)! v1: %res4 = v_mul_f32 %d, %byte0_c_3 - //~gfx10! v1: %res4 = v_mul_f32 %d, %c[0:7] + //~gfx10! v1: %res4 = v_mul_f32 %d, %c dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 4, %res4 byte0_c = bld.pseudo(ext, bld.def(v1), inputs[2], Operand::zero(), Operand::c32(8u), Operand::zero()); @@ -385,7 +381,7 @@ BEGIN_TEST(optimize.sdwa.from_vop3) if (!setup_cs("v1 v1 s1 s1", (chip_class)i)) continue; - //! v1: %res0 = v_mul_f32 -|%a|, %b[0:7] + //! 
v1: %res0 = v_mul_f32 -|%a|, %b dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 0, %res0 Temp byte0_b = bld.pseudo(aco_opcode::p_extract, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u), Operand::zero()); @@ -396,7 +392,7 @@ BEGIN_TEST(optimize.sdwa.from_vop3) //~gfx8! v1: %byte0_b_0 = p_extract %b, 0, 8, 0 //~gfx8! v1: %res1 = v_mul_f32 %a, %byte0_b_0 *4 - //~gfx(9|10)! v1: %res1 = v_mul_f32 %a, %b[0:7] *4 + //~gfx(9|10)! v1: %res1 = v_mul_f32 %a, %b *4 dst_sel:dword src0_sel:dword src1_sel:ubyte0 //! p_unit_test 1, %res1 byte0_b = bld.pseudo(aco_opcode::p_extract, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u), Operand::zero()); @@ -406,7 +402,7 @@ BEGIN_TEST(optimize.sdwa.from_vop3) //~gfx8! v1: %byte0_b_1 = p_extract %b, 0, 8, 0 //~gfx8! v1: %res2 = v_mul_f32 %byte0_b_1, %c - //~gfx(9|10)! v1: %res2 = v_mul_f32 %b[0:7], %c + //~gfx(9|10)! v1: %res2 = v_mul_f32 %b, %c dst_sel:dword src0_sel:ubyte0 src1_sel:dword //! p_unit_test 2, %res2 byte0_b = bld.pseudo(aco_opcode::p_extract, bld.def(v1), inputs[1], Operand::zero(), Operand::c32(8u), Operand::zero()); @@ -435,43 +431,43 @@ BEGIN_TEST(optimize.sdwa.insert) aco_opcode ext = aco_opcode::p_extract; aco_opcode ins = aco_opcode::p_insert; - //~gfx[^7]! v1: %res0 = v_mul_f32 %a, %b dst_sel:ubyte0 + //~gfx[^7]! v1: %res0 = v_mul_f32 %a, %b dst_sel:ubyte0 src0_sel:dword src1_sel:dword //~gfx[^7]! p_unit_test 0, %res0 Temp val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout(0, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u))); - //~gfx[^7]! v1: %res1 = v_mul_f32 %a, %b dst_sel:ubyte1 + //~gfx[^7]! v1: %res1 = v_mul_f32 %a, %b dst_sel:ubyte1 src0_sel:dword src1_sel:dword //~gfx[^7]! p_unit_test 1, %res1 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout(1, bld.pseudo(ins, bld.def(v1), val, Operand::c32(1u), Operand::c32(8u))); - //~gfx[^7]! v1: %res2 = v_mul_f32 %a, %b dst_sel:ubyte2 + //~gfx[^7]! v1: %res2 = v_mul_f32 %a, %b dst_sel:ubyte2 src0_sel:dword src1_sel:dword //~gfx[^7]! p_unit_test 2, %res2 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout(2, bld.pseudo(ins, bld.def(v1), val, Operand::c32(2u), Operand::c32(8u))); - //~gfx[^7]! v1: %res3 = v_mul_f32 %a, %b dst_sel:ubyte3 + //~gfx[^7]! v1: %res3 = v_mul_f32 %a, %b dst_sel:ubyte3 src0_sel:dword src1_sel:dword //~gfx[^7]! p_unit_test 3, %res3 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout(3, bld.pseudo(ins, bld.def(v1), val, Operand::c32(3u), Operand::c32(8u))); - //~gfx[^7]! v1: %res4 = v_mul_f32 %a, %b dst_sel:uword0 + //~gfx[^7]! v1: %res4 = v_mul_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword //~gfx[^7]! p_unit_test 4, %res4 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout(4, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(16u))); - //~gfx[^7]! v1: %res5 = v_mul_f32 %a, %b dst_sel:uword1 + //~gfx[^7]! v1: %res5 = v_mul_f32 %a, %b dst_sel:uword1 src0_sel:dword src1_sel:dword //~gfx[^7]! p_unit_test 5, %res5 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout(5, bld.pseudo(ins, bld.def(v1), val, Operand::c32(1u), Operand::c32(16u))); - //~gfx[^7]! v1: %res6 = v_mul_f32 %a, %b dst_sel:ubyte0 + //~gfx[^7]! v1: %res6 = v_mul_f32 %a, %b dst_sel:ubyte0 src0_sel:dword src1_sel:dword //~gfx[^7]! 
p_unit_test 6, %res6 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout( 6, bld.pseudo(ext, bld.def(v1), val, Operand::zero(), Operand::c32(8u), Operand::zero())); - //~gfx[^7]! v1: %res7 = v_mul_f32 %a, %b dst_sel:uword0 + //~gfx[^7]! v1: %res7 = v_mul_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword //~gfx[^7]! p_unit_test 7, %res7 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], inputs[1]); writeout( @@ -532,20 +528,20 @@ BEGIN_TEST(optimize.sdwa.insert_modifiers) //~gfx8! v1: %tmp0 = v_rcp_f32 %a *2 //~gfx8! v1: %res0 = p_insert %tmp0, 0, 8 - //~gfx9! v1: %res0 = v_rcp_f32 %a *2 dst_sel:ubyte0 + //~gfx9! v1: %res0 = v_rcp_f32 %a *2 dst_sel:ubyte0 src0_sel:dword //! p_unit_test 0, %res0 Temp val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]); val = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), val, Operand::c32(0x40000000u)); writeout(0, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u))); - //! v1: %res1 = v_rcp_f32 %a clamp dst_sel:ubyte0 + //! v1: %res1 = v_rcp_f32 %a clamp dst_sel:ubyte0 src0_sel:dword //! p_unit_test 1, %res1 val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]); val = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), val, Operand::zero(), Operand::c32(0x3f800000u)); writeout(1, bld.pseudo(ins, bld.def(v1), val, Operand::zero(), Operand::c32(8u))); - //! v1: %tmp2 = v_rcp_f32 %a dst_sel:ubyte0 + //! v1: %tmp2 = v_rcp_f32 %a dst_sel:ubyte0 src0_sel:dword //! v1: %res2 = v_mul_f32 %tmp2, 2.0 //! p_unit_test 2, %res2 val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]); @@ -553,7 +549,7 @@ BEGIN_TEST(optimize.sdwa.insert_modifiers) val = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), val, Operand::c32(0x40000000u)); writeout(2, val); - //! v1: %tmp3 = v_rcp_f32 %a dst_sel:ubyte0 + //! v1: %tmp3 = v_rcp_f32 %a dst_sel:ubyte0 src0_sel:dword //! v1: %res3 = v_med3_f32 %tmp3, 0, 1.0 //! p_unit_test 3, %res3 val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]); @@ -564,7 +560,7 @@ BEGIN_TEST(optimize.sdwa.insert_modifiers) //~gfx8! v1: %tmp4 = v_rcp_f32 %a *2 clamp //~gfx8! v1: %res4 = p_insert %tmp4, 0, 8 - //~gfx9! v1: %res4 = v_rcp_f32 %a *2 clamp dst_sel:ubyte0 + //~gfx9! v1: %res4 = v_rcp_f32 %a *2 clamp dst_sel:ubyte0 src0_sel:dword //! p_unit_test 4, %res4 val = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), inputs[0]); val = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), val, Operand::c32(0x40000000u)); diff --git a/mesa 3D driver/src/amd/compiler/tests/test_to_hw_instr.cpp b/mesa 3D driver/src/amd/compiler/tests/test_to_hw_instr.cpp index 4e641111a1..0914bdc14c 100644 --- a/mesa 3D driver/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/mesa 3D driver/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -240,18 +240,18 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx9! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_preserve + //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b)); //~gfx[89]! p_unit_test 2 - //~gfx[89]! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_preserve - //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_preserve - //~gfx[89]! 
v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_preserve - //~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_preserve - //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_preserve + //~gfx[89]! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 + //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0 + //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 + //~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 + //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), Definition(v1_hi, v2b), @@ -262,8 +262,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx9! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_preserve - //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_preserve + //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 + //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_b3, v1b), @@ -274,8 +274,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx9! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_preserve - //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_preserve + //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 + //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b), @@ -286,8 +286,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1] //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1] //~gfx9! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1] - //~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_preserve - //~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_preserve + //~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 + //~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v0_hi, v1b), Definition(v1_lo, v1), @@ -319,9 +319,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx9! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_preserve - //~gfx[89]! 
v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_preserve - //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_preserve + //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 + //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 + //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), @@ -332,26 +332,26 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0] //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx9! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] - //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_preserve + //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), Definition(v0_b3, v1b), Operand(v1_lo, v3b), Operand(v0_lo, v3b), Operand(v1_b3, v1b)); //~gfx[89]! p_unit_test 10 - //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_preserve - //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_preserve - //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_preserve - //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_preserve - //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_preserve - //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_preserve + //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 + //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 + //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1 + //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 + //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 + //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b), Operand(v1_b1, v2b), Operand(v0_b1, v2b)); //~gfx[89]! p_unit_test 11 - //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_preserve + //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1 //~gfx[89]! 
v1: %0:v[0] = v_mov_b32 42 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b), @@ -367,6 +367,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) PhysReg v0_lo{256}; PhysReg v0_hi{256}; PhysReg v0_b1{256}; + PhysReg v1_lo{257}; PhysReg v1_hi{257}; v0_hi.reg_b += 2; v0_b1.reg_b += 1; @@ -419,7 +420,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) /* 16-bit copy */ //! p_unit_test 6 - //! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_preserve + //! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:dword bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x3800)); @@ -438,23 +439,31 @@ BEGIN_TEST(to_hw_instr.subdword_constant) bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205)); //! p_unit_test 9 - //! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_preserve - //! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_preserve + //! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_sel:ubyte1 dst_preserve src0_sel:dword + //! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_sel:ubyte2 dst_preserve src0_sel:dword bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x3800)); //! p_unit_test 10 - //! v1b: %_:v[0][8:16] = v_mov_b32 5 dst_preserve - //! v1b: %_:v[0][16:24] = v_mul_u32_u24 2, 33 dst_preserve + //! v1b: %_:v[0][8:16] = v_mov_b32 5 dst_sel:ubyte1 dst_preserve src0_sel:dword + //! v1b: %_:v[0][16:24] = v_mul_u32_u24 2, 33 dst_sel:ubyte2 dst_preserve src0_sel:dword src1_sel:dword bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x4205)); /* 8-bit copy */ //! p_unit_test 11 - //! v1b: %_:v[0][0:8] = v_mul_u32_u24 2, 33 dst_preserve + //! v1b: %_:v[0][0:8] = v_mul_u32_u24 2, 33 dst_sel:ubyte0 dst_preserve src0_sel:dword src1_sel:dword bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0x42)); + /* 32-bit and 8-bit copy */ + //! p_unit_test 12 + //! v1: %_:v[0] = v_mov_b32 0 + //! v1b: %_:v[1][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b), + Operand::zero(), Operand::zero(1)); + //! s_endpgm finish_to_hw_instr_test(); @@ -505,7 +514,7 @@ BEGIN_TEST(to_hw_instr.extract) //; funcs['v_shr'] = lambda _: 'v_ashrrev_i32' if variant.endswith('_signed') else 'v_lshrrev_b32' //; funcs['s_bfe'] = lambda _: 's_bfe_i32' if variant.endswith('_signed') else 's_bfe_u32' //; funcs['s_shr'] = lambda _: 's_ashr_i32' if variant.endswith('_signed') else 's_lshr_b32' - //; funcs['sel'] = lambda bits: ('sext(%%_:v[1])[%s]' if variant.endswith('_signed') else '%%_:v[1][%s]') % bits + //; funcs['byte'] = lambda n: '%cbyte%s' % ('s' if variant.endswith('_signed') else 'u', n) //>> p_unit_test 0 bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); @@ -554,15 +563,15 @@ BEGIN_TEST(to_hw_instr.extract) //>> p_unit_test 4 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8 - //~gfx[^7].*! v2b: %_:v[0][0:16] = v_mov_b32 @sel(0:7) + //~gfx[^7].*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0) EXT(0, 0) - //~gfx[^7].*! 
v2b: %_:v[0][0:16] = v_mov_b32 @sel(16:23) + //~gfx[^7].*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2) if (i != GFX7) EXT(0, 2) //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8 - //~gfx[^7].*! v2b: %_:v[0][0:16] = v_mov_b32 @sel(8:15) + //~gfx[^7].*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1) EXT(1, 0) - //~gfx[^7].*! v2b: %_:v[0][0:16] = v_mov_b32 @sel(24:31) + //~gfx[^7].*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3) if (i != GFX7) EXT(1, 2) @@ -595,11 +604,11 @@ BEGIN_TEST(to_hw_instr.insert) INS(0, 8) //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8 //~gfx7! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0] - //~gfx[^7]! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte1 + //~gfx[^7]! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte1 src0_sel:dword INS(1, 8) //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8 //~gfx7! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[0] - //~gfx[^7]! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte2 + //~gfx[^7]! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte2 src0_sel:dword INS(2, 8) //! v1: %0:v[0] = v_lshlrev_b32 24, %0:v[1] INS(3, 8) @@ -640,15 +649,15 @@ BEGIN_TEST(to_hw_instr.insert) //>> p_unit_test 2 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); //~gfx7! v2b: %_:v[0][0:16] = v_bfe_u32 %_:v[1][0:16], 0, 8 - //~gfx[^7]! v1: %_:v[0] = v_mov_b32 %_:v[1][0:16] dst_sel:ubyte0 dst_preserve + //~gfx[^7]! v2b: %0:v[0][0:16] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0 INS(0, 0) - //~gfx[^7]! v1: %_:v[0] = v_mov_b32 %_:v[1][0:16] dst_sel:ubyte2 dst_preserve + //~gfx[^7]! v2b: %0:v[0][16:32] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0 if (i != GFX7) INS(0, 2) //~gfx7! v2b: %_:v[0][0:16] = v_lshlrev_b32 8, %_:v[1][0:16] - //~gfx[^7]! v1: %_:v[0] = v_mov_b32 %_:v[1][0:16] dst_sel:ubyte1 dst_preserve + //~gfx[^7]! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0 INS(1, 0) - //~gfx[^7]! v1: %_:v[0] = v_mov_b32 %_:v[1][0:16] dst_sel:ubyte3 dst_preserve + //~gfx[^7]! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0 if (i != GFX7) INS(1, 2) @@ -659,3 +668,57 @@ BEGIN_TEST(to_hw_instr.insert) //! s_endpgm } END_TEST + +BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc) + if (!setup_cs(NULL, GFX10)) + return; + + PhysReg reg_s0{0}; + PhysReg reg_s1{1}; + PhysReg v0_lo{256}; + PhysReg v0_b3{256}; + v0_b3.reg_b += 3; + PhysReg v1_lo{257}; + + //>> p_unit_test 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); + + /* It would be better if the scc=s0 copy was done later, but handle_operands() is complex + * enough + */ + + //! s1: %0:scc = s_cmp_lg_i32 %0:s[0], 0 + //! s1: %0:m0 = s_mov_b32 %0:scc + //! lv1: %0:v[0] = v_mov_b32 %0:v[1] + //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec + //! lv1: %0:v[0] = v_mov_b32 %0:v[1] + //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec + //! 
s1: %0:scc = s_cmp_lg_i32 %0:m0, 0 + Instruction *instr = bld.pseudo( + aco_opcode::p_parallelcopy, + Definition(scc, s1), Definition(v0_lo, v1.as_linear()), + Operand(reg_s0, s1), Operand(v1_lo, v1.as_linear())); + instr->pseudo().scratch_sgpr = m0; + + finish_to_hw_instr_test(); +END_TEST + +BEGIN_TEST(to_hw_instr.swap_linear_vgpr) + if (!setup_cs(NULL, GFX10)) + return; + + PhysReg reg_v0{256}; + PhysReg reg_v1{257}; + RegClass v1_linear = v1.as_linear(); + + //>> p_unit_test 0 + bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); + + Instruction *instr = bld.pseudo( + aco_opcode::p_parallelcopy, + Definition(reg_v0, v1_linear), Definition(reg_v1, v1_linear), + Operand(reg_v1, v1_linear), Operand(reg_v0, v1_linear)); + instr->pseudo().scratch_sgpr = m0; + + finish_to_hw_instr_test(); +END_TEST diff --git a/mesa 3D driver/src/amd/llvm/ac_llvm_build.c b/mesa 3D driver/src/amd/llvm/ac_llvm_build.c index c605772d64..815410412a 100644 --- a/mesa 3D driver/src/amd/llvm/ac_llvm_build.c +++ b/mesa 3D driver/src/amd/llvm/ac_llvm_build.c @@ -2076,6 +2076,10 @@ static const char *get_atomic_name(enum ac_atomic_op op) return "inc"; case ac_atomic_dec_wrap: return "dec"; + case ac_atomic_fmin: + return "fmin"; + case ac_atomic_fmax: + return "fmax"; } unreachable("bad atomic op"); } @@ -3344,6 +3348,16 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LL LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, ""); + /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK + * resource descriptor is 0 (invalid). + */ + LLVMValueRef tmp; + tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); + tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); + tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); + fmask_value = + LLVMBuildSelect(ac->builder, tmp, fmask_value, LLVMConstInt(ac->i32, 0x76543210, false), ""); + /* Apply the formula. */ unsigned sample_chan = is_array_tex ? 3 : 2; LLVMValueRef final_sample; @@ -3353,20 +3367,9 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LL LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), ""); /* Mask the sample index by 0x7, because 0x8 means an unknown value * with EQAA, so those will map to 0. */ - final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), ""); + addr[sample_chan] = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), ""); if (fmask_load.a16) - final_sample = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, ""); - - /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK - * resource descriptor is 0 (invalid). - */ - LLVMValueRef tmp; - tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, ""); - tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, ""); - tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, ""); - - /* Replace the MSAA sample index. 
*/ - addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], ""); + addr[sample_chan] = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, ""); } static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, @@ -4597,6 +4600,22 @@ void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wav ac_build_endif(ctx, 5020); } + +LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx, + const struct ac_shader_args *args) +{ + /* Use the following trick to extract the edge flags: + * extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10 + * shifted = v_mul_u32_u24 extracted, 0x80402u ; shift the bits: 8->9, 9->19, 10->29 + * result = v_and_b32 shifted, 0x20080200 ; remove garbage + */ + LLVMValueRef tmp = LLVMBuildAnd(ctx->builder, + ac_get_arg(ctx, args->gs_invocation_id), + LLVMConstInt(ctx->i32, 0x700, 0), ""); + tmp = LLVMBuildMul(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x80402u, 0), ""); + return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), ""); +} + LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim) { /* The prim export format is: @@ -4611,13 +4630,11 @@ LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ng LLVMBuilderRef builder = ctx->builder; LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, ""); LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), ""); + result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, ""); for (unsigned i = 0; i < prim->num_vertices; ++i) { tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), ""); result = LLVMBuildOr(builder, result, tmp, ""); - tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, ""); - tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), ""); - result = LLVMBuildOr(builder, result, tmp, ""); } return result; } @@ -4759,3 +4776,13 @@ void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LL LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], ""); memcpy(index, out, sizeof(out)); } + +LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a) +{ + LLVMValueRef args[2] = { + a, + LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0), + }; + return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, args, 2, + AC_FUNC_ATTR_READNONE); +} diff --git a/mesa 3D driver/src/amd/llvm/ac_llvm_build.h b/mesa 3D driver/src/amd/llvm/ac_llvm_build.h index 965ba28b7e..1f38cbdae0 100644 --- a/mesa 3D driver/src/amd/llvm/ac_llvm_build.h +++ b/mesa 3D driver/src/amd/llvm/ac_llvm_build.h @@ -388,6 +388,8 @@ enum ac_atomic_op ac_atomic_xor, ac_atomic_inc_wrap, ac_atomic_dec_wrap, + ac_atomic_fmin, + ac_atomic_fmax, }; /* These cache policy bits match the definitions used by the LLVM intrinsics. 
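Note on ac_pack_edgeflags_for_export above: the single v_mul_u32_u24 really does act as three shifts at once. A minimal standalone C sketch of the same arithmetic (plain C, no LLVM needed; the constants 0x700, 0x80402 and 0x20080200 are the ones from the hunk above, and the target bit positions 10*i + 9 match the prim export format described in ac_pack_prim_export):

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Keep gs_invocation_id bits 8..10, copy them to bits 9, 19 and 29 with one
    * multiply (0x80402 = 1<<1 | 1<<10 | 1<<19, i.e. three shifts summed), then
    * mask off the cross terms. */
   static uint32_t pack_edgeflags(uint32_t gs_invocation_id)
   {
      uint32_t extracted = gs_invocation_id & 0x700;
      uint32_t shifted = extracted * 0x80402u;
      return shifted & 0x20080200;
   }

   int main(void)
   {
      for (uint32_t flags = 0; flags < 8; flags++) {
         /* The edge flag of vertex i must land at bit 10*i + 9. */
         uint32_t expected = ((flags >> 0) & 1) << 9 |
                             ((flags >> 1) & 1) << 19 |
                             ((flags >> 2) & 1) << 29;
         assert(pack_edgeflags(flags << 8) == expected);
      }
      printf("ok\n");
      return 0;
   }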
*/ @@ -581,10 +583,12 @@ struct ac_ngg_prim { unsigned num_vertices; LLVMValueRef isnull; LLVMValueRef index[3]; - LLVMValueRef edgeflag[3]; + LLVMValueRef edgeflags; LLVMValueRef passthrough; }; +LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx, + const struct ac_shader_args *args); LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim); void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim); @@ -611,6 +615,7 @@ void ac_build_s_endpgm(struct ac_llvm_context *ctx); void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd, LLVMValueRef flatshade_first, LLVMValueRef index[3]); +LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a); #ifdef __cplusplus } diff --git a/mesa 3D driver/src/amd/llvm/ac_llvm_cull.c b/mesa 3D driver/src/amd/llvm/ac_llvm_cull.c index 3ef1dab831..681c186cd3 100644 --- a/mesa 3D driver/src/amd/llvm/ac_llvm_cull.c +++ b/mesa 3D driver/src/amd/llvm/ac_llvm_cull.c @@ -52,7 +52,7 @@ struct ac_position_w_info { }; static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - struct ac_position_w_info *w) + struct ac_position_w_info *w, unsigned num_vertices) { LLVMBuilderRef builder = ctx->builder; LLVMValueRef all_w_negative = ctx->i1true; @@ -60,7 +60,7 @@ static void ac_analyze_position_w(struct ac_llvm_context *ctx, LLVMValueRef pos[ w->w_reflection = ctx->i1false; w->any_w_negative = ctx->i1false; - for (unsigned i = 0; i < 3; i++) { + for (unsigned i = 0; i < num_vertices; i++) { LLVMValueRef neg_w; neg_w = LLVMBuildFCmp(builder, LLVMRealOLT, pos[i][3], ctx->f32_0, ""); @@ -92,9 +92,9 @@ static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3 LLVMValueRef det_t1 = LLVMBuildFSub(builder, pos[1][1], pos[0][1], ""); LLVMValueRef det_t2 = LLVMBuildFSub(builder, pos[0][0], pos[1][0], ""); LLVMValueRef det_t3 = LLVMBuildFSub(builder, pos[0][1], pos[2][1], ""); - LLVMValueRef det_p0 = LLVMBuildFMul(builder, det_t0, det_t1, ""); - LLVMValueRef det_p1 = LLVMBuildFMul(builder, det_t2, det_t3, ""); - LLVMValueRef det = LLVMBuildFSub(builder, det_p0, det_p1, ""); + /* t0 * t1 - t2 * t3 = t2 * -t3 + t0 * t1 = fma(t2, -t3, t0 * t1) */ + LLVMValueRef det = ac_build_fmad(ctx, det_t2, LLVMBuildFNeg(builder, det_t3, ""), + LLVMBuildFMul(builder, det_t0, det_t1, "")); /* Negative W negates the determinant. */ det = LLVMBuildSelect(builder, w->w_reflection, LLVMBuildFNeg(builder, det, ""), det, ""); @@ -109,6 +109,14 @@ static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3 } else if (cull_zero_area) { accepted = LLVMBuildFCmp(builder, LLVMRealONE, det, ctx->f32_0, ""); } + + if (accepted) { + /* Don't reject NaN and +/-infinity, these are tricky. + * Just trust fixed-function HW to handle these cases correctly. 
+ */ + accepted = LLVMBuildOr(builder, accepted, ac_build_is_inf_or_nan(ctx, det), ""); + } + return accepted; } @@ -117,14 +125,13 @@ static LLVMValueRef ac_cull_face(struct ac_llvm_context *ctx, LLVMValueRef pos[3 static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], LLVMValueRef initially_accepted, struct ac_position_w_info *w, LLVMValueRef vp_scale[2], LLVMValueRef vp_translate[2], - LLVMValueRef small_prim_precision, bool cull_view_xy, - bool cull_view_near_z, bool cull_view_far_z, bool cull_small_prims, - bool use_halfz_clip_space, ac_cull_accept_func accept_func, - void *userdata) + LLVMValueRef small_prim_precision, struct ac_cull_options *options, + ac_cull_accept_func accept_func, void *userdata) { LLVMBuilderRef builder = ctx->builder; - if (!cull_view_xy && !cull_view_near_z && !cull_view_far_z && !cull_small_prims) { + if (!options->cull_view_xy && !options->cull_view_near_z && !options->cull_view_far_z && + !options->cull_small_prims) { if (accept_func) accept_func(ctx, initially_accepted, userdata); return; @@ -136,27 +143,31 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], LLVMValueRef accepted = ctx->i1true; /* Compute the primitive bounding box for easy culling. */ - for (unsigned chan = 0; chan < (cull_view_near_z || cull_view_far_z ? 3 : 2); chan++) { + for (unsigned chan = 0; chan < (options->cull_view_near_z || + options->cull_view_far_z ? 3 : 2); chan++) { + assert(options->num_vertices >= 2); bbox_min[chan] = ac_build_fmin(ctx, pos[0][chan], pos[1][chan]); - bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); - bbox_max[chan] = ac_build_fmax(ctx, pos[0][chan], pos[1][chan]); - bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + + if (options->num_vertices == 3) { + bbox_min[chan] = ac_build_fmin(ctx, bbox_min[chan], pos[2][chan]); + bbox_max[chan] = ac_build_fmax(ctx, bbox_max[chan], pos[2][chan]); + } } /* View culling. */ - if (cull_view_xy || cull_view_near_z || cull_view_far_z) { + if (options->cull_view_xy || options->cull_view_near_z || options->cull_view_far_z) { for (unsigned chan = 0; chan < 3; chan++) { LLVMValueRef visible; - if ((cull_view_xy && chan <= 1) || (cull_view_near_z && chan == 2)) { - float t = chan == 2 && use_halfz_clip_space ? 0 : -1; + if ((options->cull_view_xy && chan <= 1) || (options->cull_view_near_z && chan == 2)) { + float t = chan == 2 && options->use_halfz_clip_space ? 0 : -1; visible = LLVMBuildFCmp(builder, LLVMRealOGE, bbox_max[chan], LLVMConstReal(ctx->f32, t), ""); accepted = LLVMBuildAnd(builder, accepted, visible, ""); } - if ((cull_view_xy && chan <= 1) || (cull_view_far_z && chan == 2)) { + if ((options->cull_view_xy && chan <= 1) || (options->cull_view_far_z && chan == 2)) { visible = LLVMBuildFCmp(builder, LLVMRealOLE, bbox_min[chan], ctx->f32_1, ""); accepted = LLVMBuildAnd(builder, accepted, visible, ""); } @@ -164,7 +175,7 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], } /* Small primitive elimination. */ - if (cull_small_prims) { + if (options->cull_small_prims) { /* Assuming a sample position at (0.5, 0.5), if we round * the bounding box min/max extents and the results of * the rounding are equal in either the X or Y direction, @@ -224,14 +235,14 @@ static void cull_bbox(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], * \param options See ac_cull_options. * \param accept_func Callback invoked in the inner-most branch where the primitive is accepted. 
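Note on the determinant change above: it relies on the identity t0*t1 - t2*t3 = t2*(-t3) + t0*t1 = fma(t2, -t3, t0*t1). A small scalar sketch of the same rearrangement, with fmaf standing in for ac_build_fmad and made-up input values; the fused form skips the intermediate rounding of t2*t3, so the two results may differ in the last ulp:

   #include <math.h>
   #include <stdio.h>

   /* Two-multiply form, as the old code computed it. */
   static float det_mul(float t0, float t1, float t2, float t3)
   {
      return t0 * t1 - t2 * t3;
   }

   /* Fused form matching the new code: fma(t2, -t3, t0 * t1). */
   static float det_fma(float t0, float t1, float t2, float t3)
   {
      return fmaf(t2, -t3, t0 * t1);
   }

   int main(void)
   {
      float t0 = 0.25f, t1 = -3.0f, t2 = 1.5f, t3 = 0.125f;
      printf("mul: %a\nfma: %a\n", det_mul(t0, t1, t2, t3), det_fma(t0, t1, t2, t3));
      return 0;
   }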
 */
-void ac_cull_triangle(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
-                      LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2],
-                      LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision,
-                      struct ac_cull_options *options, ac_cull_accept_func accept_func,
-                      void *userdata)
+void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
+                       LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2],
+                       LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision,
+                       struct ac_cull_options *options, ac_cull_accept_func accept_func,
+                       void *userdata)
 {
    struct ac_position_w_info w;
-   ac_analyze_position_w(ctx, pos, &w);
+   ac_analyze_position_w(ctx, pos, &w, options->num_vertices);
 
    /* W culling. */
    LLVMValueRef accepted = options->cull_w ? w.w_accepted : ctx->i1true;
@@ -244,8 +255,6 @@ void ac_cull_triangle(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4],
                           "");
 
    /* View culling and small primitive elimination. */
-   cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, small_prim_precision,
-             options->cull_view_xy, options->cull_view_near_z, options->cull_view_far_z,
-             options->cull_small_prims, options->use_halfz_clip_space, accept_func,
-             userdata);
+   cull_bbox(ctx, pos, accepted, &w, vp_scale, vp_translate, small_prim_precision, options,
+             accept_func, userdata);
 }
diff --git a/mesa 3D driver/src/amd/llvm/ac_llvm_cull.h b/mesa 3D driver/src/amd/llvm/ac_llvm_cull.h
index 676587d5b2..db1dcdde9f 100644
--- a/mesa 3D driver/src/amd/llvm/ac_llvm_cull.h
+++ b/mesa 3D driver/src/amd/llvm/ac_llvm_cull.h
@@ -46,16 +46,18 @@ struct ac_cull_options {
    bool cull_w;                /* cull primitives with all W < 0 */
 
    bool use_halfz_clip_space;
+
+   uint8_t num_vertices; /* 1..3 */
 };
 
 /* Callback invoked in the inner-most branch where the primitive is accepted.
*/ typedef void (*ac_cull_accept_func)(struct ac_llvm_context *ctx, LLVMValueRef accepted, void *userdata); -void ac_cull_triangle(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], - LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2], - LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision, - struct ac_cull_options *options, ac_cull_accept_func accept_func, - void *userdata); +void ac_cull_primitive(struct ac_llvm_context *ctx, LLVMValueRef pos[3][4], + LLVMValueRef initially_accepted, LLVMValueRef vp_scale[2], + LLVMValueRef vp_translate[2], LLVMValueRef small_prim_precision, + struct ac_cull_options *options, ac_cull_accept_func accept_func, + void *userdata); #endif diff --git a/mesa 3D driver/src/amd/llvm/ac_llvm_helper.cpp b/mesa 3D driver/src/amd/llvm/ac_llvm_helper.cpp index 0316e1d9f9..b9dd89f29f 100644 --- a/mesa 3D driver/src/amd/llvm/ac_llvm_helper.cpp +++ b/mesa 3D driver/src/amd/llvm/ac_llvm_helper.cpp @@ -60,7 +60,7 @@ bool ac_is_sgpr_param(LLVMValueRef arg) llvm::Argument *A = llvm::unwrap(arg); llvm::AttributeList AS = A->getParent()->getAttributes(); unsigned ArgNo = A->getArgNo(); - return AS.hasAttribute(ArgNo + 1, llvm::Attribute::InReg); + return AS.hasParamAttr(ArgNo, llvm::Attribute::InReg); } LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call) diff --git a/mesa 3D driver/src/amd/llvm/ac_nir_to_llvm.c b/mesa 3D driver/src/amd/llvm/ac_nir_to_llvm.c index f0f22a6462..bd72e39a0a 100644 --- a/mesa 3D driver/src/amd/llvm/ac_nir_to_llvm.c +++ b/mesa 3D driver/src/amd/llvm/ac_nir_to_llvm.c @@ -56,6 +56,13 @@ struct ac_nir_context { LLVMValueRef main_function; LLVMBasicBlockRef continue_block; LLVMBasicBlockRef break_block; + + LLVMValueRef vertex_id_replaced; + LLVMValueRef instance_id_replaced; + LLVMValueRef tes_u_replaced; + LLVMValueRef tes_v_replaced; + LLVMValueRef tes_rel_patch_id_replaced; + LLVMValueRef tes_patch_id_replaced; }; static LLVMValueRef get_sampler_desc_index(struct ac_nir_context *ctx, nir_deref_instr *deref_instr, @@ -582,6 +589,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) case nir_op_cube_face_index_amd: src_components = 3; break; + case nir_op_pack_32_4x8: case nir_op_pack_64_4x16: src_components = 4; break; @@ -625,6 +633,15 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) else result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], ""); break; + case nir_op_uadd_sat: + case nir_op_iadd_sat: { + char name[64], type[64]; + ac_build_type_name_for_intr(def_type, type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.%cadd.sat.%s", + instr->op == nir_op_uadd_sat ? 'u' : 's', type); + result = ac_build_intrinsic(&ctx->ac, name, def_type, src, 2, AC_FUNC_ATTR_READNONE); + break; + } case nir_op_fadd: src[0] = ac_to_float(&ctx->ac, src[0]); src[1] = ac_to_float(&ctx->ac, src[1]); @@ -1153,6 +1170,7 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; } + case nir_op_pack_32_4x8: case nir_op_pack_32_2x16: { result = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.i32, ""); @@ -1236,6 +1254,40 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; } + case nir_op_sdot_4x8_iadd: + case nir_op_udot_4x8_uadd: + case nir_op_sdot_4x8_iadd_sat: + case nir_op_udot_4x8_uadd_sat: { + const char *name = instr->op == nir_op_sdot_4x8_iadd || + instr->op == nir_op_sdot_4x8_iadd_sat + ? 
"llvm.amdgcn.sdot4" : "llvm.amdgcn.udot4"; + src[3] = LLVMConstInt(ctx->ac.i1, instr->op == nir_op_sdot_4x8_iadd_sat || + instr->op == nir_op_udot_4x8_uadd_sat, false); + result = ac_build_intrinsic(&ctx->ac, name, def_type, src, 4, AC_FUNC_ATTR_READNONE); + break; + } + + case nir_op_sdot_2x16_iadd: + case nir_op_udot_2x16_uadd: + case nir_op_sdot_2x16_iadd_sat: + case nir_op_udot_2x16_uadd_sat: { + const char *name = instr->op == nir_op_sdot_2x16_iadd || + instr->op == nir_op_sdot_2x16_iadd_sat + ? "llvm.amdgcn.sdot2" : "llvm.amdgcn.udot2"; + src[0] = LLVMBuildBitCast(ctx->ac.builder, src[0], ctx->ac.v2i16, ""); + src[1] = LLVMBuildBitCast(ctx->ac.builder, src[1], ctx->ac.v2i16, ""); + src[3] = LLVMConstInt(ctx->ac.i1, instr->op == nir_op_sdot_2x16_iadd_sat || + instr->op == nir_op_udot_2x16_uadd_sat, false); + result = ac_build_intrinsic(&ctx->ac, name, def_type, src, 4, AC_FUNC_ATTR_READNONE); + break; + } + + case nir_op_sad_u8x4: + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32, + (LLVMValueRef[]){src[0], src[1], src[2]}, 3, + AC_FUNC_ATTR_READNONE); + break; + default: fprintf(stderr, "Unknown NIR alu instr: "); nir_print_instr(&instr->instr, stderr); @@ -1521,8 +1573,8 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, const nir_te case nir_texop_lod: args->opcode = ac_image_get_lod; break; - case nir_texop_fragment_fetch: - case nir_texop_fragment_mask_fetch: + case nir_texop_fragment_fetch_amd: + case nir_texop_fragment_mask_fetch_amd: args->opcode = ac_image_load; args->level_zero = false; break; @@ -1592,7 +1644,12 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_int offset -= ctx->args->base_inline_push_consts; - unsigned num_inline_push_consts = ctx->args->num_inline_push_consts; + unsigned num_inline_push_consts = 0; + for (unsigned i = 0; i < ARRAY_SIZE(ctx->args->inline_push_consts); i++) { + if (ctx->args->inline_push_consts[i].used) + num_inline_push_consts++; + } + if (offset + count <= num_inline_push_consts) { LLVMValueRef *const push_constants = alloca(num_inline_push_consts * sizeof(LLVMValueRef)); for (unsigned i = 0; i < num_inline_push_consts; i++) @@ -1927,6 +1984,12 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_ case nir_intrinsic_ssbo_atomic_comp_swap: op = "cmpswap"; break; + case nir_intrinsic_ssbo_atomic_fmin: + op = "fmin"; + break; + case nir_intrinsic_ssbo_atomic_fmax: + op = "fmax"; + break; default: abort(); } @@ -1937,10 +2000,17 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_ result = emit_ssbo_comp_swap_64(ctx, descriptor, get_src(ctx, instr->src[1]), get_src(ctx, instr->src[2]), get_src(ctx, instr->src[3]), false); } else { + LLVMValueRef data = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) { params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0); } - params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_fmin || + instr->intrinsic == nir_intrinsic_ssbo_atomic_fmax) { + data = ac_to_float(&ctx->ac, data); + return_type = LLVMTypeOf(data); + } + params[arg_count++] = data; params[arg_count++] = descriptor; params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */ params[arg_count++] = ctx->ac.i32_0; /* soffset */ @@ -1950,6 +2020,11 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, 
nir_intrinsic_ snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type); result = ac_build_intrinsic(&ctx->ac, name, return_type, params, arg_count, 0); + + if (instr->intrinsic == nir_intrinsic_ssbo_atomic_fmin || + instr->intrinsic == nir_intrinsic_ssbo_atomic_fmax) { + result = ac_to_integer(&ctx->ac, result); + } } result = exit_waterfall(ctx, &wctx, result); @@ -2092,7 +2167,13 @@ static LLVMValueRef visit_global_atomic(struct ac_nir_context *ctx, /* use "singlethread" sync scope to implement relaxed ordering */ const char *sync_scope = "singlethread-one-as"; - LLVMTypeRef ptr_type = LLVMPointerType(LLVMTypeOf(data), AC_ADDR_SPACE_GLOBAL); + if (instr->intrinsic == nir_intrinsic_global_atomic_fmin || + instr->intrinsic == nir_intrinsic_global_atomic_fmax) { + data = ac_to_float(&ctx->ac, data); + } + + LLVMTypeRef data_type = LLVMTypeOf(data); + LLVMTypeRef ptr_type = LLVMPointerType(data_type, AC_ADDR_SPACE_GLOBAL); addr = LLVMBuildIntToPtr(ctx->ac.builder, addr, ptr_type, ""); @@ -2100,6 +2181,21 @@ static LLVMValueRef visit_global_atomic(struct ac_nir_context *ctx, LLVMValueRef data1 = get_src(ctx, instr->src[2]); result = ac_build_atomic_cmp_xchg(&ctx->ac, addr, data, data1, sync_scope); result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + } else if (instr->intrinsic == nir_intrinsic_global_atomic_fmin || + instr->intrinsic == nir_intrinsic_global_atomic_fmax) { + const char *op = instr->intrinsic == nir_intrinsic_global_atomic_fmin ? "fmin" : "fmax"; + char name[64], type[8]; + LLVMValueRef params[2]; + int arg_count = 0; + + params[arg_count++] = addr; + params[arg_count++] = data; + + ac_build_type_name_for_intr(data_type, type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.global.atomic.%s.%s.p1%s.%s", op, type, type, type); + + result = ac_build_intrinsic(&ctx->ac, name, data_type, params, arg_count, 0); + result = ac_to_integer(&ctx->ac, result); } else { switch (instr->intrinsic) { case nir_intrinsic_global_atomic_add: @@ -2296,6 +2392,9 @@ static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx, LLVMValueRef coord_z, LLVMValueRef sample_index, LLVMValueRef fmask_desc_ptr) { + if (!fmask_desc_ptr) + return sample_index; + unsigned sample_chan = coord_z ? 
3 : 2;
    LLVMValueRef addr[4] = {coord_x, coord_y, coord_z};
    addr[sample_chan] = sample_index;
@@ -2674,6 +2773,14 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int
       atomic_name = "dec";
       atomic_subop = ac_atomic_dec_wrap;
       break;
+   case nir_intrinsic_image_deref_atomic_fmin:
+      atomic_name = "fmin";
+      atomic_subop = ac_atomic_fmin;
+      break;
+   case nir_intrinsic_image_deref_atomic_fmax:
+      atomic_name = "fmax";
+      atomic_subop = ac_atomic_fmax;
+      break;
    default:
       abort();
    }
@@ -2682,6 +2789,9 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int
       params[param_count++] = get_src(ctx, instr->src[4]);
    params[param_count++] = get_src(ctx, instr->src[3]);
 
+   if (atomic_subop == ac_atomic_fmin || atomic_subop == ac_atomic_fmax)
+      params[0] = ac_to_float(&ctx->ac, params[0]);
+
    LLVMValueRef result;
    if (dim == GLSL_SAMPLER_DIM_BUF) {
       params[param_count++] = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_BUFFER, true);
@@ -2691,12 +2801,16 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int
       if (cmpswap && instr->dest.ssa.bit_size == 64) {
          result = emit_ssbo_comp_swap_64(ctx, params[2], params[3], params[1], params[0], true);
       } else {
+         LLVMTypeRef data_type = LLVMTypeOf(params[0]);
+         char type[8];
+
          params[param_count++] = ctx->ac.i32_0; /* soffset */
          params[param_count++] = ctx->ac.i32_0; /* slc */
 
+         ac_build_type_name_for_intr(data_type, type, sizeof(type));
         length = snprintf(intrinsic_name, sizeof(intrinsic_name),
-                           "llvm.amdgcn.struct.buffer.atomic.%s.%s", atomic_name,
-                           instr->dest.ssa.bit_size == 64 ? "i64" : "i32");
+                           "llvm.amdgcn.struct.buffer.atomic.%s.%s",
+                           atomic_name, type);
          assert(length < sizeof(intrinsic_name));
          result = ac_build_intrinsic(&ctx->ac, intrinsic_name, LLVMTypeOf(params[0]), params,
                                      param_count, 0);
@@ -2728,6 +2842,18 @@ static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx, nir_intrinsi
    LLVMValueRef rsrc = get_image_descriptor(ctx, instr, dynamic_index, AC_DESC_IMAGE, false);
    LLVMValueRef ret = ac_build_image_get_sample_count(&ctx->ac, rsrc);
+   if (ctx->abi->robust_buffer_access) {
+      LLVMValueRef dword1, is_null_descriptor;
+
+      /* Extract the second dword of the descriptor, if it's
+       * all zero, then it's a null descriptor.
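Note on the FMASK handling above: a CPU model of the remapping these hunks emit, assuming the usual FMASK layout of one 4-bit nibble per sample. 0x76543210 is the identity mapping substituted when WORD1.DATA_FORMAT of the descriptor is 0 (a null descriptor), and the final mask by 0x7 sends the EQAA "unknown" value 0x8 to 0:

   #include <assert.h>
   #include <stdint.h>

   /* Select the nibble for sample_index, then mask by 0x7 so that the EQAA
    * "unknown" value 0x8 maps to 0. */
   static unsigned remap_sample(uint32_t fmask_value, unsigned sample_index)
   {
      unsigned final_sample = (fmask_value >> (sample_index * 4)) & 0xf;
      return final_sample & 0x7;
   }

   int main(void)
   {
      /* With the identity FMASK, every sample maps to itself. */
      for (unsigned s = 0; s < 8; s++)
         assert(remap_sample(0x76543210u, s) == s);
      /* 0x8 ("unknown") maps to 0. */
      assert(remap_sample(0x8u, 0) == 0);
      return 0;
   }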
+ */ + dword1 = + LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, false), ""); + is_null_descriptor = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, dword1, + LLVMConstInt(ctx->ac.i32, 0, false), ""); + ret = LLVMBuildSelect(ctx->ac.builder, is_null_descriptor, ctx->ac.i32_0, ret, ""); + } return exit_waterfall(ctx, &wctx, ret); } @@ -2821,7 +2947,8 @@ static void emit_discard(struct ac_nir_context *ctx, const nir_intrinsic_instr * instr->intrinsic == nir_intrinsic_terminate_if) { cond = LLVMBuildNot(ctx->ac.builder, get_src(ctx, instr->src[0]), ""); } else { - assert(instr->intrinsic == nir_intrinsic_discard); + assert(instr->intrinsic == nir_intrinsic_discard || + instr->intrinsic == nir_intrinsic_terminate); cond = ctx->ac.i1false; } @@ -2908,6 +3035,8 @@ static LLVMValueRef visit_load_subgroup_id(struct ac_nir_context *ctx) result = LLVMBuildAnd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->tg_size), LLVMConstInt(ctx->ac.i32, 0xfc0, false), ""); return LLVMBuildLShr(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 6, false), ""); + } else if (ctx->args->merged_wave_info.used) { + return ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 24, 4); } else { return LLVMConstInt(ctx->ac.i32, 0, false); } @@ -2938,15 +3067,18 @@ static LLVMValueRef visit_first_invocation(struct ac_nir_context *ctx) static LLVMValueRef visit_load_shared(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr) { - unsigned alignment = nir_intrinsic_align(instr); + LLVMValueRef values[4], derived_ptr, index, ret; unsigned const_off = nir_intrinsic_base(instr); LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->dest.ssa.bit_size, const_off); - LLVMTypeRef result_type = get_def_type(ctx, &instr->dest.ssa); - int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr)); - LLVMValueRef derived_ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, LLVMPointerType(result_type, addr_space), ""); - LLVMValueRef ret = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); - LLVMSetAlignment(ret, alignment); + + for (int chan = 0; chan < instr->num_components; chan++) { + index = LLVMConstInt(ctx->ac.i32, chan, 0); + derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, ""); + values[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, ""); + } + + ret = ac_build_gather_values(&ctx->ac, values, instr->num_components); return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), ""); } @@ -2989,6 +3121,32 @@ static LLVMValueRef visit_var_atomic(struct ac_nir_context *ctx, const nir_intri LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]); result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, src, src1, sync_scope); result = LLVMBuildExtractValue(ctx->ac.builder, result, 0, ""); + } else if (instr->intrinsic == nir_intrinsic_shared_atomic_fmin || + instr->intrinsic == nir_intrinsic_shared_atomic_fmax) { + const char *op = instr->intrinsic == nir_intrinsic_shared_atomic_fmin ? 
"fmin" : "fmax"; + char name[64], type[8]; + LLVMValueRef params[5]; + LLVMTypeRef src_type; + int arg_count = 0; + + src = ac_to_float(&ctx->ac, src); + src_type = LLVMTypeOf(src); + + LLVMTypeRef ptr_type = + LLVMPointerType(src_type, LLVMGetPointerAddressSpace(LLVMTypeOf(ptr))); + ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ptr_type, ""); + + params[arg_count++] = ptr; + params[arg_count++] = src; + params[arg_count++] = ctx->ac.i32_0; + params[arg_count++] = ctx->ac.i32_0; + params[arg_count++] = ctx->ac.i1false; + + ac_build_type_name_for_intr(src_type, type, sizeof(type)); + snprintf(name, sizeof(name), "llvm.amdgcn.ds.%s.%s", op, type); + + result = ac_build_intrinsic(&ctx->ac, name, src_type, params, arg_count, 0); + result = ac_to_integer(&ctx->ac, result); } else { LLVMAtomicRMWBinOp op; switch (instr->intrinsic) { @@ -3190,11 +3348,7 @@ static LLVMValueRef load_interpolated_input(struct ac_nir_context *ctx, LLVMValu /* Workaround for issue 2647: kill threads with infinite interpolation coeffs */ if (ctx->verified_interp && !_mesa_hash_table_search(ctx->verified_interp, interp_param)) { - LLVMValueRef args[2]; - args[0] = i; - args[1] = LLVMConstInt(ctx->ac.i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, false); - LLVMValueRef cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f32", ctx->ac.i1, args, 2, - AC_FUNC_ATTR_READNONE); + LLVMValueRef cond = ac_build_is_inf_or_nan(&ctx->ac, i); ac_build_kill_if_false(&ctx->ac, LLVMBuildNot(ctx->ac.builder, cond, "")); _mesa_hash_table_insert(ctx->verified_interp, interp_param, interp_param); } @@ -3286,22 +3440,22 @@ static LLVMValueRef visit_load(struct ac_nir_context *ctx, nir_intrinsic_instr * nir_intrinsic_io_semantics(instr).fb_fetch_output) return ctx->abi->emit_fbfetch(ctx->abi); - /* Other non-fragment cases have inputs and outputs in temporaries. */ - if (ctx->stage != MESA_SHADER_FRAGMENT) { - for (unsigned chan = component; chan < count + component; chan++) { - if (is_output) { - values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->abi->outputs[base * 4 + chan], ""); - } else { - values[chan] = ctx->abi->inputs[base * 4 + chan]; - if (!values[chan]) - values[chan] = LLVMGetUndef(ctx->ac.i32); - } - } + if (ctx->stage == MESA_SHADER_VERTEX && !is_output) + return ctx->abi->load_inputs(ctx->abi, base, component, count, 0, component_type); + + /* Other non-fragment cases have outputs in temporaries. */ + if (is_output && (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) { + assert(is_output); + + for (unsigned chan = component; chan < count + component; chan++) + values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->abi->outputs[base * 4 + chan], ""); + LLVMValueRef result = ac_build_varying_gather_values(&ctx->ac, values, count, component); return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, ""); } /* Fragment shader inputs. */ + assert(ctx->stage == MESA_SHADER_FRAGMENT); unsigned vertex_id = 2; /* P0 */ if (instr->intrinsic == nir_intrinsic_load_input_vertex) { @@ -3438,11 +3592,13 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins result = ctx->abi->load_local_group_size(ctx->abi); break; case nir_intrinsic_load_vertex_id: - result = LLVMBuildAdd(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->vertex_id), + result = LLVMBuildAdd(ctx->ac.builder, + ctx->vertex_id_replaced ? 
ctx->vertex_id_replaced : + ac_get_arg(&ctx->ac, ctx->args->vertex_id), ac_get_arg(&ctx->ac, ctx->args->base_vertex), ""); break; case nir_intrinsic_load_vertex_id_zero_base: { - result = ctx->abi->vertex_id; + result = ctx->vertex_id_replaced ? ctx->vertex_id_replaced : ctx->abi->vertex_id; break; } case nir_intrinsic_load_local_invocation_id: { @@ -3489,7 +3645,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins } else if (ctx->stage == MESA_SHADER_TESS_CTRL) { result = ac_get_arg(&ctx->ac, ctx->args->tcs_patch_id); } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - result = ac_get_arg(&ctx->ac, ctx->args->tes_patch_id); + result = ctx->tes_patch_id_replaced ? ctx->tes_patch_id_replaced + : ac_get_arg(&ctx->ac, ctx->args->tes_patch_id); } else fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage); break; @@ -3508,9 +3665,6 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins case nir_intrinsic_load_frag_shading_rate: result = emit_load_frag_shading_rate(ctx); break; - case nir_intrinsic_load_layer_id: - result = ctx->abi->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; - break; case nir_intrinsic_load_front_face: result = emit_i2b(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->front_face)); break; @@ -3531,7 +3685,7 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins result = ctx->abi->user_data; break; case nir_intrinsic_load_instance_id: - result = ctx->abi->instance_id; + result = ctx->instance_id_replaced ? ctx->instance_id_replaced : ctx->abi->instance_id; break; case nir_intrinsic_load_num_workgroups: result = ac_get_arg(&ctx->ac, ctx->args->num_work_groups); @@ -3565,6 +3719,7 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins case nir_intrinsic_load_ssbo: result = visit_load_buffer(ctx, instr); break; + case nir_intrinsic_load_global_constant: case nir_intrinsic_load_global: result = visit_load_global(ctx, instr); break; @@ -3581,6 +3736,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins case nir_intrinsic_global_atomic_xor: case nir_intrinsic_global_atomic_exchange: case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_global_atomic_fmin: + case nir_intrinsic_global_atomic_fmax: result = visit_global_atomic(ctx, instr); break; case nir_intrinsic_ssbo_atomic_add: @@ -3593,6 +3750,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins case nir_intrinsic_ssbo_atomic_xor: case nir_intrinsic_ssbo_atomic_exchange: case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmax: result = visit_atomic_ssbo(ctx, instr); break; case nir_intrinsic_load_ubo: @@ -3663,6 +3822,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_atomic_inc_wrap: case nir_intrinsic_image_deref_atomic_dec_wrap: + case nir_intrinsic_image_deref_atomic_fmin: + case nir_intrinsic_image_deref_atomic_fmax: result = visit_image_atomic(ctx, instr, false); break; case nir_intrinsic_bindless_image_size: @@ -3698,7 +3859,7 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins nir_variable_mode modes = nir_intrinsic_memory_modes(instr); unsigned wait_flags = 0; - if (modes & (nir_var_mem_global | nir_var_mem_ssbo)) + if (modes & (nir_var_mem_global | nir_var_mem_ssbo | nir_var_image)) wait_flags |= 
AC_WAIT_VLOAD | AC_WAIT_VSTORE; if (modes & nir_var_mem_shared) wait_flags |= AC_WAIT_LGKM; @@ -3725,7 +3886,9 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_shared_atomic_exchange: case nir_intrinsic_shared_atomic_comp_swap: - case nir_intrinsic_shared_atomic_fadd: { + case nir_intrinsic_shared_atomic_fadd: + case nir_intrinsic_shared_atomic_fmin: + case nir_intrinsic_shared_atomic_fmax: { LLVMValueRef ptr = get_memory_ptr(ctx, instr->src[0], instr->src[1].ssa->bit_size, 0); result = visit_var_atomic(ctx, instr, ptr, 1); break; @@ -3794,9 +3957,21 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins case nir_intrinsic_end_primitive_with_counter: ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr)); break; - case nir_intrinsic_load_tess_coord: - result = ctx->abi->load_tess_coord(ctx->abi); + case nir_intrinsic_load_tess_coord: { + LLVMValueRef coord[] = { + ctx->tes_u_replaced ? ctx->tes_u_replaced : ac_get_arg(&ctx->ac, ctx->args->tes_u), + ctx->tes_v_replaced ? ctx->tes_v_replaced : ac_get_arg(&ctx->ac, ctx->args->tes_v), + ctx->ac.f32_0, + }; + + /* For triangles, the vector should be (u, v, 1-u-v). */ + if (ctx->info->tess.primitive_mode == GL_TRIANGLES) { + coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, + LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), ""); + } + result = ac_build_gather_values(&ctx->ac, coord, 3); break; + } case nir_intrinsic_load_tess_level_outer: result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER, false); break; @@ -3816,7 +3991,8 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins if (ctx->stage == MESA_SHADER_TESS_CTRL) result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->tcs_rel_ids), 0, 8); else if (ctx->stage == MESA_SHADER_TESS_EVAL) - result = ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id); + result = ctx->tes_rel_patch_id_replaced ? 
ctx->tes_rel_patch_id_replaced + : ac_get_arg(&ctx->ac, ctx->args->tes_rel_patch_id); else unreachable("tess_rel_patch_id_amd is only supported by tessellation shaders"); break; @@ -4038,6 +4214,85 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins cache_policy); break; } + case nir_intrinsic_load_packed_passthrough_primitive_amd: + result = ac_get_arg(&ctx->ac, ctx->args->gs_vtx_offset[0]); + break; + case nir_intrinsic_load_initial_edgeflags_amd: + if (ctx->stage == MESA_SHADER_VERTEX && !ctx->info->vs.blit_sgprs_amd) + result = ac_pack_edgeflags_for_export(&ctx->ac, ctx->args); + else + result = ctx->ac.i32_0; + break; + case nir_intrinsic_has_input_vertex_amd: { + LLVMValueRef num = + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 0, 8); + result = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), num, ""); + break; + } + case nir_intrinsic_has_input_primitive_amd: { + LLVMValueRef num = + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->merged_wave_info), 8, 8); + result = LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), num, ""); + break; + } + case nir_intrinsic_load_workgroup_num_input_vertices_amd: + result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), 12, 9); + break; + case nir_intrinsic_load_workgroup_num_input_primitives_amd: + result = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->gs_tg_info), 22, 9); + break; + case nir_intrinsic_alloc_vertices_and_primitives_amd: + /* The caller should only call this conditionally for wave 0, so assume that the current + * wave is always wave 0. + */ + ac_build_sendmsg_gs_alloc_req(&ctx->ac, ctx->ac.i32_0, + get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1])); + break; + case nir_intrinsic_overwrite_vs_arguments_amd: + ctx->vertex_id_replaced = get_src(ctx, instr->src[0]); + ctx->instance_id_replaced = get_src(ctx, instr->src[1]); + break; + case nir_intrinsic_overwrite_tes_arguments_amd: + ctx->tes_u_replaced = get_src(ctx, instr->src[0]); + ctx->tes_v_replaced = get_src(ctx, instr->src[1]); + ctx->tes_rel_patch_id_replaced = get_src(ctx, instr->src[2]); + ctx->tes_patch_id_replaced = get_src(ctx, instr->src[3]); + break; + case nir_intrinsic_export_primitive_amd: { + struct ac_ngg_prim prim = {0}; + prim.passthrough = get_src(ctx, instr->src[0]); + ac_build_export_prim(&ctx->ac, &prim); + break; + } + case nir_intrinsic_export_vertex_amd: + ctx->abi->export_vertex(ctx->abi); + break; + case nir_intrinsic_elect: + result = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, visit_first_invocation(ctx), + ac_get_thread_id(&ctx->ac), ""); + break; + case nir_intrinsic_byte_permute_amd: + if (LLVM_VERSION_MAJOR < 13) { + assert("unimplemented byte_permute, LLVM 12 doesn't have amdgcn.perm"); + break; + } + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.perm", ctx->ac.i32, + (LLVMValueRef[]){get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2])}, + 3, AC_FUNC_ATTR_READNONE); + break; + case nir_intrinsic_lane_permute_16_amd: + result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.permlane16", ctx->ac.i32, + (LLVMValueRef[]){get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[0]), + get_src(ctx, instr->src[1]), + get_src(ctx, instr->src[2]), + ctx->ac.i1false, + ctx->ac.i1false}, + 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT); + break; default: fprintf(stderr, "Unknown intrinsic: "); nir_print_instr(&instr->instr, stderr); @@ -4254,7 +4509,7 @@ 
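/* Note on the open-coded load_tess_coord above: for triangle domains the
 * third barycentric coordinate is derived as w = 1 - u - v, so
 * u + v + w == 1 by construction; for other domains coord[2] stays 0.0 and
 * only (u, v) carry meaning. */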
static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, main_descriptor = AC_DESC_PLANE_0 + plane; } - if (instr->op == nir_texop_fragment_mask_fetch) { + if (instr->op == nir_texop_fragment_mask_fetch_amd) { /* The fragment mask is fetched from the compressed * multisampled surface. */ @@ -4491,7 +4746,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) && instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms && - instr->op != nir_texop_fragment_fetch && instr->op != nir_texop_fragment_mask_fetch) { + instr->op != nir_texop_fragment_fetch_amd && instr->op != nir_texop_fragment_mask_fetch_amd) { args.coords[2] = apply_round_slice(&ctx->ac, args.coords[2]); } @@ -4509,7 +4764,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) } /* Pack sample index */ - if (sample_index && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_fragment_fetch)) + if (sample_index && (instr->op == nir_texop_txf_ms || instr->op == nir_texop_fragment_fetch_amd)) args.coords[instr->coord_components] = sample_index; if (instr->op == nir_texop_samples_identical) { @@ -4528,8 +4783,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) if ((instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_MS) && - instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch && - instr->op != nir_texop_fragment_mask_fetch) { + instr->op != nir_texop_txs && instr->op != nir_texop_fragment_fetch_amd && + instr->op != nir_texop_fragment_mask_fetch_amd) { unsigned sample_chan = instr->is_array ? 3 : 2; args.coords[sample_chan] = adjust_sample_index_using_fmask( &ctx->ac, args.coords[0], args.coords[1], instr->is_array ? args.coords[2] : NULL, @@ -4572,7 +4827,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) * multisampled images and (x,y,layer) for 2D multisampled layered * images or for multisampled input attachments. */ - if (instr->op == nir_texop_fragment_mask_fetch) { + if (instr->op == nir_texop_fragment_mask_fetch_amd) { if (args.dim == ac_image_2dmsaa) { args.dim = ac_image_2d; } else { @@ -4611,6 +4866,14 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false); LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, ""); result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, ctx->ac.i32_1, ""); + } else if (instr->op == nir_texop_fragment_mask_fetch_amd) { + /* Use 0x76543210 if the image doesn't have FMASK. 
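* Each nibble i of that constant holds the value i, i.e.
 * ((0x76543210 >> (4 * i)) & 0xf) == i, so the sample-to-fragment mapping
 * degenerates to the identity when the surface has no real FMASK data.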
*/ + LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, args.resource, ctx->ac.v8i32, ""); + tmp = LLVMBuildExtractElement(ctx->ac.builder, tmp, ctx->ac.i32_1, ""); + tmp = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, tmp, ctx->ac.i32_0, ""); + result = LLVMBuildSelect(ctx->ac.builder, tmp, + LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""), + LLVMConstInt(ctx->ac.i32, 0x76543210, false), ""); } else if (nir_tex_instr_result_size(instr) != 4) result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components); @@ -5116,7 +5379,7 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi, ac_build_kill_if_false(&ctx.ac, LLVMBuildLoad(ctx.ac.builder, ctx.ac.postponed_kill, "")); if (!gl_shader_stage_is_compute(nir->info.stage)) - ctx.abi->emit_outputs(ctx.abi, AC_LLVM_MAX_OUTPUTS, ctx.abi->outputs); + ctx.abi->emit_outputs(ctx.abi); free(ctx.ssa_defs); ralloc_free(ctx.defs); diff --git a/mesa 3D driver/src/amd/llvm/ac_shader_abi.h b/mesa 3D driver/src/amd/llvm/ac_shader_abi.h index c3bfba35b5..941cc3ce58 100644 --- a/mesa 3D driver/src/amd/llvm/ac_shader_abi.h +++ b/mesa 3D driver/src/amd/llvm/ac_shader_abi.h @@ -58,17 +58,12 @@ struct ac_shader_abi { LLVMValueRef color0, color1; LLVMValueRef user_data; - /* For VS and PS: pre-loaded shader inputs. - * - * Currently only used for NIR shaders; indexed by variables' - * driver_location. - */ - LLVMValueRef *inputs; - /* Varying -> attribute number mapping. Also NIR-only */ unsigned fs_input_attr_indices[MAX_VARYING]; - void (*emit_outputs)(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); + void (*export_vertex)(struct ac_shader_abi *abi); + + void (*emit_outputs)(struct ac_shader_abi *abi); void (*emit_vertex)(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs); @@ -93,8 +88,6 @@ struct ac_shader_abi { LLVMValueRef src, unsigned writemask, unsigned component, unsigned location, unsigned driver_location); - LLVMValueRef (*load_tess_coord)(struct ac_shader_abi *abi); - LLVMValueRef (*load_patch_vertices_in)(struct ac_shader_abi *abi); LLVMValueRef (*load_ring_tess_offchip)(struct ac_shader_abi *abi); diff --git a/mesa 3D driver/src/amd/meson.build b/mesa 3D driver/src/amd/meson.build index 4a27c8c3e4..463c78544a 100644 --- a/mesa 3D driver/src/amd/meson.build +++ b/mesa 3D driver/src/amd/meson.build @@ -22,7 +22,11 @@ inc_amd = include_directories('.') subdir('addrlib') subdir('common') -subdir('llvm') +if with_llvm + subdir('llvm') +else + libamd_common_llvm = [] +endif if with_amd_vk subdir('compiler') subdir('vulkan') diff --git a/mesa 3D driver/src/amd/vulkan/layers/radv_sqtt_layer.c b/mesa 3D driver/src/amd/vulkan/layers/radv_sqtt_layer.c index 818caca944..6a4acda4d1 100644 --- a/mesa 3D driver/src/amd/vulkan/layers/radv_sqtt_layer.c +++ b/mesa 3D driver/src/amd/vulkan/layers/radv_sqtt_layer.c @@ -390,7 +390,7 @@ radv_handle_thread_trace(VkQueue _queue) if (frame_trigger || file_trigger || resize_trigger) { /* FIXME: SQTT on compute hangs. */ - if (queue->queue_family_index == RADV_QUEUE_COMPUTE) { + if (queue->vk.queue_family_index == RADV_QUEUE_COMPUTE) { fprintf(stderr, "RADV: Capturing a SQTT trace on the compute " "queue is currently broken and might hang! " "Please, disable presenting on compute if " @@ -624,12 +624,6 @@ sqtt_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo #define API_MARKER(cmd_name, ...) 
API_MARKER_ALIAS(cmd_name, cmd_name, __VA_ARGS__); -static bool -radv_sqtt_dump_pipeline() -{ - return getenv("RADV_THREAD_TRACE_PIPELINE"); -} - void sqtt_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline _pipeline) @@ -638,7 +632,7 @@ sqtt_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline API_MARKER(BindPipeline, commandBuffer, pipelineBindPoint, _pipeline); - if (radv_sqtt_dump_pipeline()) + if (radv_is_instruction_timing_enabled()) radv_describe_pipeline_bind(cmd_buffer, pipelineBindPoint, pipeline); } @@ -890,7 +884,7 @@ radv_add_code_object(struct radv_device *device, struct radv_pipeline *pipeline) } memcpy(code, shader->code_ptr, shader->code_size); - va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + va = radv_shader_variant_get_va(shader); record->shader_data[i].hash[0] = (uint64_t)(uintptr_t)shader; record->shader_data[i].hash[1] = (uint64_t)(uintptr_t)shader >> 32; @@ -898,6 +892,8 @@ radv_add_code_object(struct radv_device *device, struct radv_pipeline *pipeline) record->shader_data[i].code = code; record->shader_data[i].vgpr_count = shader->config.num_vgprs; record->shader_data[i].sgpr_count = shader->config.num_sgprs; + record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave; + record->shader_data[i].wavefront_size = shader->info.wave_size; record->shader_data[i].base_address = va & 0xffffffffffff; record->shader_data[i].elf_symbol_offset = 0; record->shader_data[i].hw_stage = radv_mesa_to_rgp_shader_stage(pipeline, i); @@ -933,7 +929,7 @@ radv_register_pipeline(struct radv_device *device, struct radv_pipeline *pipelin if (!shader) continue; - va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + va = radv_shader_variant_get_va(shader); base_va = MIN2(base_va, va); } @@ -1020,7 +1016,7 @@ sqtt_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, ui if (result != VK_SUCCESS) return result; - if (radv_sqtt_dump_pipeline()) { + if (radv_is_instruction_timing_enabled()) { for (unsigned i = 0; i < count; i++) { RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelines[i]); @@ -1056,7 +1052,7 @@ sqtt_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uin if (result != VK_SUCCESS) return result; - if (radv_sqtt_dump_pipeline()) { + if (radv_is_instruction_timing_enabled()) { for (unsigned i = 0; i < count; i++) { RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelines[i]); @@ -1089,7 +1085,7 @@ sqtt_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, if (!_pipeline) return; - if (radv_sqtt_dump_pipeline()) + if (radv_is_instruction_timing_enabled()) radv_unregister_pipeline(device, pipeline); radv_DestroyPipeline(_device, _pipeline, pAllocator); diff --git a/mesa 3D driver/src/amd/vulkan/meson.build b/mesa 3D driver/src/amd/vulkan/meson.build index f0e4a0a1ad..1d09248036 100644 --- a/mesa 3D driver/src/amd/vulkan/meson.build +++ b/mesa 3D driver/src/amd/vulkan/meson.build @@ -39,6 +39,7 @@ libradv_files = files( 'winsys/null/radv_null_winsys.c', 'winsys/null/radv_null_winsys_public.h', 'radv_acceleration_structure.c', + 'radv_acceleration_structure.h', 'radv_android.c', 'radv_cmd_buffer.c', 'radv_cs.h', @@ -49,7 +50,6 @@ libradv_files = files( 'radv_descriptor_set.h', 'radv_formats.c', 'radv_image.c', - 'radv_llvm_helper.cpp', 'radv_meta.c', 'radv_meta.h', 'radv_meta_blit.c', @@ -67,17 +67,16 @@ libradv_files = files( 'radv_meta_resolve_cs.c', 'radv_meta_resolve_fs.c', 'radv_nir_lower_ycbcr_textures.c', - 'radv_nir_to_llvm.c', 
'radv_pass.c', 'radv_pipeline.c', 'radv_pipeline_cache.c', + 'radv_pipeline_rt.c', 'radv_private.h', 'radv_radeon_winsys.h', 'radv_shader.c', 'radv_shader.h', 'radv_shader_args.c', 'radv_shader_args.h', - 'radv_shader_helper.h', 'radv_shader_info.c', 'radv_sqtt.c', 'radv_query.c', @@ -101,45 +100,35 @@ if not with_platform_windows ) endif +if with_llvm + libradv_files += files( + 'radv_llvm_helper.cpp', + 'radv_llvm_helper.h', + 'radv_nir_to_llvm.c', + ) +endif + radv_deps = [] radv_flags = cc.get_supported_arguments(['-Wimplicit-fallthrough', '-Wshadow']) if with_platform_x11 radv_deps += dep_xcb_dri3 - radv_flags += [ - '-DVK_USE_PLATFORM_XCB_KHR', - '-DVK_USE_PLATFORM_XLIB_KHR', - ] - libradv_files += files('radv_wsi_x11.c') endif if with_platform_wayland radv_deps += dep_wayland_client - radv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR' - libradv_files += files('radv_wsi_wayland.c') endif if system_has_kms_drm and not with_platform_android - radv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR' libradv_files += files('radv_wsi_display.c') endif if with_xlib_lease radv_deps += [dep_xlib_xrandr] - radv_flags += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT' endif if with_platform_android radv_deps += dep_android - radv_flags += [ - '-DVK_USE_PLATFORM_ANDROID_KHR' - ] -endif - -if with_platform_windows - radv_flags += [ - '-DVK_USE_PLATFORM_WIN32_KHR', - ] endif # When static linking LLVM, all its symbols are public API. @@ -159,15 +148,16 @@ libvulkan_radeon = shared_library( [libradv_files, radv_entrypoints, sha1_h], vs_module_defs : vulkan_radv_def, include_directories : [ - inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_amd, inc_amd_common, inc_amd_common_llvm, inc_compiler, inc_util, inc_vulkan_wsi, + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_amd, inc_amd_common, inc_amd_common_llvm, inc_compiler, inc_util, ], link_with : [ - libamd_common, libamd_common_llvm, libamdgpu_addrlib, libvulkan_wsi, + libamd_common, libamd_common_llvm, libamdgpu_addrlib, ], dependencies : [ dep_llvm, dep_libdrm_amdgpu, dep_thread, dep_elf, dep_dl, dep_m, dep_valgrind, radv_deps, idep_aco, - idep_mesautil, idep_nir, idep_vulkan_util, idep_amdgfxregs_h, idep_xmlconfig, + idep_mesautil, idep_nir, idep_vulkan_util, idep_vulkan_wsi, + idep_amdgfxregs_h, idep_xmlconfig, ], c_args : [no_override_init_args, radv_flags, c_msvc_compat_args], cpp_args : [radv_flags, cpp_msvc_compat_args], diff --git a/mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.c b/mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.c index 15d1a62f08..d438288638 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.c +++ b/mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.c @@ -20,68 +20,15 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ +#include "radv_acceleration_structure.h" #include "radv_private.h" +#include "util/format/format_utils.h" #include "util/half_float.h" #include "nir_builder.h" #include "radv_cs.h" #include "radv_meta.h" -struct radv_accel_struct_header { - uint32_t root_node_offset; - uint32_t reserved; - float aabb[2][3]; - uint64_t compacted_size; - uint64_t serialization_size; -}; - -struct radv_bvh_triangle_node { - float coords[3][3]; - uint32_t reserved[3]; - uint32_t triangle_id; - /* flags in upper 4 bits */ - uint32_t geometry_id_and_flags; - uint32_t reserved2; - uint32_t id; -}; - -struct radv_bvh_aabb_node { - float aabb[2][3]; - uint32_t primitive_id; - /* flags in upper 4 bits */ - uint32_t geometry_id_and_flags; - uint32_t reserved[8]; -}; - -struct radv_bvh_instance_node { - uint64_t base_ptr; - /* lower 24 bits are the custom instance index, upper 8 bits are the visibility mask */ - uint32_t custom_instance_and_mask; - /* lower 24 bits are the sbt offset, upper 8 bits are VkGeometryInstanceFlagsKHR */ - uint32_t sbt_offset_and_flags; - - /* The translation component is actually a pre-translation instead of a post-translation. If you - * want to get a proper matrix out of it you need to apply the directional component of the - * matrix to it. The pre-translation of the world->object matrix is the same as the - * post-translation of the object->world matrix so this way we can share data between both - * matrices. */ - float wto_matrix[12]; - float aabb[2][3]; - uint32_t instance_id; - uint32_t reserved[9]; -}; - -struct radv_bvh_box16_node { - uint32_t children[4]; - uint32_t coords[4][3]; -}; - -struct radv_bvh_box32_node { - uint32_t children[4]; - float coords[4][2][3]; - uint32_t reserved[4]; -}; - void radv_GetAccelerationStructureBuildSizesKHR( VkDevice _device, VkAccelerationStructureBuildTypeKHR buildType, @@ -90,6 +37,12 @@ radv_GetAccelerationStructureBuildSizesKHR( { uint64_t triangles = 0, boxes = 0, instances = 0; + STATIC_ASSERT(sizeof(struct radv_bvh_triangle_node) == 64); + STATIC_ASSERT(sizeof(struct radv_bvh_aabb_node) == 64); + STATIC_ASSERT(sizeof(struct radv_bvh_instance_node) == 128); + STATIC_ASSERT(sizeof(struct radv_bvh_box16_node) == 64); + STATIC_ASSERT(sizeof(struct radv_bvh_box32_node) == 128); + for (uint32_t i = 0; i < pBuildInfo->geometryCount; ++i) { const VkAccelerationStructureGeometryKHR *geometry; if (pBuildInfo->pGeometries) @@ -144,7 +97,7 @@ radv_CreateAccelerationStructureKHR(VkDevice _device, accel = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (accel == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &accel->base, VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR); @@ -192,7 +145,7 @@ radv_WriteAccelerationStructuresPropertiesKHR( RADV_FROM_HANDLE(radv_acceleration_structure, accel, pAccelerationStructures[i]); const char *base_ptr = (const char *)device->ws->buffer_map(accel->bo); if (!base_ptr) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); const struct radv_accel_struct_header *header = (const void*)(base_ptr + accel->mem_offset); if (stride * i + sizeof(VkDeviceSize) <= dataSize) { @@ -268,6 +221,12 @@ build_triangles(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeo const char *v_data = (const char *)tri_data->vertexData.hostAddress + v_index * tri_data->vertexStride; float coords[4]; switch 
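/* Decode rules for the formats handled below, per Mesa's shared helpers:
 * a 16-bit SNORM value x maps to clamp(x / 32767.0, -1.0, 1.0), a 16-bit
 * UNORM value u maps to u / 65535.0, and the new two-component formats
 * fill in z = 0.0, w = 1.0. */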
(tri_data->vertexFormat) { + case VK_FORMAT_R32G32_SFLOAT: + coords[0] = *(const float *)(v_data + 0); + coords[1] = *(const float *)(v_data + 4); + coords[2] = 0.0f; + coords[3] = 1.0f; + break; case VK_FORMAT_R32G32B32_SFLOAT: coords[0] = *(const float *)(v_data + 0); coords[1] = *(const float *)(v_data + 4); @@ -280,6 +239,12 @@ build_triangles(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeo coords[2] = *(const float *)(v_data + 8); coords[3] = *(const float *)(v_data + 12); break; + case VK_FORMAT_R16G16_SFLOAT: + coords[0] = _mesa_half_to_float(*(const uint16_t *)(v_data + 0)); + coords[1] = _mesa_half_to_float(*(const uint16_t *)(v_data + 2)); + coords[2] = 0.0f; + coords[3] = 1.0f; + break; case VK_FORMAT_R16G16B16_SFLOAT: coords[0] = _mesa_half_to_float(*(const uint16_t *)(v_data + 0)); coords[1] = _mesa_half_to_float(*(const uint16_t *)(v_data + 2)); @@ -292,6 +257,24 @@ build_triangles(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeo coords[2] = _mesa_half_to_float(*(const uint16_t *)(v_data + 4)); coords[3] = _mesa_half_to_float(*(const uint16_t *)(v_data + 6)); break; + case VK_FORMAT_R16G16_SNORM: + coords[0] = _mesa_snorm_to_float(*(const int16_t *)(v_data + 0), 16); + coords[1] = _mesa_snorm_to_float(*(const int16_t *)(v_data + 2), 16); + coords[2] = 0.0f; + coords[3] = 1.0f; + break; + case VK_FORMAT_R16G16B16A16_SNORM: + coords[0] = _mesa_snorm_to_float(*(const int16_t *)(v_data + 0), 16); + coords[1] = _mesa_snorm_to_float(*(const int16_t *)(v_data + 2), 16); + coords[2] = _mesa_snorm_to_float(*(const int16_t *)(v_data + 4), 16); + coords[3] = _mesa_snorm_to_float(*(const int16_t *)(v_data + 6), 16); + break; + case VK_FORMAT_R16G16B16A16_UNORM: + coords[0] = _mesa_unorm_to_float(*(const uint16_t *)(v_data + 0), 16); + coords[1] = _mesa_unorm_to_float(*(const uint16_t *)(v_data + 2), 16); + coords[2] = _mesa_unorm_to_float(*(const uint16_t *)(v_data + 4), 16); + coords[3] = _mesa_unorm_to_float(*(const uint16_t *)(v_data + 6), 16); + break; default: unreachable("Unhandled vertex format in BVH build"); } @@ -348,11 +331,15 @@ build_instances(struct radv_device *device, struct radv_bvh_build_ctx *ctx, instance->instanceShaderBindingTableRecordOffset | (instance->flags << 24); node->instance_id = p; + for (unsigned i = 0; i < 3; ++i) + for (unsigned j = 0; j < 3; ++j) + node->otw_matrix[i * 3 + j] = instance->transform.matrix[j][i]; + RADV_FROM_HANDLE(radv_acceleration_structure, src_accel_struct, (VkAccelerationStructureKHR)instance->accelerationStructureReference); const void *src_base = device->ws->buffer_map(src_accel_struct->bo); if (!src_base) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); src_base = (const char *)src_base + src_accel_struct->mem_offset; const struct radv_accel_struct_header *src_header = src_base; @@ -382,7 +369,7 @@ build_aabbs(struct radv_bvh_build_ctx *ctx, const VkAccelerationStructureGeometr for (uint32_t p = 0; p < range->primitiveCount; ++p, ctx->curr_ptr += 64) { struct radv_bvh_aabb_node *node = (void*)ctx->curr_ptr; uint32_t node_offset = ctx->curr_ptr - ctx->base; - uint32_t node_id = (node_offset >> 3) | 6; + uint32_t node_id = (node_offset >> 3) | 7; *ctx->write_scratch++ = node_id; const VkAabbPositionsKHR *aabb = @@ -461,6 +448,77 @@ compute_bounds(const char *base_ptr, uint32_t node_id, float *bounds) } } +struct bvh_opt_entry { + uint64_t key; + uint32_t node_id; +}; + +static int +bvh_opt_compare(const void *_a, const void *_b) 
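/* qsort() is not guaranteed to be stable, so ties on the Morton key fall
 * through to a node-id comparison below; this keeps the leaf order
 * deterministic for equal keys. */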
+{ + const struct bvh_opt_entry *a = _a; + const struct bvh_opt_entry *b = _b; + + if (a->key < b->key) + return -1; + if (a->key > b->key) + return 1; + if (a->node_id < b->node_id) + return -1; + if (a->node_id > b->node_id) + return 1; + return 0; +} + +static void +optimize_bvh(const char *base_ptr, uint32_t *node_ids, uint32_t node_count) +{ + float bounds[6]; + for (unsigned i = 0; i < 3; ++i) + bounds[i] = INFINITY; + for (unsigned i = 0; i < 3; ++i) + bounds[3 + i] = -INFINITY; + + for (uint32_t i = 0; i < node_count; ++i) { + float node_bounds[6]; + compute_bounds(base_ptr, node_ids[i], node_bounds); + for (unsigned j = 0; j < 3; ++j) + bounds[j] = MIN2(bounds[j], node_bounds[j]); + for (unsigned j = 0; j < 3; ++j) + bounds[3 + j] = MAX2(bounds[3 + j], node_bounds[3 + j]); + } + + struct bvh_opt_entry *entries = calloc(node_count, sizeof(struct bvh_opt_entry)); + if (!entries) + return; + + for (uint32_t i = 0; i < node_count; ++i) { + float node_bounds[6]; + compute_bounds(base_ptr, node_ids[i], node_bounds); + float node_coords[3]; + for (unsigned j = 0; j < 3; ++j) + node_coords[j] = (node_bounds[j] + node_bounds[3 + j]) * 0.5; + int32_t coords[3]; + for (unsigned j = 0; j < 3; ++j) + coords[j] = MAX2( + MIN2((int32_t)((node_coords[j] - bounds[j]) / (bounds[3 + j] - bounds[j]) * (1 << 21)), + (1 << 21) - 1), + 0); + uint64_t key = 0; + for (unsigned j = 0; j < 21; ++j) + for (unsigned k = 0; k < 3; ++k) + key |= (uint64_t)((coords[k] >> j) & 1) << (j * 3 + k); + entries[i].key = key; + entries[i].node_id = node_ids[i]; + } + + qsort(entries, node_count, sizeof(entries[0]), bvh_opt_compare); + for (unsigned i = 0; i < node_count; ++i) + node_ids[i] = entries[i].node_id; + + free(entries); +} + static VkResult build_bvh(struct radv_device *device, const VkAccelerationStructureBuildGeometryInfoKHR *info, const VkAccelerationStructureBuildRangeInfoKHR *ranges) @@ -474,7 +532,7 @@ build_bvh(struct radv_device *device, const VkAccelerationStructureBuildGeometry char *base_ptr = (char*)device->ws->buffer_map(accel->bo); if (!base_ptr) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); base_ptr = base_ptr + accel->mem_offset; struct radv_accel_struct_header *header = (void*)base_ptr; @@ -484,30 +542,42 @@ build_bvh(struct radv_device *device, const VkAccelerationStructureBuildGeometry .base = base_ptr, .curr_ptr = (char *)first_node_ptr + 128}; - /* This initializes the leaf nodes of the BVH all at the same level. */ - for (uint32_t i = 0; i < info->geometryCount; ++i) { - const VkAccelerationStructureGeometryKHR *geom = - info->pGeometries ? &info->pGeometries[i] : info->ppGeometries[i]; + uint64_t instance_offset = (const char *)ctx.curr_ptr - (const char *)base_ptr; + uint64_t instance_count = 0; - switch (geom->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - build_triangles(&ctx, geom, ranges + i, i); - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - build_aabbs(&ctx, geom, ranges + i, i); - break; - case VK_GEOMETRY_TYPE_INSTANCES_KHR: { - result = build_instances(device, &ctx, geom, ranges + i); - if (result != VK_SUCCESS) - goto fail; - break; - } - case VK_GEOMETRY_TYPE_MAX_ENUM_KHR: - unreachable("VK_GEOMETRY_TYPE_MAX_ENUM_KHR unhandled"); + /* This initializes the leaf nodes of the BVH all at the same level. */ + for (int inst = 1; inst >= 0; --inst) { + for (uint32_t i = 0; i < info->geometryCount; ++i) { + const VkAccelerationStructureGeometryKHR *geom = + info->pGeometries ? 
&info->pGeometries[i] : info->ppGeometries[i]; + + if ((inst && geom->geometryType != VK_GEOMETRY_TYPE_INSTANCES_KHR) || + (!inst && geom->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR)) + continue; + + switch (geom->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + build_triangles(&ctx, geom, ranges + i, i); + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + build_aabbs(&ctx, geom, ranges + i, i); + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: { + result = build_instances(device, &ctx, geom, ranges + i); + if (result != VK_SUCCESS) + goto fail; + + instance_count += ranges[i].primitiveCount; + break; + } + case VK_GEOMETRY_TYPE_MAX_ENUM_KHR: + unreachable("VK_GEOMETRY_TYPE_MAX_ENUM_KHR unhandled"); + } } } uint32_t node_counts[2] = {ctx.write_scratch - scratch[0], 0}; + optimize_bvh(base_ptr, scratch[0], node_counts[0]); unsigned d; /* @@ -569,7 +639,19 @@ build_bvh(struct radv_device *device, const VkAccelerationStructureBuildGeometry compute_bounds(base_ptr, header->root_node_offset, &header->aabb[0][0]); - /* TODO init sizes and figure out what is needed for serialization. */ + header->instance_offset = instance_offset; + header->instance_count = instance_count; + header->compacted_size = (char *)ctx.curr_ptr - base_ptr; + + /* 16 bytes per invocation, 64 invocations per workgroup */ + header->copy_dispatch_size[0] = DIV_ROUND_UP(header->compacted_size, 16 * 64); + header->copy_dispatch_size[1] = 1; + header->copy_dispatch_size[2] = 1; + + header->serialization_size = + header->compacted_size + align(sizeof(struct radv_accel_struct_serialization_header) + + sizeof(uint64_t) * header->instance_count, + 128); fail: device->ws->buffer_unmap(accel->bo); @@ -593,6 +675,35 @@ radv_BuildAccelerationStructuresKHR( return result; } +VkResult +radv_CopyAccelerationStructureKHR(VkDevice _device, VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureInfoKHR *pInfo) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_acceleration_structure, src_struct, pInfo->src); + RADV_FROM_HANDLE(radv_acceleration_structure, dst_struct, pInfo->dst); + + char *src_ptr = (char *)device->ws->buffer_map(src_struct->bo); + if (!src_ptr) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + char *dst_ptr = (char *)device->ws->buffer_map(dst_struct->bo); + if (!dst_ptr) { + device->ws->buffer_unmap(src_struct->bo); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + src_ptr += src_struct->mem_offset; + dst_ptr += dst_struct->mem_offset; + + const struct radv_accel_struct_header *header = (const void *)src_ptr; + memcpy(dst_ptr, src_ptr, header->compacted_size); + + device->ws->buffer_unmap(src_struct->bo); + device->ws->buffer_unmap(dst_struct->bo); + return VK_SUCCESS; +} + static nir_ssa_def * get_indices(nir_builder *b, nir_ssa_def *addr, nir_ssa_def *type, nir_ssa_def *id) { @@ -657,10 +768,9 @@ get_vertices(nir_builder *b, nir_ssa_def *addresses, nir_ssa_def *format, nir_ss nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "vertex2")}; VkFormat formats[] = { - VK_FORMAT_R32G32B32_SFLOAT, - VK_FORMAT_R32G32B32A32_SFLOAT, - VK_FORMAT_R16G16B16_SFLOAT, - VK_FORMAT_R16G16B16A16_SFLOAT, + VK_FORMAT_R32G32B32_SFLOAT, VK_FORMAT_R32G32B32A32_SFLOAT, VK_FORMAT_R16G16B16_SFLOAT, + VK_FORMAT_R16G16B16A16_SFLOAT, VK_FORMAT_R16G16_SFLOAT, VK_FORMAT_R32G32_SFLOAT, + VK_FORMAT_R16G16_SNORM, VK_FORMAT_R16G16B16A16_SNORM, VK_FORMAT_R16G16B16A16_UNORM, }; for (unsigned f = 0; f < ARRAY_SIZE(formats); ++f) { @@ -676,15 +786,47 @@ get_vertices(nir_builder *b, nir_ssa_def *addresses, 
nir_ssa_def *format, nir_ss .align_mul = 4, .align_offset = 0), 7); break; + case VK_FORMAT_R32G32_SFLOAT: + case VK_FORMAT_R16G16_SFLOAT: case VK_FORMAT_R16G16B16_SFLOAT: - case VK_FORMAT_R16G16B16A16_SFLOAT: { + case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R16G16_SNORM: + case VK_FORMAT_R16G16B16A16_SNORM: + case VK_FORMAT_R16G16B16A16_UNORM: { + unsigned components = MIN2(3, vk_format_get_nr_components(formats[f])); + unsigned comp_bits = + vk_format_get_blocksizebits(formats[f]) / vk_format_get_nr_components(formats[f]); + unsigned comp_bytes = comp_bits / 8; nir_ssa_def *values[3]; nir_ssa_def *addr = nir_channel(b, addresses, i); - for (unsigned j = 0; j < 3; ++j) - values[j] = - nir_build_load_global(b, 1, 16, nir_iadd(b, addr, nir_imm_int64(b, j * 2)), - .align_mul = 2, .align_offset = 0); - nir_store_var(b, results[i], nir_f2f32(b, nir_vec(b, values, 3)), 7); + for (unsigned j = 0; j < components; ++j) + values[j] = nir_build_load_global( + b, 1, comp_bits, nir_iadd(b, addr, nir_imm_int64(b, j * comp_bytes)), + .align_mul = comp_bytes, .align_offset = 0); + + for (unsigned j = components; j < 3; ++j) + values[j] = nir_imm_intN_t(b, 0, comp_bits); + + nir_ssa_def *vec; + if (util_format_is_snorm(vk_format_to_pipe_format(formats[f]))) { + for (unsigned j = 0; j < 3; ++j) { + values[j] = nir_fdiv(b, nir_i2f32(b, values[j]), + nir_imm_float(b, (1u << (comp_bits - 1)) - 1)); + values[j] = nir_fmax(b, values[j], nir_imm_float(b, -1.0)); + } + vec = nir_vec(b, values, 3); + } else if (util_format_is_unorm(vk_format_to_pipe_format(formats[f]))) { + for (unsigned j = 0; j < 3; ++j) { + values[j] = + nir_fdiv(b, nir_u2f32(b, values[j]), nir_imm_float(b, (1u << comp_bits) - 1)); + values[j] = nir_fmin(b, values[j], nir_imm_float(b, 1.0)); + } + vec = nir_vec(b, values, 3); + } else if (comp_bits == 16) + vec = nir_f2f32(b, nir_vec(b, values, 3)); + else + vec = nir_vec(b, values, 3); + nir_store_var(b, results[i], vec, 7); break; } default: @@ -721,6 +863,7 @@ struct build_primitive_constants { }; struct { uint64_t instance_data; + uint32_t array_of_pointers; }; struct { uint64_t aabb_addr; @@ -918,9 +1061,25 @@ build_leaf_shader(struct radv_device *dev) nir_push_else(&b, NULL); { /* Instances */ - nir_ssa_def *instance_addr = - nir_iadd(&b, nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3)), - nir_u2u64(&b, nir_imul(&b, global_id, nir_imm_int(&b, 64)))); + nir_variable *instance_addr_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint64_t_type(), "instance_addr"); + nir_push_if(&b, nir_ine(&b, nir_channel(&b, pconst2, 2), nir_imm_int(&b, 0))); + { + nir_ssa_def *ptr = nir_iadd(&b, nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3)), + nir_u2u64(&b, nir_imul(&b, global_id, nir_imm_int(&b, 8)))); + nir_ssa_def *addr = nir_pack_64_2x32( + &b, nir_build_load_global(&b, 2, 32, ptr, .align_mul = 8, .align_offset = 0)); + nir_store_var(&b, instance_addr_var, addr, 1); + } + nir_push_else(&b, NULL); + { + nir_ssa_def *addr = nir_iadd(&b, nir_pack_64_2x32(&b, nir_channels(&b, pconst2, 3)), + nir_u2u64(&b, nir_imul(&b, global_id, nir_imm_int(&b, 64)))); + nir_store_var(&b, instance_addr_var, addr, 1); + } + nir_pop_if(&b, NULL); + nir_ssa_def *instance_addr = nir_load_var(&b, instance_addr_var); + nir_ssa_def *inst_transform[] = { nir_build_load_global(&b, 4, 32, nir_iadd(&b, instance_addr, nir_imm_int64(&b, 0)), .align_mul = 4, .align_offset = 0), @@ -977,6 +1136,17 @@ build_leaf_shader(struct radv_device *dev) nir_store_var(&b, bounds[0], nir_vec(&b, bound_defs[0], 3), 7); 
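/* The optimize_bvh pass earlier in this patch builds a 63-bit Morton key
 * per leaf: each centroid axis is normalized against the scene bounds,
 * quantized to 21 bits, and the x/y/z bits are interleaved so that sorting
 * by key approximates a space-filling-curve ordering. A standalone sketch
 * of that key construction (the helper name is ours, not from the patch):
 */
static uint64_t
morton_key_3x21(const float centroid[3], const float bounds_min[3],
                const float bounds_max[3])
{
   uint64_t key = 0;
   for (unsigned axis = 0; axis < 3; ++axis) {
      float norm = (centroid[axis] - bounds_min[axis]) /
                   (bounds_max[axis] - bounds_min[axis]);
      int32_t q = (int32_t)(norm * (1 << 21));
      q = MAX2(MIN2(q, (1 << 21) - 1), 0); /* same clamp as optimize_bvh */
      for (unsigned bit = 0; bit < 21; ++bit)
         key |= (uint64_t)((q >> bit) & 1) << (bit * 3 + axis);
   }
   return key;
}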
nir_store_var(&b, bounds[1], nir_vec(&b, bound_defs[1], 3), 7); + /* Store object to world matrix */ + for (unsigned i = 0; i < 3; ++i) { + nir_ssa_def *vals[3]; + for (unsigned j = 0; j < 3; ++j) + vals[j] = nir_channel(&b, inst_transform[j], i); + + nir_build_store_global(&b, nir_vec(&b, vals, 3), + nir_iadd(&b, node_dst_addr, nir_imm_int64(&b, 92 + 12 * i)), + .write_mask = 0x7, .align_mul = 4, .align_offset = 0); + } + nir_ssa_def *m_in[3][3], *m_out[3][3], *m_vec[3][4]; for (unsigned i = 0; i < 3; ++i) for (unsigned j = 0; j < 3; ++j) @@ -1187,14 +1357,276 @@ build_internal_shader(struct radv_device *dev) return b.shader; } +enum copy_mode { + COPY_MODE_COPY, + COPY_MODE_SERIALIZE, + COPY_MODE_DESERIALIZE, +}; + +struct copy_constants { + uint64_t src_addr; + uint64_t dst_addr; + uint32_t mode; +}; + +static nir_shader * +build_copy_shader(struct radv_device *dev) +{ + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "accel_copy"); + b.shader->info.workgroup_size[0] = 64; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + + nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); + nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); + nir_ssa_def *block_size = + nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], + b.shader->info.workgroup_size[2], 0); + + nir_ssa_def *global_id = + nir_channel(&b, nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id), 0); + + nir_variable *offset_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "offset"); + nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16)); + nir_store_var(&b, offset_var, offset, 1); + + nir_ssa_def *increment = nir_imul(&b, nir_channel(&b, nir_load_num_workgroups(&b, 32), 0), + nir_imm_int(&b, b.shader->info.workgroup_size[0] * 16)); + + nir_ssa_def *pconst0 = + nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .base = 0, .range = 16); + nir_ssa_def *pconst1 = + nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .base = 16, .range = 4); + nir_ssa_def *src_base_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 3)); + nir_ssa_def *dst_base_addr = nir_pack_64_2x32(&b, nir_channels(&b, pconst0, 0xc)); + nir_ssa_def *mode = nir_channel(&b, pconst1, 0); + + nir_variable *compacted_size_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint64_t_type(), "compacted_size"); + nir_variable *src_offset_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "src_offset"); + nir_variable *dst_offset_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "dst_offset"); + nir_variable *instance_offset_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "instance_offset"); + nir_variable *instance_count_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "instance_count"); + nir_variable *value_var = + nir_variable_create(b.shader, nir_var_shader_temp, glsl_vec4_type(), "value"); + + nir_push_if(&b, nir_ieq(&b, mode, nir_imm_int(&b, COPY_MODE_SERIALIZE))); + { + nir_ssa_def *instance_count = nir_build_load_global( + &b, 1, 32, + nir_iadd(&b, src_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, instance_count))), + .align_mul = 4, .align_offset = 0); + nir_ssa_def *compacted_size = nir_build_load_global( + &b, 1, 64, + nir_iadd(&b, src_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, compacted_size))), + .align_mul = 8, .align_offset = 0); + 
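/* For orientation, the blob this SERIALIZE path emits is laid out as
 * follows (struct definitions come from radv_acceleration_structure.h added
 * by this patch; the names here are illustrative):
 *
 *   instance_table = sizeof(struct radv_accel_struct_serialization_header)
 *   bvh_data       = instance_table + instance_count * sizeof(uint64_t)
 *   total          = bvh_data + compacted_size
 *
 * total is <= header->serialization_size, which additionally pads the
 * header-plus-table prefix to a 128-byte multiple. */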
nir_ssa_def *serialization_size = nir_build_load_global( + &b, 1, 64, + nir_iadd(&b, src_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, serialization_size))), + .align_mul = 8, .align_offset = 0); + + nir_store_var(&b, compacted_size_var, compacted_size, 1); + nir_store_var( + &b, instance_offset_var, + nir_build_load_global( + &b, 1, 32, + nir_iadd(&b, src_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, instance_offset))), + .align_mul = 4, .align_offset = 0), + 1); + nir_store_var(&b, instance_count_var, instance_count, 1); + + nir_ssa_def *dst_offset = + nir_iadd(&b, nir_imm_int(&b, sizeof(struct radv_accel_struct_serialization_header)), + nir_imul(&b, instance_count, nir_imm_int(&b, sizeof(uint64_t)))); + nir_store_var(&b, src_offset_var, nir_imm_int(&b, 0), 1); + nir_store_var(&b, dst_offset_var, dst_offset, 1); + + nir_push_if(&b, nir_ieq(&b, global_id, nir_imm_int(&b, 0))); + { + nir_build_store_global( + &b, serialization_size, + nir_iadd(&b, dst_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_serialization_header, + serialization_size))), + .write_mask = 0x1, .align_mul = 8, .align_offset = 0); + nir_build_store_global( + &b, compacted_size, + nir_iadd(&b, dst_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_serialization_header, + compacted_size))), + .write_mask = 0x1, .align_mul = 8, .align_offset = 0); + nir_build_store_global( + &b, nir_u2u64(&b, instance_count), + nir_iadd(&b, dst_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_serialization_header, + instance_count))), + .write_mask = 0x1, .align_mul = 8, .align_offset = 0); + } + nir_pop_if(&b, NULL); + } + nir_push_else(&b, NULL); + nir_push_if(&b, nir_ieq(&b, mode, nir_imm_int(&b, COPY_MODE_DESERIALIZE))); + { + nir_ssa_def *instance_count = nir_build_load_global( + &b, 1, 32, + nir_iadd(&b, src_base_addr, + nir_imm_int64( + &b, offsetof(struct radv_accel_struct_serialization_header, instance_count))), + .align_mul = 4, .align_offset = 0); + nir_ssa_def *src_offset = + nir_iadd(&b, nir_imm_int(&b, sizeof(struct radv_accel_struct_serialization_header)), + nir_imul(&b, instance_count, nir_imm_int(&b, sizeof(uint64_t)))); + + nir_ssa_def *header_addr = nir_iadd(&b, src_base_addr, nir_u2u64(&b, src_offset)); + nir_store_var( + &b, compacted_size_var, + nir_build_load_global( + &b, 1, 64, + nir_iadd(&b, header_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, compacted_size))), + .align_mul = 8, .align_offset = 0), + 1); + nir_store_var( + &b, instance_offset_var, + nir_build_load_global( + &b, 1, 32, + nir_iadd(&b, header_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, instance_offset))), + .align_mul = 4, .align_offset = 0), + 1); + nir_store_var(&b, instance_count_var, instance_count, 1); + nir_store_var(&b, src_offset_var, src_offset, 1); + nir_store_var(&b, dst_offset_var, nir_imm_int(&b, 0), 1); + } + nir_push_else(&b, NULL); /* COPY_MODE_COPY */ + { + nir_store_var( + &b, compacted_size_var, + nir_build_load_global( + &b, 1, 64, + nir_iadd(&b, src_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, compacted_size))), + .align_mul = 8, .align_offset = 0), + 1); + + nir_store_var(&b, src_offset_var, nir_imm_int(&b, 0), 1); + nir_store_var(&b, dst_offset_var, nir_imm_int(&b, 0), 1); + nir_store_var(&b, instance_offset_var, nir_imm_int(&b, 0), 1); + nir_store_var(&b, instance_count_var, nir_imm_int(&b, 0), 1); + } + nir_pop_if(&b, NULL); + nir_pop_if(&b, NULL); + + 
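/* The loop below is a grid-stride copy: each invocation moves one 16-byte
 * vec4 per iteration, and the indirect dispatch is sized to match (16 bytes
 * per invocation, 64 invocations per workgroup), e.g. a 1 MiB BVH needs
 * DIV_ROUND_UP(1048576, 16 * 64) = 1024 workgroups in x. The instance
 * fix-ups further down rely on the build emitting all instance nodes as one
 * contiguous run starting at header->instance_offset. */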
nir_ssa_def *instance_bound = + nir_imul(&b, nir_imm_int(&b, sizeof(struct radv_bvh_instance_node)), + nir_load_var(&b, instance_count_var)); + nir_ssa_def *compacted_size = nir_build_load_global( + &b, 1, 32, + nir_iadd(&b, src_base_addr, + nir_imm_int64(&b, offsetof(struct radv_accel_struct_header, compacted_size))), + .align_mul = 4, .align_offset = 0); + + nir_push_loop(&b); + { + offset = nir_load_var(&b, offset_var); + nir_push_if(&b, nir_ilt(&b, offset, compacted_size)); + { + nir_ssa_def *src_offset = nir_iadd(&b, offset, nir_load_var(&b, src_offset_var)); + nir_ssa_def *dst_offset = nir_iadd(&b, offset, nir_load_var(&b, dst_offset_var)); + nir_ssa_def *src_addr = nir_iadd(&b, src_base_addr, nir_u2u64(&b, src_offset)); + nir_ssa_def *dst_addr = nir_iadd(&b, dst_base_addr, nir_u2u64(&b, dst_offset)); + + nir_ssa_def *value = + nir_build_load_global(&b, 4, 32, src_addr, .align_mul = 16, .align_offset = 0); + nir_store_var(&b, value_var, value, 0xf); + + nir_ssa_def *instance_offset = nir_isub(&b, offset, nir_load_var(&b, instance_offset_var)); + nir_ssa_def *in_instance_bound = + nir_iand(&b, nir_uge(&b, offset, nir_load_var(&b, instance_offset_var)), + nir_ult(&b, instance_offset, instance_bound)); + nir_ssa_def *instance_start = + nir_ieq(&b, + nir_iand(&b, instance_offset, + nir_imm_int(&b, sizeof(struct radv_bvh_instance_node) - 1)), + nir_imm_int(&b, 0)); + + nir_push_if(&b, nir_iand(&b, in_instance_bound, instance_start)); + { + nir_ssa_def *instance_id = nir_ushr(&b, instance_offset, nir_imm_int(&b, 7)); + + nir_push_if(&b, nir_ieq(&b, mode, nir_imm_int(&b, COPY_MODE_SERIALIZE))); + { + nir_ssa_def *instance_addr = + nir_imul(&b, instance_id, nir_imm_int(&b, sizeof(uint64_t))); + instance_addr = + nir_iadd(&b, instance_addr, + nir_imm_int(&b, sizeof(struct radv_accel_struct_serialization_header))); + instance_addr = nir_iadd(&b, dst_base_addr, nir_u2u64(&b, instance_addr)); + + nir_build_store_global(&b, nir_channels(&b, value, 3), instance_addr, + .write_mask = 3, .align_mul = 8, .align_offset = 0); + } + nir_push_else(&b, NULL); + { + nir_ssa_def *instance_addr = + nir_imul(&b, instance_id, nir_imm_int(&b, sizeof(uint64_t))); + instance_addr = + nir_iadd(&b, instance_addr, + nir_imm_int(&b, sizeof(struct radv_accel_struct_serialization_header))); + instance_addr = nir_iadd(&b, src_base_addr, nir_u2u64(&b, instance_addr)); + + nir_ssa_def *instance_value = nir_build_load_global( + &b, 2, 32, instance_addr, .align_mul = 8, .align_offset = 0); + + nir_ssa_def *values[] = { + nir_channel(&b, instance_value, 0), + nir_channel(&b, instance_value, 1), + nir_channel(&b, value, 2), + nir_channel(&b, value, 3), + }; + + nir_store_var(&b, value_var, nir_vec(&b, values, 4), 0xf); + } + nir_pop_if(&b, NULL); + } + nir_pop_if(&b, NULL); + + nir_store_var(&b, offset_var, nir_iadd(&b, offset, increment), 1); + + nir_build_store_global(&b, nir_load_var(&b, value_var), dst_addr, .write_mask = 0xf, + .align_mul = 16, .align_offset = 0); + } + nir_push_else(&b, NULL); + { + nir_jump(&b, nir_jump_break); + } + nir_pop_if(&b, NULL); + } + nir_pop_loop(&b, NULL); + return b.shader; +} + void radv_device_finish_accel_struct_build_state(struct radv_device *device) { struct radv_meta_state *state = &device->meta_state; + radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.copy_pipeline, + &state->alloc); radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.internal_pipeline, &state->alloc); radv_DestroyPipeline(radv_device_to_handle(device), 
state->accel_struct_build.leaf_pipeline, &state->alloc); + radv_DestroyPipelineLayout(radv_device_to_handle(device), + state->accel_struct_build.copy_p_layout, &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), state->accel_struct_build.internal_p_layout, &state->alloc); radv_DestroyPipelineLayout(radv_device_to_handle(device), @@ -1207,6 +1639,7 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) VkResult result; nir_shader *leaf_cs = build_leaf_shader(device); nir_shader *internal_cs = build_internal_shader(device); + nir_shader *copy_cs = build_copy_shader(device); const VkPipelineLayoutCreateInfo leaf_pl_create_info = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, @@ -1278,10 +1711,50 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) if (result != VK_SUCCESS) goto fail; + const VkPipelineLayoutCreateInfo copy_pl_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 0, + .pushConstantRangeCount = 1, + .pPushConstantRanges = + &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct copy_constants)}, + }; + + result = radv_CreatePipelineLayout(radv_device_to_handle(device), ©_pl_create_info, + &device->meta_state.alloc, + &device->meta_state.accel_struct_build.copy_p_layout); + if (result != VK_SUCCESS) + goto fail; + + VkPipelineShaderStageCreateInfo copy_shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = vk_shader_module_handle_from_nir(copy_cs), + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo copy_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = copy_shader_stage, + .flags = 0, + .layout = device->meta_state.accel_struct_build.copy_p_layout, + }; + + result = radv_CreateComputePipelines( + radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, + ©_pipeline_info, NULL, &device->meta_state.accel_struct_build.copy_pipeline); + if (result != VK_SUCCESS) + goto fail; + + ralloc_free(copy_cs); + ralloc_free(internal_cs); + ralloc_free(leaf_cs); + return VK_SUCCESS; fail: radv_device_finish_accel_struct_build_state(device); + ralloc_free(copy_cs); ralloc_free(internal_cs); ralloc_free(leaf_cs); return result; @@ -1291,6 +1764,9 @@ struct bvh_state { uint32_t node_offset; uint32_t node_count; uint32_t scratch_offset; + + uint32_t instance_offset; + uint32_t instance_count; }; void @@ -1320,51 +1796,62 @@ radv_CmdBuildAccelerationStructuresKHR( .dst_offset = ALIGN(sizeof(struct radv_accel_struct_header), 64) + 128, .dst_scratch_offset = 0, }; + bvh_states[i].node_offset = prim_consts.dst_offset; + bvh_states[i].instance_offset = prim_consts.dst_offset; - for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { - const VkAccelerationStructureGeometryKHR *geom = - pInfos[i].pGeometries ? &pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; + for (int inst = 1; inst >= 0; --inst) { + for (unsigned j = 0; j < pInfos[i].geometryCount; ++j) { + const VkAccelerationStructureGeometryKHR *geom = + pInfos[i].pGeometries ? 
&pInfos[i].pGeometries[j] : pInfos[i].ppGeometries[j]; - prim_consts.geometry_type = geom->geometryType; - prim_consts.geometry_id = j | (geom->flags << 28); - unsigned prim_size; - switch (geom->geometryType) { - case VK_GEOMETRY_TYPE_TRIANGLES_KHR: - prim_consts.vertex_addr = - geom->geometry.triangles.vertexData.deviceAddress + - ppBuildRangeInfos[i][j].firstVertex * geom->geometry.triangles.vertexStride + - (geom->geometry.triangles.indexType != VK_INDEX_TYPE_NONE_KHR - ? ppBuildRangeInfos[i][j].primitiveOffset - : 0); - prim_consts.index_addr = geom->geometry.triangles.indexData.deviceAddress + - ppBuildRangeInfos[i][j].primitiveOffset; - prim_consts.transform_addr = geom->geometry.triangles.transformData.deviceAddress + - ppBuildRangeInfos[i][j].transformOffset; - prim_consts.vertex_stride = geom->geometry.triangles.vertexStride; - prim_consts.vertex_format = geom->geometry.triangles.vertexFormat; - prim_consts.index_format = geom->geometry.triangles.indexType; - prim_size = 64; - break; - case VK_GEOMETRY_TYPE_AABBS_KHR: - prim_consts.aabb_addr = - geom->geometry.aabbs.data.deviceAddress + ppBuildRangeInfos[i][j].primitiveOffset; - prim_consts.aabb_stride = geom->geometry.aabbs.stride; - prim_size = 64; - break; - case VK_GEOMETRY_TYPE_INSTANCES_KHR: - prim_consts.instance_data = geom->geometry.instances.data.deviceAddress; - prim_size = 128; - break; - default: - unreachable("Unknown geometryType"); + if ((inst && geom->geometryType != VK_GEOMETRY_TYPE_INSTANCES_KHR) || + (!inst && geom->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR)) + continue; + + prim_consts.geometry_type = geom->geometryType; + prim_consts.geometry_id = j | (geom->flags << 28); + unsigned prim_size; + switch (geom->geometryType) { + case VK_GEOMETRY_TYPE_TRIANGLES_KHR: + prim_consts.vertex_addr = + geom->geometry.triangles.vertexData.deviceAddress + + ppBuildRangeInfos[i][j].firstVertex * geom->geometry.triangles.vertexStride + + (geom->geometry.triangles.indexType != VK_INDEX_TYPE_NONE_KHR + ? ppBuildRangeInfos[i][j].primitiveOffset + : 0); + prim_consts.index_addr = geom->geometry.triangles.indexData.deviceAddress + + ppBuildRangeInfos[i][j].primitiveOffset; + prim_consts.transform_addr = geom->geometry.triangles.transformData.deviceAddress + + ppBuildRangeInfos[i][j].transformOffset; + prim_consts.vertex_stride = geom->geometry.triangles.vertexStride; + prim_consts.vertex_format = geom->geometry.triangles.vertexFormat; + prim_consts.index_format = geom->geometry.triangles.indexType; + prim_size = 64; + break; + case VK_GEOMETRY_TYPE_AABBS_KHR: + prim_consts.aabb_addr = + geom->geometry.aabbs.data.deviceAddress + ppBuildRangeInfos[i][j].primitiveOffset; + prim_consts.aabb_stride = geom->geometry.aabbs.stride; + prim_size = 64; + break; + case VK_GEOMETRY_TYPE_INSTANCES_KHR: + prim_consts.instance_data = geom->geometry.instances.data.deviceAddress; + prim_consts.array_of_pointers = geom->geometry.instances.arrayOfPointers ? 
1 : 0; + prim_size = 128; + bvh_states[i].instance_count += ppBuildRangeInfos[i][j].primitiveCount; + break; + default: + unreachable("Unknown geometryType"); + } + + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), + &prim_consts); + radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1); + prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount; + prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount; } - - radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), - cmd_buffer->device->meta_state.accel_struct_build.leaf_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(prim_consts), &prim_consts); - radv_unaligned_dispatch(cmd_buffer, ppBuildRangeInfos[i][j].primitiveCount, 1, 1); - prim_consts.dst_offset += prim_size * ppBuildRangeInfos[i][j].primitiveCount; - prim_consts.dst_scratch_offset += 4 * ppBuildRangeInfos[i][j].primitiveCount; } bvh_states[i].node_offset = prim_consts.dst_offset; bvh_states[i].node_count = prim_consts.dst_scratch_offset / 4; @@ -1411,11 +1898,222 @@ radv_CmdBuildAccelerationStructuresKHR( cmd_buffer->device->meta_state.accel_struct_build.internal_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); radv_unaligned_dispatch(cmd_buffer, dst_node_count, 1, 1); - bvh_states[i].node_offset += dst_node_count * 128; + if (!final_iter) + bvh_states[i].node_offset += dst_node_count * 128; bvh_states[i].node_count = dst_node_count; bvh_states[i].scratch_offset = dst_scratch_offset; } } + for (uint32_t i = 0; i < infoCount; ++i) { + RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, + pInfos[i].dstAccelerationStructure); + const size_t base = offsetof(struct radv_accel_struct_header, compacted_size); + struct radv_accel_struct_header header; + + header.instance_offset = bvh_states[i].instance_offset; + header.instance_count = bvh_states[i].instance_count; + header.compacted_size = bvh_states[i].node_offset; + + /* 16 bytes per invocation, 64 invocations per workgroup */ + header.copy_dispatch_size[0] = DIV_ROUND_UP(header.compacted_size, 16 * 64); + header.copy_dispatch_size[1] = 1; + header.copy_dispatch_size[2] = 1; + + header.serialization_size = + header.compacted_size + align(sizeof(struct radv_accel_struct_serialization_header) + + sizeof(uint64_t) * header.instance_count, + 128); + + radv_update_buffer_cp(cmd_buffer, + radv_buffer_get_va(accel_struct->bo) + accel_struct->mem_offset + base, + (const char *)&header + base, sizeof(header) - base); + } free(bvh_states); radv_meta_restore(&saved_state, cmd_buffer); -} \ No newline at end of file +} + +void +radv_CmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureInfoKHR *pInfo) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_acceleration_structure, src, pInfo->src); + RADV_FROM_HANDLE(radv_acceleration_structure, dst, pInfo->dst); + struct radv_meta_saved_state saved_state; + + radv_meta_save( + &saved_state, cmd_buffer, + RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); + + uint64_t src_addr = radv_accel_struct_get_va(src); + uint64_t dst_addr = radv_accel_struct_get_va(dst); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + cmd_buffer->device->meta_state.accel_struct_build.copy_pipeline); + + const struct 
copy_constants consts = { + .src_addr = src_addr, + .dst_addr = dst_addr, + .mode = COPY_MODE_COPY, + }; + + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + + radv_indirect_dispatch(cmd_buffer, src->bo, + src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size)); + radv_meta_restore(&saved_state, cmd_buffer); +} + +void +radv_GetDeviceAccelerationStructureCompatibilityKHR( + VkDevice _device, const VkAccelerationStructureVersionInfoKHR *pVersionInfo, + VkAccelerationStructureCompatibilityKHR *pCompatibility) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + uint8_t zero[VK_UUID_SIZE] = { + 0, + }; + bool compat = + memcmp(pVersionInfo->pVersionData, device->physical_device->driver_uuid, VK_UUID_SIZE) == 0 && + memcmp(pVersionInfo->pVersionData + VK_UUID_SIZE, zero, VK_UUID_SIZE) == 0; + *pCompatibility = compat ? VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR + : VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR; +} + +VkResult +radv_CopyMemoryToAccelerationStructureKHR(VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyMemoryToAccelerationStructureInfoKHR *pInfo) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pInfo->dst); + + char *base = device->ws->buffer_map(accel_struct->bo); + if (!base) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + base += accel_struct->mem_offset; + const struct radv_accel_struct_header *header = (const struct radv_accel_struct_header *)base; + + const char *src = pInfo->src.hostAddress; + struct radv_accel_struct_serialization_header *src_header = (void *)src; + src += sizeof(*src_header) + sizeof(uint64_t) * src_header->instance_count; + + memcpy(base, src, src_header->compacted_size); + + for (unsigned i = 0; i < src_header->instance_count; ++i) { + uint64_t *p = (uint64_t *)(base + i * 128 + header->instance_offset); + *p = (*p & 63) | src_header->instances[i]; + } + + device->ws->buffer_unmap(accel_struct->bo); + return VK_SUCCESS; +} + +VkResult +radv_CopyAccelerationStructureToMemoryKHR(VkDevice _device, + VkDeferredOperationKHR deferredOperation, + const VkCopyAccelerationStructureToMemoryInfoKHR *pInfo) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pInfo->src); + + const char *base = device->ws->buffer_map(accel_struct->bo); + if (!base) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + base += accel_struct->mem_offset; + const struct radv_accel_struct_header *header = (const struct radv_accel_struct_header *)base; + + char *dst = pInfo->dst.hostAddress; + struct radv_accel_struct_serialization_header *dst_header = (void *)dst; + dst += sizeof(*dst_header) + sizeof(uint64_t) * header->instance_count; + + memcpy(dst_header->driver_uuid, device->physical_device->driver_uuid, VK_UUID_SIZE); + memset(dst_header->accel_struct_compat, 0, VK_UUID_SIZE); + + dst_header->serialization_size = header->serialization_size; + dst_header->compacted_size = header->compacted_size; + dst_header->instance_count = header->instance_count; + + memcpy(dst, base, header->compacted_size); + + for (unsigned i = 0; i < header->instance_count; ++i) { + dst_header->instances[i] = + *(const uint64_t *)(base + i * 128 + header->instance_offset) & ~63ull; + } + + device->ws->buffer_unmap(accel_struct->bo); + return VK_SUCCESS; +} + +void 
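/* In the two host-side copy paths above, instance node base pointers are
 * 64-byte-aligned BVH addresses: serialization strips the low six bits
 * (addr & ~63ull) before storing them in the instance table, and
 * deserialization splices the saved address onto whatever low bits the
 * destination node already carries ((*p & 63) | instances[i]). */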
+radv_CmdCopyMemoryToAccelerationStructureKHR( + VkCommandBuffer commandBuffer, const VkCopyMemoryToAccelerationStructureInfoKHR *pInfo) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_acceleration_structure, dst, pInfo->dst); + struct radv_meta_saved_state saved_state; + + radv_meta_save( + &saved_state, cmd_buffer, + RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); + + uint64_t dst_addr = radv_accel_struct_get_va(dst); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + cmd_buffer->device->meta_state.accel_struct_build.copy_pipeline); + + const struct copy_constants consts = { + .src_addr = pInfo->src.deviceAddress, + .dst_addr = dst_addr, + .mode = COPY_MODE_DESERIALIZE, + }; + + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + + radv_CmdDispatch(commandBuffer, 512, 1, 1); + radv_meta_restore(&saved_state, cmd_buffer); +} + +void +radv_CmdCopyAccelerationStructureToMemoryKHR( + VkCommandBuffer commandBuffer, const VkCopyAccelerationStructureToMemoryInfoKHR *pInfo) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_acceleration_structure, src, pInfo->src); + struct radv_meta_saved_state saved_state; + + radv_meta_save( + &saved_state, cmd_buffer, + RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS); + + uint64_t src_addr = radv_accel_struct_get_va(src); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + cmd_buffer->device->meta_state.accel_struct_build.copy_pipeline); + + const struct copy_constants consts = { + .src_addr = src_addr, + .dst_addr = pInfo->dst.deviceAddress, + .mode = COPY_MODE_SERIALIZE, + }; + + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + cmd_buffer->device->meta_state.accel_struct_build.copy_p_layout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(consts), &consts); + + radv_indirect_dispatch(cmd_buffer, src->bo, + src_addr + offsetof(struct radv_accel_struct_header, copy_dispatch_size)); + radv_meta_restore(&saved_state, cmd_buffer); + + /* Set the header of the serialized data. */ + uint8_t header_data[2 * VK_UUID_SIZE] = {0}; + memcpy(header_data, cmd_buffer->device->physical_device->driver_uuid, VK_UUID_SIZE); + + radv_update_buffer_cp(cmd_buffer, pInfo->dst.deviceAddress, header_data, sizeof(header_data)); +} diff --git a/mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.h b/mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.h new file mode 100644 index 0000000000..062edde501 --- /dev/null +++ b/mesa 3D driver/src/amd/vulkan/radv_acceleration_structure.h @@ -0,0 +1,102 @@ +/* + * Copyright © 2021 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef RADV_ACCELERATION_STRUCTURE_H +#define RADV_ACCELERATION_STRUCTURE_H + +#include <stdint.h> +#include <vulkan/vulkan.h> + +struct radv_accel_struct_serialization_header { + uint8_t driver_uuid[VK_UUID_SIZE]; + uint8_t accel_struct_compat[VK_UUID_SIZE]; + uint64_t serialization_size; + uint64_t compacted_size; + uint64_t instance_count; + uint64_t instances[]; +}; + +struct radv_accel_struct_header { + uint32_t root_node_offset; + uint32_t reserved; + float aabb[2][3]; + + /* Everything after this gets updated/copied from the CPU. */ + uint64_t compacted_size; + uint64_t serialization_size; + uint32_t copy_dispatch_size[3]; + uint64_t instance_offset; + uint64_t instance_count; +}; + +struct radv_bvh_triangle_node { + float coords[3][3]; + uint32_t reserved[3]; + uint32_t triangle_id; + /* flags in upper 4 bits */ + uint32_t geometry_id_and_flags; + uint32_t reserved2; + uint32_t id; +}; + +struct radv_bvh_aabb_node { + float aabb[2][3]; + uint32_t primitive_id; + /* flags in upper 4 bits */ + uint32_t geometry_id_and_flags; + uint32_t reserved[8]; +}; + +struct radv_bvh_instance_node { + uint64_t base_ptr; + /* lower 24 bits are the custom instance index, upper 8 bits are the visibility mask */ + uint32_t custom_instance_and_mask; + /* lower 24 bits are the sbt offset, upper 8 bits are VkGeometryInstanceFlagsKHR */ + uint32_t sbt_offset_and_flags; + + /* The translation component is actually a pre-translation instead of a post-translation. If you + * want to get a proper matrix out of it you need to apply the directional component of the + * matrix to it. The pre-translation of the world->object matrix is the same as the + * post-translation of the object->world matrix so this way we can share data between both + * matrices. */ + float wto_matrix[12]; + float aabb[2][3]; + uint32_t instance_id; + + /* Object to world matrix transposed from the initial transform. The translation part is stored in the + * wto_matrix.
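
Assuming natural C packing, which these layouts rely on, the node sizes in this header come out to 64 bytes for the leaf and box16 nodes and 128 bytes for the instance and box32 nodes, matching the i * 128 instance stride used by the host copy paths above. A quick compile-time cross-check, to be built in a TU that sees these structs:

   /* Sketch: natural packing assumed; the leading uint64_t base_ptr keeps the
    * instance node 8-byte aligned with no tail padding. */
   _Static_assert(sizeof(struct radv_bvh_triangle_node) == 64, "64-byte leaf");
   _Static_assert(sizeof(struct radv_bvh_aabb_node) == 64, "64-byte leaf");
   _Static_assert(sizeof(struct radv_bvh_box16_node) == 64, "64-byte box16");
   _Static_assert(sizeof(struct radv_bvh_instance_node) == 128, "i * 128 stride");
   _Static_assert(sizeof(struct radv_bvh_box32_node) == 128, "128-byte box32");
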
*/ + float otw_matrix[9]; +}; + +struct radv_bvh_box16_node { + uint32_t children[4]; + uint32_t coords[4][3]; +}; + +struct radv_bvh_box32_node { + uint32_t children[4]; + float coords[4][2][3]; + uint32_t reserved[4]; +}; + +#endif \ No newline at end of file diff --git a/mesa 3D driver/src/amd/vulkan/radv_android.c b/mesa 3D driver/src/amd/vulkan/radv_android.c index 066cae971e..790e2c5750 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_android.c +++ b/mesa 3D driver/src/amd/vulkan/radv_android.c @@ -121,7 +121,7 @@ radv_image_from_gralloc(VkDevice device_h, const VkImageCreateInfo *base_info, VkResult result; if (gralloc_info->handle->numFds != 1) { - return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "VkNativeBufferANDROID::handle::numFds is %d, " "expected 1", gralloc_info->handle->numFds); @@ -196,7 +196,13 @@ radv_image_from_gralloc(VkDevice device_h, const VkImageCreateInfo *base_info, radv_image_override_offset_stride(device, image, 0, gralloc_info->stride); - radv_BindImageMemory(device_h, image_h, memory_h, 0); + VkBindImageMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO, + .image = image_h, + .memory = memory_h, + .memoryOffset = 0 + }; + radv_BindImageMemory2(device_h, 1, &bind_info); image->owned_memory = memory_h; /* Don't clobber the out-parameter until success is certain. */ @@ -249,7 +255,7 @@ radv_GetSwapchainGrallocUsageANDROID(VkDevice device_h, VkFormat format, result = radv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h, &image_format_info, &image_format_props); if (result != VK_SUCCESS) { - return vk_errorf(device->instance, result, + return vk_errorf(device, result, "radv_GetPhysicalDeviceImageFormatProperties2 failed " "inside %s", __func__); @@ -266,7 +272,7 @@ radv_GetSwapchainGrallocUsageANDROID(VkDevice device_h, VkFormat format, * gralloc swapchains. */ if (imageUsage != 0) { - return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED, "unsupported VkImageUsageFlags(0x%x) for gralloc " "swapchain", imageUsage); @@ -307,7 +313,7 @@ radv_GetSwapchainGrallocUsage2ANDROID(VkDevice device_h, VkFormat format, *grallocProducerUsage = 0; if (swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) - return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED, "The Vulkan loader tried to query shared presentable image support"); const VkPhysicalDeviceImageFormatInfo2 image_format_info = { @@ -326,7 +332,7 @@ radv_GetSwapchainGrallocUsage2ANDROID(VkDevice device_h, VkFormat format, result = radv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h, &image_format_info, &image_format_props); if (result != VK_SUCCESS) { - return vk_errorf(device->instance, result, + return vk_errorf(device, result, "radv_GetPhysicalDeviceImageFormatProperties2 failed " "inside %s", __func__); @@ -344,7 +350,7 @@ radv_GetSwapchainGrallocUsage2ANDROID(VkDevice device_h, VkFormat format, } if (imageUsage != 0) { - return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED, "unsupported VkImageUsageFlags(0x%x) for gralloc " "swapchain", imageUsage); @@ -400,7 +406,7 @@ radv_AcquireImageANDROID(VkDevice device_h, VkImage image_h, int nativeFenceFd, VkResult err = (errno == EMFILE) ? 
VK_ERROR_TOO_MANY_OBJECTS : VK_ERROR_OUT_OF_HOST_MEMORY; close(nativeFenceFd); - return vk_error(device->instance, err); + return vk_error(device, err); } } else if (semaphore != VK_NULL_HANDLE) { semaphore_fd = nativeFenceFd; @@ -501,7 +507,7 @@ radv_QueueSignalReleaseImageANDROID(VkQueue _queue, uint32_t waitSemaphoreCount, enum { /* Usage bit equal to GRALLOC_USAGE_HW_CAMERA_MASK */ - AHARDWAREBUFFER_USAGE_CAMERA_MASK = 0x00060000U, + BUFFER_USAGE_CAMERA_MASK = 0x00060000U, }; static inline VkFormat @@ -522,7 +528,7 @@ vk_format_from_android(unsigned android_format, unsigned android_usage) case AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420: return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; case AHARDWAREBUFFER_FORMAT_IMPLEMENTATION_DEFINED: - if (android_usage & AHARDWAREBUFFER_USAGE_CAMERA_MASK) + if (android_usage & BUFFER_USAGE_CAMERA_MASK) return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM; else return VK_FORMAT_R8G8B8_UNORM; @@ -605,14 +611,17 @@ get_ahb_buffer_format_properties(VkDevice device_h, const struct AHardwareBuffer p->format = vk_format_from_android(desc.format, desc.usage); p->externalFormat = (uint64_t)(uintptr_t)p->format; - VkFormatProperties format_properties; - radv_GetPhysicalDeviceFormatProperties(radv_physical_device_to_handle(device->physical_device), - p->format, &format_properties); + VkFormatProperties2 format_properties = { + .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2 + }; + + radv_GetPhysicalDeviceFormatProperties2(radv_physical_device_to_handle(device->physical_device), + p->format, &format_properties); if (desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER) - p->formatFeatures = format_properties.linearTilingFeatures; + p->formatFeatures = format_properties.formatProperties.linearTilingFeatures; else - p->formatFeatures = format_properties.optimalTilingFeatures; + p->formatFeatures = format_properties.formatProperties.optimalTilingFeatures; /* "Images can be created with an external format even if the Android hardware * buffer has a format which has an equivalent Vulkan format to enable @@ -651,6 +660,82 @@ get_ahb_buffer_format_properties(VkDevice device_h, const struct AHardwareBuffer return VK_SUCCESS; } +static VkResult +get_ahb_buffer_format_properties2(VkDevice device_h, const struct AHardwareBuffer *buffer, + VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties) +{ + RADV_FROM_HANDLE(radv_device, device, device_h); + + /* Get a description of buffer contents . */ + AHardwareBuffer_Desc desc; + AHardwareBuffer_describe(buffer, &desc); + + /* Verify description. */ + const uint64_t gpu_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE | + AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT | + AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER; + + /* "Buffer must be a valid Android hardware buffer object with at least + * one of the AHARDWAREBUFFER_USAGE_GPU_* usage flags." + */ + if (!(desc.usage & (gpu_usage))) + return VK_ERROR_INVALID_EXTERNAL_HANDLE; + + /* Fill properties fields based on description. 
*/ + VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties; + + p->format = vk_format_from_android(desc.format, desc.usage); + p->externalFormat = (uint64_t)(uintptr_t)p->format; + + VkFormatProperties2 format_properties = { + .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2 + }; + + radv_GetPhysicalDeviceFormatProperties2(radv_physical_device_to_handle(device->physical_device), + p->format, &format_properties); + + if (desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER) + p->formatFeatures = format_properties.formatProperties.linearTilingFeatures; + else + p->formatFeatures = format_properties.formatProperties.optimalTilingFeatures; + + /* "Images can be created with an external format even if the Android hardware + * buffer has a format which has an equivalent Vulkan format to enable + * consistent handling of images from sources that might use either category + * of format. However, all images created with an external format are subject + * to the valid usage requirements associated with external formats, even if + * the Android hardware buffer’s format has a Vulkan equivalent." + * + * "The formatFeatures member *must* include + * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR and at least one of + * VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT_KHR or + * VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT_KHR" + */ + assert(p->formatFeatures & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR); + + p->formatFeatures |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT_KHR; + + /* "Implementations may not always be able to determine the color model, + * numerical range, or chroma offsets of the image contents, so the values + * in VkAndroidHardwareBufferFormatPropertiesANDROID are only suggestions. + * Applications should treat these values as sensible defaults to use in + * the absence of more reliable information obtained through some other + * means." + */ + p->samplerYcbcrConversionComponents.r = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.g = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.b = VK_COMPONENT_SWIZZLE_IDENTITY; + p->samplerYcbcrConversionComponents.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + p->suggestedYcbcrModel = VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601; + p->suggestedYcbcrRange = VK_SAMPLER_YCBCR_RANGE_ITU_FULL; + + p->suggestedXChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + p->suggestedYChromaOffset = VK_CHROMA_LOCATION_MIDPOINT; + + return VK_SUCCESS; +} + VkResult radv_GetAndroidHardwareBufferPropertiesANDROID(VkDevice device_h, const struct AHardwareBuffer *buffer, @@ -666,6 +751,11 @@ radv_GetAndroidHardwareBufferPropertiesANDROID(VkDevice device_h, if (format_prop) get_ahb_buffer_format_properties(device_h, buffer, format_prop); + VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 = + vk_find_struct(pProperties->pNext, ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID); + if (format_prop2) + get_ahb_buffer_format_properties2(device_h, buffer, format_prop2); + /* NOTE - We support buffers with only one handle but do not error on * multiple handle case. 
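
On the application side, the new path above is reached by chaining the *2 struct into vkGetAndroidHardwareBufferPropertiesANDROID; standard Vulkan/NDK usage, error handling elided:

   #include <android/hardware_buffer.h>
   #include <vulkan/vulkan.h>
   #include <vulkan/vulkan_android.h>

   void query_ahb_format_features2(VkDevice device, const AHardwareBuffer *buffer)
   {
      VkAndroidHardwareBufferFormatProperties2ANDROID format_props2 = {
         .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
      };
      VkAndroidHardwareBufferPropertiesANDROID props = {
         .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID,
         .pNext = &format_props2,
      };
      vkGetAndroidHardwareBufferPropertiesANDROID(device, buffer, &props);
      /* format_props2.formatFeatures now holds 64-bit VkFormatFeatureFlags2KHR
       * bits such as VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR. */
   }
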
Reason is that we want to support YUV formats * where we have many logical planes but they all point to the same diff --git a/mesa 3D driver/src/amd/vulkan/radv_cmd_buffer.c b/mesa 3D driver/src/amd/vulkan/radv_cmd_buffer.c index 7c14f6de2a..43cdf37853 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_cmd_buffer.c +++ b/mesa 3D driver/src/amd/vulkan/radv_cmd_buffer.c @@ -37,6 +37,8 @@ #include "ac_debug.h" +#include "util/fast_idiv_by_const.h" + enum { RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0), RADV_PREFETCH_VS = (1 << 1), @@ -61,6 +63,8 @@ static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, uint32_t dst_family, const VkImageSubresourceRange *range, struct radv_sample_locations_state *sample_locs); +static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size); + const struct radv_dynamic_state default_dynamic_state = { .viewport = { @@ -403,10 +407,14 @@ radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) if (cmd_buffer->cs) cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs); - for (unsigned i = 0; i < MAX_BIND_POINTS; i++) + for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr); + vk_object_base_finish(&cmd_buffer->descriptors[i].push_set.set.base); + } - vk_object_base_finish(&cmd_buffer->base); + vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base); + + vk_command_buffer_finish(&cmd_buffer->vk); vk_free(&cmd_buffer->pool->alloc, cmd_buffer); } @@ -418,9 +426,14 @@ radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool, unsigned ring; cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cmd_buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - vk_object_base_init(&device->vk, &cmd_buffer->base, VK_OBJECT_TYPE_COMMAND_BUFFER); + VkResult result = + vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (result != VK_SUCCESS) { + vk_free(&cmd_buffer->pool->alloc, cmd_buffer); + return result; + } cmd_buffer->device = device; cmd_buffer->pool = pool; @@ -434,9 +447,16 @@ radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool, cmd_buffer->cs = device->ws->cs_create(device->ws, ring); if (!cmd_buffer->cs) { radv_destroy_cmd_buffer(cmd_buffer); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } + vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base, + VK_OBJECT_TYPE_DESCRIPTOR_SET); + + for (unsigned i = 0; i < MAX_BIND_POINTS; i++) + vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, + VK_OBJECT_TYPE_DESCRIPTOR_SET); + *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer); list_inithead(&cmd_buffer->upload.list); @@ -447,6 +467,8 @@ radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool, static VkResult radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) { + vk_command_buffer_reset(&cmd_buffer->vk); + cmd_buffer->device->ws->cs_reset(cmd_buffer->cs); list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) @@ -973,19 +995,17 @@ radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer) static void radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline, - gl_shader_stage stage, int idx, int count, uint32_t *values) + gl_shader_stage stage, int idx, uint32_t *values) { 
struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx); uint32_t base_reg = pipeline->user_data_0[stage]; if (loc->sgpr_idx == -1) return; - assert(loc->num_sgprs == count); + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + loc->num_sgprs); - radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + count); - - radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count); - radeon_emit_array(cmd_buffer->cs, values, count); + radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs); + radeon_emit_array(cmd_buffer->cs, values, loc->num_sgprs); } static void @@ -1043,7 +1063,7 @@ radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader if (!shader) return; - va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + va = radv_shader_variant_get_va(shader); si_cp_dma_prefetch(cmd_buffer, va, shader->code_size); } @@ -1591,6 +1611,8 @@ radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer) uint32_t vertex_comb_mode = d->fragment_shading_rate.combiner_ops[0]; uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1]; + assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10_3); + if (subpass && !subpass->vrs_attachment) { /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we * can cheat by tweaking the different combiner modes. @@ -2280,6 +2302,12 @@ radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->base_mip)); + /* Do not need to update the clear value for images that are fast cleared with the comp-to-single + * mode because the hardware gets the value from the image directly. + */ + if (iview->image->support_comp_to_single) + return; + radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values); radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); @@ -2298,6 +2326,9 @@ radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_i if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->base_mip)) return; + if (iview->image->support_comp_to_single) + return; + if (!radv_image_has_clear_value(image)) { uint32_t color_values[2] = {0, 0}; radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values); @@ -2404,8 +2435,8 @@ radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer) if (!device->vrs.image) { VkResult result; - /* The global VRS image is created on-demand to avoid wasting space */ - result = radv_device_init_vrs_image(device); + /* The global VRS state is initialized on-demand to avoid wasting VRAM. */ + result = radv_device_init_vrs_state(device); if (result != VK_SUCCESS) { cmd_buffer->record_result = result; return NULL; @@ -2474,6 +2505,7 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) * bind our internal depth buffer that contains the VRS data as part of HTILE. 
*/ VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; struct radv_image *image = cmd_buffer->device->vrs.image; struct radv_ds_buffer_info ds; struct radv_image_view iview; @@ -2495,11 +2527,13 @@ radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer) }, NULL); - radv_initialise_ds_surface(cmd_buffer->device, &ds, &iview); + radv_initialise_vrs_surface(image, htile_buffer, &ds); - radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, image->bo); + radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo); radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false); + + radv_image_view_finish(&iview); } else { if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2); @@ -2617,8 +2651,335 @@ radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->state.context_roll_without_scissor_emitted = true; } +unsigned +radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs) +{ + /* instance_rate_vs_prologs is a flattened array of array of arrays of different sizes, or a + * single array sorted in ascending order using: + * - total number of attributes + * - number of instanced attributes + * - index of first instanced attribute + */ + + /* From total number of attributes to offset. */ + static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, + 120, 165, 220, 286, 364, 455, 560, 680}; + unsigned start_index = total_to_offset[num_attributes - 1]; + + /* From number of instanced attributes to offset. This would require a different LUT depending on + * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total + * attributes. + */ + static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91, + 100, 108, 115, 121, 126, 130, 133, 135}; + unsigned count = util_bitcount(instance_rate_inputs); + unsigned offset_from_start_index = + count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1)); + + unsigned first = ffs(instance_rate_inputs) - 1; + return start_index + offset_from_start_index + first; +} + +union vs_prolog_key_header { + struct { + uint32_t key_size : 8; + uint32_t num_attributes : 6; + uint32_t as_ls : 1; + uint32_t is_ngg : 1; + uint32_t wave32 : 1; + uint32_t next_stage : 3; + uint32_t instance_rate_inputs : 1; + uint32_t alpha_adjust_lo : 1; + uint32_t alpha_adjust_hi : 1; + uint32_t misaligned_mask : 1; + uint32_t post_shuffle : 1; + uint32_t nontrivial_divisors : 1; + /* We need this to ensure the padding is zero. It's useful even if it's unused. 
*/ + uint32_t padding0 : 6; + }; + uint32_t v; +}; + +uint32_t +radv_hash_vs_prolog(const void *key_) +{ + const uint32_t *key = key_; + union vs_prolog_key_header header; + header.v = key[0]; + return _mesa_hash_data(key, header.key_size); +} + +bool +radv_cmp_vs_prolog(const void *a_, const void *b_) +{ + const uint32_t *a = a_; + const uint32_t *b = b_; + if (a[0] != b[0]) + return false; + + union vs_prolog_key_header header; + header.v = a[0]; + return memcmp(a, b, header.key_size) == 0; +} + +static struct radv_shader_prolog * +lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, + uint32_t *nontrivial_divisors) +{ + STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4); + assert(vs_shader->info.vs.dynamic_inputs); + + const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; + const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_device *device = cmd_buffer->device; + + unsigned num_attributes = pipeline->last_vertex_attrib_bit; + uint32_t attribute_mask = BITFIELD_MASK(num_attributes); + + uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask; + *nontrivial_divisors = state->nontrivial_divisors & attribute_mask; + enum chip_class chip = device->physical_device->rad_info.chip_class; + const uint32_t misaligned_mask = chip == GFX6 || chip >= GFX10 ? cmd_buffer->state.vbo_misaligned_mask : 0; + + /* try to use a pre-compiled prolog first */ + struct radv_shader_prolog *prolog = NULL; + if (pipeline->can_use_simple_input && + (!vs_shader->info.vs.as_ls || !instance_rate_inputs) && + !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) { + if (!instance_rate_inputs) { + prolog = device->simple_vs_prologs[num_attributes - 1]; + } else if (num_attributes <= 16 && !*nontrivial_divisors && + util_bitcount(instance_rate_inputs) == + (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) { + unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs); + prolog = device->instance_rate_vs_prologs[index]; + } + } + if (prolog) + return prolog; + + /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */ + uint32_t key_words[16]; + unsigned key_size = 1; + + struct radv_vs_prolog_key key; + key.state = state; + key.num_attributes = num_attributes; + key.misaligned_mask = misaligned_mask; + /* The instance ID input VGPR is placed differently when as_ls=true. 
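
The two LUTs above are closed forms for the rank of a (num_attributes, count, first) triple when every pre-compiled prolog, i.e. every contiguous run of instance-rate attributes, is enumerated in ascending order. A brute-force cross-check of that claim (standalone sketch, assuming the function above is linked in):

   #include <assert.h>
   #include <stdint.h>

   unsigned radv_instance_rate_prolog_index(unsigned num_attributes,
                                            uint32_t instance_rate_inputs);

   int main(void)
   {
      unsigned index = 0;
      /* n = total attributes, k = instanced attributes, f = first instanced. */
      for (unsigned n = 1; n <= 16; n++)
         for (unsigned k = 1; k <= n; k++)
            for (unsigned f = 0; f + k <= n; f++, index++)
               assert(radv_instance_rate_prolog_index(n, ((1u << k) - 1u) << f) == index);
      assert(index == 816); /* 16*17*18/6 pre-compiled prologs in total */
      return 0;
   }
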
*/ + key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs; + key.is_ngg = vs_shader->info.is_ngg; + key.wave32 = vs_shader->info.wave_size == 32; + key.next_stage = pipeline->next_vertex_stage; + + union vs_prolog_key_header header; + header.v = 0; + header.num_attributes = num_attributes; + header.as_ls = key.as_ls; + header.is_ngg = key.is_ngg; + header.wave32 = key.wave32; + header.next_stage = key.next_stage; + + if (instance_rate_inputs & ~*nontrivial_divisors) { + header.instance_rate_inputs = true; + key_words[key_size++] = instance_rate_inputs; + } + if (*nontrivial_divisors) { + header.nontrivial_divisors = true; + key_words[key_size++] = *nontrivial_divisors; + } + if (misaligned_mask) { + header.misaligned_mask = true; + key_words[key_size++] = misaligned_mask; + + uint8_t *formats = (uint8_t *)&key_words[key_size]; + unsigned num_formats = 0; + u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index]; + while (num_formats & 0x3) + formats[num_formats++] = 0; + key_size += num_formats / 4u; + + if (state->post_shuffle & attribute_mask) { + header.post_shuffle = true; + key_words[key_size++] = state->post_shuffle & attribute_mask; + } + } + if (state->alpha_adjust_lo & attribute_mask) { + header.alpha_adjust_lo = true; + key_words[key_size++] = state->alpha_adjust_lo & attribute_mask; + } + if (state->alpha_adjust_hi & attribute_mask) { + header.alpha_adjust_hi = true; + key_words[key_size++] = state->alpha_adjust_hi & attribute_mask; + } + + header.key_size = key_size * sizeof(key_words[0]); + key_words[0] = header.v; + + uint32_t hash = radv_hash_vs_prolog(key_words); + + if (cmd_buffer->state.emitted_vs_prolog && + cmd_buffer->state.emitted_vs_prolog_key_hash == hash && + radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key)) + return cmd_buffer->state.emitted_vs_prolog; + + u_rwlock_rdlock(&device->vs_prologs_lock); + struct hash_entry *prolog_entry = + _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); + u_rwlock_rdunlock(&device->vs_prologs_lock); + + if (!prolog_entry) { + u_rwlock_wrlock(&device->vs_prologs_lock); + prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); + if (prolog_entry) { + u_rwlock_wrunlock(&device->vs_prologs_lock); + return prolog_entry->data; + } + + prolog = radv_create_vs_prolog(device, &key); + uint32_t *key2 = malloc(key_size * 4); + if (!prolog || !key2) { + radv_prolog_destroy(device, prolog); + free(key2); + u_rwlock_wrunlock(&device->vs_prologs_lock); + return NULL; + } + memcpy(key2, key_words, key_size * 4); + _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog); + + u_rwlock_wrunlock(&device->vs_prologs_lock); + return prolog; + } + + return prolog_entry->data; +} + static void -radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer) +emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, + struct radv_shader_prolog *prolog, bool pipeline_is_dirty) +{ + /* no need to re-emit anything in this case */ + if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty) + return; + + enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset; + + assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline); + assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs); + + 
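
Note how the cache gets away with variable-length keys: the byte length lives in the low 8 bits of the first key word (key_size), so both the hash and the comparison recover it from the key data itself. A generic sketch of the same self-sized-key pattern, with FNV-1a standing in for _mesa_hash_data:

   #include <stdbool.h>
   #include <stdint.h>
   #include <string.h>

   static uint32_t self_sized_key_hash(const uint32_t *key)
   {
      uint8_t size = key[0] & 0xff; /* same field as vs_prolog_key_header.key_size */
      uint32_t h = 2166136261u;
      for (uint8_t i = 0; i < size; i++)
         h = (h ^ ((const uint8_t *)key)[i]) * 16777619u;
      return h;
   }

   static bool self_sized_key_equal(const uint32_t *a, const uint32_t *b)
   {
      return a[0] == b[0] && memcmp(a, b, a[0] & 0xff) == 0;
   }
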
uint32_t rsrc1 = vs_shader->config.rsrc1; + if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1)) + rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS); + + /* The main shader must not use less VGPRs than the prolog, otherwise shared vgprs might not + * work. + */ + assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1)); + + unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS; + unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; + if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) { + pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES; + rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; + } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) { + pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS; + rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS; + } else if (vs_shader->info.vs.as_ls) { + pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS; + rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS; + } else if (vs_shader->info.vs.as_es) { + pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES; + rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES; + } + + radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2); + radeon_emit(cmd_buffer->cs, prolog_va >> 8); + radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40)); + + if (chip < GFX10) + radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1); + else + assert(rsrc1 == vs_shader->config.rsrc1); + + radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo); +} + +static void +emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader, + uint32_t nontrivial_divisors, bool pipeline_is_dirty) +{ + /* no need to re-emit anything in this case */ + if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog && + !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors) + return; + + struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; + uint64_t input_va = radv_shader_variant_get_va(vs_shader); + + if (nontrivial_divisors) { + unsigned inputs_offset; + uint32_t *inputs; + unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8; + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs)) + return; + + *(inputs++) = input_va; + *(inputs++) = input_va >> 32; + + u_foreach_bit(index, nontrivial_divisors) + { + uint32_t div = state->divisors[index]; + if (div == 0) { + *(inputs++) = 0; + *(inputs++) = 1; + } else if (util_is_power_of_two_or_zero(div)) { + *(inputs++) = util_logbase2(div) | (1 << 8); + *(inputs++) = 0xffffffffu; + } else { + struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32); + *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16); + *(inputs++) = info.multiplier; + } + } + + input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset; + } + + struct radv_userdata_info *loc = + &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS]; + uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX]; + assert(loc->sgpr_idx != -1); + assert(loc->num_sgprs == 2); + radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, + input_va, true); +} + +static void +radv_emit_vertex_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) +{ + struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct 
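
emit_prolog_inputs above packs each nontrivial divisor as two words, pre_shift | (increment << 8) | (post_shift << 16) plus the multiplier, so the prolog can divide the instance index without an integer-divide instruction. A sketch of the consumer side; the exact conventions of util_compute_fast_udiv_info are assumed, but the two special cases emitted above (divisor 0 -> {0, 1}, power of two -> {log2 | 1 << 8, 0xffffffff}) both check out against it:

   #include <stdint.h>

   static uint32_t fast_udiv(uint32_t i, uint32_t word0, uint32_t multiplier)
   {
      uint32_t pre_shift = word0 & 0xff;
      uint32_t increment = (word0 >> 8) & 0xff;
      uint32_t post_shift = word0 >> 16;

      /* Classic multiply-shift reciprocal for a fixed divisor. */
      uint64_t n = (uint64_t)(i >> pre_shift) + increment;
      return (uint32_t)((n * multiplier) >> 32) >> post_shift;
   }
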
radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX); + + if (!vs_shader->info.vs.has_prolog) + return; + + uint32_t nontrivial_divisors; + struct radv_shader_prolog *prolog = + lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors); + if (!prolog) { + cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return; + } + emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty); + emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty); + + cmd_buffer->state.emitted_vs_prolog = prolog; +} + +static void +radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) { uint64_t states = cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state; @@ -2687,6 +3048,9 @@ radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer) if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) radv_emit_color_write_enable(cmd_buffer); + if (states & RADV_CMD_DIRTY_VERTEX_STATE) + radv_emit_vertex_state(cmd_buffer, pipeline_is_dirty); + cmd_buffer->state.dirty &= ~states; } @@ -2803,13 +3167,20 @@ radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags st radv_save_descriptors(cmd_buffer, bind_point); } +static bool +radv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage) +{ + struct radv_userdata_info *loc = + radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS); + return loc->sgpr_idx != -1; +} + static void radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point) { struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point); - struct radv_pipeline_layout *layout = pipeline->layout; struct radv_shader_variant *shader, *prev_shader; bool need_push_constants = false; unsigned offset; @@ -2819,7 +3190,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag uint32_t dirty_stages = 0; stages &= cmd_buffer->push_constant_stages; - if (!stages || (!layout->push_constant_size && !layout->dynamic_offset_count)) + if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count)) return; internal_stages = stages; @@ -2843,25 +3214,23 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag if (!shader) continue; - need_push_constants |= shader->info.loads_push_constants; - need_push_constants |= shader->info.loads_dynamic_offsets; + need_push_constants |= radv_shader_loads_push_constants(pipeline, stage); - uint8_t base = shader->info.base_inline_push_consts; - uint8_t count = shader->info.num_inline_push_consts; + uint8_t base = shader->info.min_push_constant_used / 4; - radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, count, + radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS, (uint32_t *)&cmd_buffer->push_constants[base * 4]); } if (need_push_constants) { if (!radv_cmd_buffer_upload_alloc( - cmd_buffer, layout->push_constant_size + 16 * layout->dynamic_offset_count, &offset, + cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset, &ptr)) return; - memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size); - memcpy((char *)ptr + layout->push_constant_size, descriptors_state->dynamic_buffers, - 16 * layout->dynamic_offset_count); + memcpy(ptr, cmd_buffer->push_constants, 
pipeline->push_constant_size); + memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers, + 16 * pipeline->dynamic_offset_count); va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); va += offset; @@ -2888,33 +3257,105 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag cmd_buffer->push_constant_stages |= dirty_stages; } +enum radv_dst_sel { + DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), + DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), + DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), + DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), + DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), + DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), +}; + +static const uint32_t data_format_dst_sel[] = { + [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001, + [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001, + [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001, + [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01, + [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001, + [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01, + [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1, + [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1, + [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW, + [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW, + [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW, + [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01, + [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW, + [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1, + [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW, +}; + static void radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty) { if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) && cmd_buffer->state.pipeline->vb_desc_usage_mask) { struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; + struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX); + enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; unsigned vb_offset; void *vb_ptr; unsigned desc_index = 0; uint32_t mask = pipeline->vb_desc_usage_mask; uint64_t va; + struct radv_vs_input_state *vs_state = + vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL; /* allocate some descriptor state for vertex buffers */ if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr)) return; + assert(!vs_state || pipeline->use_per_attribute_vb_descs); + while (mask) { unsigned i = u_bit_scan(&mask); uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4]; - uint32_t offset; - unsigned binding = pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i; + uint32_t offset, rsrc_word3; + unsigned binding = + vs_state ? 
cmd_buffer->state.dynamic_vs_input.bindings[i] + : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i); struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer; unsigned num_records; unsigned stride; + if (vs_state) { + unsigned format = vs_state->formats[i]; + unsigned dfmt = format & 0xf; + unsigned nfmt = (format >> 4) & 0x7; + + rsrc_word3 = + vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt]; + + if (chip >= GFX10) + rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt)); + else + rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt); + } else { + if (chip >= GFX10) + rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT); + else + rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + } + if (!buffer) { - memset(desc, 0, 4 * 4); + if (vs_state) { + /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need + * to include the format/word3 so that the alpha channel is 1 for formats without an + * alpha channel. + */ + desc[0] = 0; + desc[1] = S_008F04_STRIDE(16); + desc[2] = 0; + desc[3] = rsrc_word3; + } else { + memset(desc, 0, 4 * 4); + } continue; } @@ -2922,6 +3363,8 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_ offset = cmd_buffer->vertex_bindings[binding].offset; va += offset + buffer->offset; + if (vs_state) + va += vs_state->offsets[i]; if (cmd_buffer->vertex_bindings[binding].size) { num_records = cmd_buffer->vertex_bindings[binding].size; @@ -2935,9 +3378,9 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_ stride = pipeline->binding_stride[binding]; } - enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; if (pipeline->use_per_attribute_vb_descs) { - uint32_t attrib_end = pipeline->attrib_ends[i]; + uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i] + : pipeline->attrib_ends[i]; if (num_records < attrib_end) { num_records = 0; /* not enough space for one vertex */ @@ -2962,7 +3405,14 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_ * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and * GFX10.3 but it doesn't hurt. */ - memset(desc, 0, 16); + if (vs_state) { + desc[0] = 0; + desc[1] = S_008F04_STRIDE(16); + desc[2] = 0; + desc[3] = rsrc_word3; + } else { + memset(desc, 0, 16); + } continue; } } else { @@ -2970,22 +3420,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_ num_records = DIV_ROUND_UP(num_records, stride); } - uint32_t rsrc_word3 = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); - if (chip >= GFX10) { /* OOB_SELECT chooses the out-of-bounds check: * - 1: index >= NUM_RECORDS (Structured) * - 3: offset >= NUM_RECORDS (Raw) */ int oob_select = stride ? 
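
Restating the OOB_SELECT comment above as code (this mirrors the documented check, not the hardware implementation): structured mode bounds-checks the record index, raw mode the byte offset, which is why stride == 0 buffers must use the raw rule:

   #include <stdbool.h>
   #include <stdint.h>

   static bool oob_structured(uint32_t index, uint32_t num_records)
   {
      return index >= num_records; /* V_008F0C_OOB_SELECT_STRUCTURED (1) */
   }

   static bool oob_raw(uint32_t byte_offset, uint32_t num_records)
   {
      return byte_offset >= num_records; /* V_008F0C_OOB_SELECT_RAW (3) */
   }
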
V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW; - - rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1); - } else { - rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1); } desc[0] = va; @@ -3278,7 +3719,9 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_st { if (src_stage_mask & (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT | - VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { + VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | + VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) { cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH; } @@ -3299,6 +3742,35 @@ radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_st } } +/* + * In Vulkan, barriers have two kinds of operations: + * + * - availability (implemented with radv_src_access_flush) + * - visibility (implemented with radv_dst_access_flush) + * + * For a memory operation to observe the result of a previous memory operation, + * one needs to do an availability operation on the source writes and then a + * visibility operation for the destination accesses. + * + * The complication is that the availability and visibility operations do not + * need to be in the same barrier. + * + * The cleanest way to implement this is to define the availability operation to + * bring the caches to a "state of rest", in which none of the caches below that + * level are dirty. + * + * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty. + * + * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all + * buffers and for images marked as coherent, and VRAM/GTT for non-coherent + * images. However, given the existence of memory barriers which do not specify + * the image/buffer, it often devolves to just VRAM/GTT anyway. + * + * To help reduce the invalidations for GPUs that have L2 coherency between the + * RB and the shader caches, we always invalidate L2 on the src side, as we can + * use our knowledge of past usage to optimize flushes away. + */ + enum radv_cmd_flush_bits radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags, const struct radv_image *image) @@ -3328,9 +3800,14 @@ radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flag flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB; } } + + /* This is valid even for the rb_noncoherent_dirty case, because with how we account for + * dirtiness, if it isn't dirty it doesn't contain the data at all and hence doesn't need + * invalidating. */ if (!image_is_coherent) flush_bits |= RADV_CMD_FLAG_WB_L2; break; + case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT: case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT: if (!image_is_coherent) @@ -3394,6 +3871,11 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag has_DB_meta = false; } + /* All the L2 invalidations below are for caches other than the CB/DB. So if there are no + * incoherent images in the L2 cache in CB/DB mode then they are already usable from all the + * other L2 clients.
*/ + image_is_coherent |= cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 && + !cmd_buffer->state.rb_noncoherent_dirty; + u_foreach_bit(b, dst_flags) { switch ((VkAccessFlagBits)(1 << b)) { @@ -3427,7 +3909,13 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag if (!image_is_coherent) flush_bits |= RADV_CMD_FLAG_INV_L2; break; + case VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR: + flush_bits |= RADV_CMD_FLAG_INV_VCACHE; + if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9) + flush_bits |= RADV_CMD_FLAG_INV_L2; + break; case VK_ACCESS_SHADER_WRITE_BIT: + case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR: break; case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT: case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT: @@ -3465,7 +3953,7 @@ radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flag } void -radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier) +radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier) { struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; if (fb && !fb->imageless) { @@ -3796,7 +4284,11 @@ radv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo result = radv_reset_cmd_buffer(cmd_buffer); cmd_buffer->level = pAllocateInfo->level; - vk_object_base_reset(&cmd_buffer->base); + vk_command_buffer_finish(&cmd_buffer->vk); + VkResult init_result = + vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (init_result != VK_SUCCESS) + result = init_result; pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer); } else { @@ -3928,12 +4420,15 @@ radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBindi { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings; + struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; bool changed = false; /* We have to defer setting up vertex buffer since we need the buffer * stride from the pipeline. */ assert(firstBinding + bindingCount <= MAX_VBS); + cmd_buffer->state.vbo_misaligned_mask = state->misaligned_mask; + enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; for (uint32_t i = 0; i < bindingCount; i++) { RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]); uint32_t idx = firstBinding + i; @@ -3942,14 +4437,36 @@ radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBindi /* pSizes and pStrides are optional. 
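
These new stage/access cases service application barriers like the following, which orders an acceleration-structure build before a trace; standard Vulkan usage, not driver code:

   #include <vulkan/vulkan.h>

   static void barrier_build_to_trace(VkCommandBuffer cmd)
   {
      const VkMemoryBarrier barrier = {
         .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
         .srcAccessMask = VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR,
         .dstAccessMask = VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR,
      };
      vkCmdPipelineBarrier(cmd,
                           VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR,
                           VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR, 0,
                           1, &barrier, 0, NULL, 0, NULL);
   }
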
*/ if (!changed && (vb[idx].buffer != buffer || vb[idx].offset != pOffsets[i] || - vb[idx].size != size || vb[idx].stride != stride)) { + vb[idx].size != size || (pStrides && vb[idx].stride != stride))) { changed = true; } vb[idx].buffer = buffer; vb[idx].offset = pOffsets[i]; vb[idx].size = size; - vb[idx].stride = stride; + /* if pStrides=NULL, it shouldn't overwrite the strides specified by CmdSetVertexInputEXT */ + + if (chip == GFX6 || chip >= GFX10) { + const uint32_t bit = 1u << idx; + if (!buffer) { + cmd_buffer->state.vbo_misaligned_mask &= ~bit; + cmd_buffer->state.vbo_bound_mask &= ~bit; + } else { + cmd_buffer->state.vbo_bound_mask |= bit; + if (pStrides && vb[idx].stride != stride) { + if (stride & state->format_align_req_minus_1[idx]) + cmd_buffer->state.vbo_misaligned_mask |= bit; + else + cmd_buffer->state.vbo_misaligned_mask &= ~bit; + } + if (state->possibly_misaligned_mask & bit && + (vb[idx].offset + state->offsets[idx]) & state->format_align_req_minus_1[idx]) + cmd_buffer->state.vbo_misaligned_mask |= bit; + } + } + + if (pStrides) + vb[idx].stride = stride; if (buffer) { radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, vb[idx].buffer->bo); @@ -3961,7 +4478,7 @@ radv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBindi return; } - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_STATE; } static uint32_t @@ -4245,6 +4762,16 @@ radv_EndCommandBuffer(VkCommandBuffer commandBuffer) */ cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits; + /* Flush noncoherent images on GFX9+ so we can assume they're clean on the start of a + * command buffer. + */ + if (cmd_buffer->state.rb_noncoherent_dirty && + cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) + cmd_buffer->state.flush_bits |= radv_src_access_flush( + cmd_buffer, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + NULL); + /* Since NGG streamout uses GDS, we need to make GDS idle when * we leave the IB, otherwise another process might overwrite * it while our shaders are busy. 
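
The pStrides handling above matters because the parameter is optional: an application may program strides once via vkCmdSetVertexInputEXT and then bind buffers with pStrides = NULL, which must not clobber them. Typical usage:

   #include <vulkan/vulkan.h>

   /* Bind one vertex buffer while keeping the strides previously programmed
    * by vkCmdSetVertexInputEXT (pSizes and pStrides may be NULL). */
   static void bind_vb_keep_strides(VkCommandBuffer cmd, VkBuffer buffer)
   {
      const VkDeviceSize offset = 0;
      vkCmdBindVertexBuffers2EXT(cmd, 0, 1, &buffer, &offset,
                                 NULL /* pSizes */, NULL /* pStrides */);
   }
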
@@ -4267,7 +4794,7 @@ radv_EndCommandBuffer(VkCommandBuffer commandBuffer) VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs); if (result != VK_SUCCESS) - return vk_error(cmd_buffer->device->instance, result); + return vk_error(cmd_buffer, result); cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE; @@ -4334,6 +4861,7 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR | VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR | VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR); + radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size); break; case VK_PIPELINE_BIND_POINT_GRAPHICS: if (cmd_buffer->state.pipeline == pipeline) @@ -4348,7 +4876,7 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline if (!pipeline) break; - cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE; + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT; cmd_buffer->push_constant_stages |= pipeline->active_stages; /* the new vertex shader might not have the same user regs */ @@ -4905,6 +5433,87 @@ radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmen state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE; } +void +radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount, + const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions, + uint32_t vertexAttributeDescriptionCount, + const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; + + const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS]; + for (unsigned i = 0; i < vertexBindingDescriptionCount; i++) + bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i]; + + cmd_buffer->state.vbo_misaligned_mask = 0; + + state->attribute_mask = 0; + state->misaligned_mask = 0; + state->possibly_misaligned_mask = 0; + state->instance_rate_inputs = 0; + state->nontrivial_divisors = 0; + state->post_shuffle = 0; + state->alpha_adjust_lo = 0; + state->alpha_adjust_hi = 0; + + enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class; + for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) { + const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i]; + const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding]; + unsigned loc = attrib->location; + const struct util_format_description *format_desc = vk_format_description(attrib->format); + unsigned nfmt, dfmt; + bool post_shuffle; + enum radv_vs_input_alpha_adjust alpha_adjust; + + state->attribute_mask |= 1u << loc; + state->bindings[loc] = attrib->binding; + if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) { + state->instance_rate_inputs |= 1u << loc; + state->divisors[loc] = binding->divisor; + if (binding->divisor != 1) + state->nontrivial_divisors |= 1u << loc; + } + cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride; + state->offsets[loc] = attrib->offset; + + radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc, + &dfmt, &nfmt, &post_shuffle, &alpha_adjust); + + state->formats[loc] = dfmt | (nfmt << 4); + const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 
3 : + (format_desc->block.bits / 8u - 1); + state->format_align_req_minus_1[loc] = format_align_req_minus_1; + state->format_sizes[loc] = format_desc->block.bits / 8u; + + if (chip == GFX6 || chip >= GFX10) { + struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings; + unsigned bit = 1u << loc; + if (binding->stride & format_align_req_minus_1) { + state->misaligned_mask |= bit; + if (cmd_buffer->state.vbo_bound_mask & bit) + cmd_buffer->state.vbo_misaligned_mask |= bit; + } else { + state->possibly_misaligned_mask |= bit; + if (cmd_buffer->state.vbo_bound_mask & bit && + ((vb[attrib->binding].offset + state->offsets[loc]) & format_align_req_minus_1)) + cmd_buffer->state.vbo_misaligned_mask |= bit; + } + } + + if (alpha_adjust) { + state->alpha_adjust_lo |= (alpha_adjust & 0x1) << loc; + state->alpha_adjust_hi |= (alpha_adjust >> 1) << loc; + } + + if (post_shuffle) + state->post_shuffle |= 1u << loc; + } + + cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_STATE; +} + void radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCmdBuffers) @@ -5028,7 +5637,7 @@ radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateI pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL); @@ -5109,7 +5718,7 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096); - radv_subpass_barrier(cmd_buffer, &subpass->start_barrier); + radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier); radv_cmd_buffer_set_subpass(cmd_buffer, subpass); @@ -5133,45 +5742,42 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa */ int ds_idx = subpass->depth_stencil_attachment->attachment; struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview; + struct radv_image *ds_image = ds_iview->image; VkExtent2D extent = { - .width = ds_iview->image->info.width, - .height = ds_iview->image->info.height, + .width = ds_image->info.width, + .height = ds_image->info.height, }; + /* HTILE buffer */ + uint64_t htile_offset = ds_image->offset + ds_image->planes[0].surface.meta_offset; + uint64_t htile_size = ds_image->planes[0].surface.meta_slice_size; + struct radv_buffer htile_buffer; + + radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bo, htile_size, htile_offset); + /* Copy the VRS rates to the HTILE buffer. */ - radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_iview->image); + radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true); + + radv_buffer_finish(&htile_buffer); } else { /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have * to copy the VRS rates to our internal HTILE buffer. 
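
The mask computed above encodes the fetch alignment each attribute needs: formats whose first channel is 32 bits or wider only need 4-byte alignment, everything else needs its full texel size. A standalone restatement with a worked example (values illustrative):

   #include <stdbool.h>
   #include <stdint.h>

   static uint32_t align_req_minus_1(unsigned first_channel_bits, unsigned texel_bytes)
   {
      return first_channel_bits >= 32 ? 3 : texel_bytes - 1;
   }

   static bool attribute_misaligned(uint32_t binding_offset, uint32_t attrib_offset,
                                    uint32_t stride, uint32_t mask)
   {
      /* A misaligned stride or start address forces the slower prolog path. */
      return (stride & mask) || ((binding_offset + attrib_offset) & mask);
   }

   /* Example: VK_FORMAT_R16G16B16A16_SFLOAT has 16-bit channels and an 8-byte
    * texel, so mask == 7; a stride of 20 trips the check, 24 does not. */
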
*/ struct radv_framebuffer *fb = cmd_buffer->state.framebuffer; struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer); - uint32_t htile_value; if (ds_image) { - htile_value = radv_get_htile_initial_value(cmd_buffer->device, ds_image); + /* HTILE buffer */ + struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer; VkExtent2D extent = { .width = MIN2(fb->width, ds_image->info.width), .height = MIN2(fb->height, ds_image->info.height), }; - /* Clear the HTILE buffer before copying VRS rates because it's a read-modify-write - * operation. - */ - VkImageSubresourceRange range = { - .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = 1, - }; - - cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, ds_image, &range, htile_value); - /* Copy the VRS rates to the HTILE buffer. */ - radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image); + radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false); } } } @@ -5183,6 +5789,40 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa assert(cmd_buffer->cs->cdw <= cdw_max); } +static void +radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer) +{ + const struct radv_subpass *subpass = cmd_buffer->state.subpass; + + /* Have to be conservative in cmdbuffers with inherited attachments. */ + if (!cmd_buffer->state.attachments) { + cmd_buffer->state.rb_noncoherent_dirty = true; + return; + } + + for (uint32_t i = 0; i < subpass->color_count; ++i) { + const uint32_t a = subpass->color_attachments[i].attachment; + if (a == VK_ATTACHMENT_UNUSED) + continue; + if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) { + cmd_buffer->state.rb_noncoherent_dirty = true; + return; + } + } + if (subpass->depth_stencil_attachment && + !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment] + .iview->image->l2_coherent) + cmd_buffer->state.rb_noncoherent_dirty = true; +} + +void +radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer, + const struct radv_subpass *subpass) +{ + radv_mark_noncoherent_rb(cmd_buffer); + radv_cmd_buffer_set_subpass(cmd_buffer, subpass); +} + static void radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer) { @@ -5251,6 +5891,8 @@ radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pS { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + radv_mark_noncoherent_rb(cmd_buffer); + uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer); radv_cmd_buffer_end_subpass(cmd_buffer); radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1); @@ -5666,7 +6308,7 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, /* Index, vertex and streamout buffers don't change context regs, and * pipeline is already handled. */ - used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | + used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_STATE | RADV_CMD_DIRTY_STREAMOUT_BUFFER | RADV_CMD_DIRTY_PIPELINE); if (cmd_buffer->state.dirty & used_states) @@ -5699,7 +6341,7 @@ radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt, * When tessellation is used, what matters is the number of tessellated * vertices, so let's always assume it's not a small draw. 
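 * (The vertex-count cutoff below is the threshold under which culling is
 * skipped for direct draws.)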
*/ - return !has_tess && !indirect && vtx_cnt < 512; + return !has_tess && !indirect && vtx_cnt < 128; } ALWAYS_INLINE static uint32_t @@ -5872,7 +6514,8 @@ radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct rad } static void -radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info) +radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, + bool pipeline_is_dirty) { bool late_scissor_emission; @@ -5880,7 +6523,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline) radv_emit_rbplus_state(cmd_buffer); - if ((cmd_buffer->device->instance->perftest_flags & RADV_PERFTEST_NGGC) && + if (cmd_buffer->device->physical_device->use_ngg_culling && cmd_buffer->state.pipeline->graphics.is_ngg) radv_emit_ngg_culling_state(cmd_buffer, info); @@ -5909,7 +6552,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct r } } - radv_cmd_buffer_flush_dynamic_state(cmd_buffer); + radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty); radv_emit_draw_registers(cmd_buffer, info); @@ -5958,7 +6601,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info * the CUs are idle is very short. (there are only SET_SH * packets between the wait and the draw) */ - radv_emit_all_graphics_states(cmd_buffer, info); + radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); si_emit_cache_flush(cmd_buffer); /* <-- CUs are idle here --> */ @@ -5978,7 +6621,7 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty); - radv_emit_all_graphics_states(cmd_buffer, info); + radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty); } radv_describe_draw(cmd_buffer); @@ -6228,8 +6871,8 @@ struct radv_dispatch_info { /** * Indirect compute parameters resource. 
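 * Carried as a raw winsys BO together with the GPU VA of the x/y/z
 * dispatch parameters, so callers resolve buffer offsets up front.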
*/ - struct radv_buffer *indirect; - uint64_t indirect_offset; + struct radeon_winsys_bo *indirect; + uint64_t va; }; static void @@ -6255,19 +6898,15 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel } if (info->indirect) { - uint64_t va = radv_buffer_get_va(info->indirect->bo); - - va += info->indirect->offset + info->indirect_offset; - - radv_cs_add_buffer(ws, cs, info->indirect->bo); + radv_cs_add_buffer(ws, cs, info->indirect); if (loc->sgpr_idx != -1) { for (unsigned i = 0; i < 3; ++i) { radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG)); - radeon_emit(cs, (va + 4 * i)); - radeon_emit(cs, (va + 4 * i) >> 32); + radeon_emit(cs, (info->va + 4 * i)); + radeon_emit(cs, (info->va + 4 * i) >> 32); radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i); radeon_emit(cs, 0); } @@ -6275,14 +6914,14 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipel if (radv_cmd_buffer_uses_mec(cmd_buffer)) { radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(cs, info->va); + radeon_emit(cs, info->va >> 32); radeon_emit(cs, dispatch_initiator); } else { radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); radeon_emit(cs, 1); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(cs, info->va); + radeon_emit(cs, info->va >> 32); radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1)); radeon_emit(cs, 0); @@ -6479,8 +7118,8 @@ radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDevi RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); struct radv_dispatch_info info = {0}; - info.indirect = buffer; - info.indirect_offset = offset; + info.indirect = buffer->bo; + info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset; radv_compute_dispatch(cmd_buffer, &info); } @@ -6498,6 +7137,17 @@ radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t radv_compute_dispatch(cmd_buffer, &info); } +void +radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va) +{ + struct radv_dispatch_info info = {0}; + + info.indirect = bo; + info.va = va; + + radv_compute_dispatch(cmd_buffer, &info); +} + static void radv_rt_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) { @@ -6518,22 +7168,12 @@ radv_rt_bind_tables(struct radv_cmd_buffer *cmd_buffer, if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 64, &offset, &ptr)) return false; - /* For the descriptor format. 
*/ - assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10); - desc_ptr = ptr; for (unsigned i = 0; i < 4; ++i, desc_ptr += 4) { - uint32_t rsrc_word3 = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | - S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED) | S_008F0C_RESOURCE_LEVEL(1); - desc_ptr[0] = tables[i].deviceAddress; - desc_ptr[1] = S_008F04_BASE_ADDRESS_HI(tables[i].deviceAddress >> 32) | - S_008F04_STRIDE(tables[i].stride); - desc_ptr[2] = 0xffffffffu; - desc_ptr[3] = rsrc_word3; + desc_ptr[1] = tables[i].deviceAddress >> 32; + desc_ptr[2] = tables[i].stride; + desc_ptr[3] = 0; } uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; @@ -6575,9 +7215,48 @@ radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, return; } + struct radv_userdata_info *loc = radv_lookup_user_sgpr( + cmd_buffer->state.rt_pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE); + + if (loc->sgpr_idx != -1) { + assert(loc->num_sgprs == 3); + + radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3); + radeon_emit(cmd_buffer->cs, width); + radeon_emit(cmd_buffer->cs, height); + radeon_emit(cmd_buffer->cs, depth); + } + radv_rt_dispatch(cmd_buffer, &info); } +static void +radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size) +{ + unsigned wave_size = 0; + unsigned scratch_bytes_per_wave = 0; + + if (cmd_buffer->state.rt_pipeline) { + scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->scratch_bytes_per_wave; + wave_size = cmd_buffer->state.rt_pipeline->shaders[MESA_SHADER_COMPUTE]->info.wave_size; + } + + /* The hardware register is specified as a multiple of 256 DWORDS. 
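+ * (256 dwords = 1024 bytes, hence the 1024-byte alignment below; size is
+ * per-lane, so it is scaled by wave_size first.)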
*/ + scratch_bytes_per_wave += align(size * wave_size, 1024); + + cmd_buffer->compute_scratch_size_per_wave_needed = + MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave); +} + +void +radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + + radv_set_rt_stack_size(cmd_buffer, size); + cmd_buffer->state.rt_stack_size = size; +} + void radv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer) { @@ -6596,7 +7275,9 @@ radv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pS { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier); + radv_mark_noncoherent_rb(cmd_buffer); + + radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier); radv_cmd_buffer_end_subpass(cmd_buffer); @@ -6676,7 +7357,7 @@ radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct ra cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; - radv_decompress_depth_stencil(cmd_buffer, image, range, sample_locs); + radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs); cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META; @@ -6978,6 +7659,9 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount, enum radv_cmd_flush_bits src_flush_bits = 0; enum radv_cmd_flush_bits dst_flush_bits = 0; + if (cmd_buffer->state.subpass) + radv_mark_noncoherent_rb(cmd_buffer); + radv_describe_barrier_start(cmd_buffer, info->reason); for (unsigned i = 0; i < info->eventCount; ++i) { diff --git a/mesa 3D driver/src/amd/vulkan/radv_constants.h b/mesa 3D driver/src/amd/vulkan/radv_constants.h index 327365c91f..c3f057d8ff 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_constants.h +++ b/mesa 3D driver/src/amd/vulkan/radv_constants.h @@ -53,7 +53,7 @@ #define MAX_INLINE_UNIFORM_BLOCK_COUNT 64 #define MAX_BIND_POINTS 3 /* compute + graphics + raytracing */ -#define NUM_DEPTH_CLEAR_PIPELINES 3 +#define NUM_DEPTH_CLEAR_PIPELINES 2 #define NUM_DEPTH_DECOMPRESS_PIPELINES 3 /* @@ -90,4 +90,16 @@ /* Number of invocations in each subgroup. */ #define RADV_SUBGROUP_SIZE 64 +/* The spec requires this to be 32. */ +#define RADV_RT_HANDLE_SIZE 32 + +#define RADV_MAX_HIT_ATTRIB_SIZE 32 + +#define RADV_SHADER_ALLOC_ALIGNMENT 256 +#define RADV_SHADER_ALLOC_MIN_ARENA_SIZE (256 * 1024) +#define RADV_SHADER_ALLOC_MIN_SIZE_CLASS 8 +#define RADV_SHADER_ALLOC_MAX_SIZE_CLASS 15 +#define RADV_SHADER_ALLOC_NUM_FREE_LISTS \ + (RADV_SHADER_ALLOC_MAX_SIZE_CLASS - RADV_SHADER_ALLOC_MIN_SIZE_CLASS + 1) + #endif /* RADV_CONSTANTS_H */ diff --git a/mesa 3D driver/src/amd/vulkan/radv_debug.c b/mesa 3D driver/src/amd/vulkan/radv_debug.c index 160c5e9de5..63420c88b2 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_debug.c +++ b/mesa 3D driver/src/amd/vulkan/radv_debug.c @@ -306,7 +306,7 @@ radv_dump_annotated_shader(struct radv_shader_variant *shader, gl_shader_stage s if (!shader) return; - start_addr = radv_buffer_get_va(shader->bo) + shader->bo_offset; + start_addr = radv_shader_variant_get_va(shader); end_addr = start_addr + shader->code_size; /* See if any wave executes the shader. 
*/ @@ -499,7 +499,7 @@ radv_get_saved_pipeline(struct radv_device *device, enum ring_type ring) static void radv_dump_queue_state(struct radv_queue *queue, const char *dump_dir, FILE *f) { - enum ring_type ring = radv_queue_family_to_ring(queue->queue_family_index); + enum ring_type ring = radv_queue_family_to_ring(queue->vk.queue_family_index); struct radv_pipeline *pipeline; fprintf(f, "RING_%s:\n", ring == RING_GFX ? "GFX" : "COMPUTE"); @@ -609,7 +609,7 @@ radv_dump_device_name(struct radv_device *device, FILE *f) static void radv_dump_umr_ring(struct radv_queue *queue, FILE *f) { - enum ring_type ring = radv_queue_family_to_ring(queue->queue_family_index); + enum ring_type ring = radv_queue_family_to_ring(queue->vk.queue_family_index); struct radv_device *device = queue->device; char cmd[128]; @@ -627,7 +627,7 @@ radv_dump_umr_ring(struct radv_queue *queue, FILE *f) static void radv_dump_umr_waves(struct radv_queue *queue, FILE *f) { - enum ring_type ring = radv_queue_family_to_ring(queue->queue_family_index); + enum ring_type ring = radv_queue_family_to_ring(queue->vk.queue_family_index); struct radv_device *device = queue->device; char cmd[128]; @@ -647,7 +647,7 @@ radv_gpu_hang_occured(struct radv_queue *queue, enum ring_type ring) { struct radeon_winsys *ws = queue->device->ws; - if (!ws->ctx_wait_idle(queue->hw_ctx, ring, queue->queue_idx)) + if (!ws->ctx_wait_idle(queue->hw_ctx, ring, queue->vk.index_in_family)) return true; return false; @@ -660,7 +660,7 @@ radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_cmdbuf *cs) enum ring_type ring; uint64_t addr; - ring = radv_queue_family_to_ring(queue->queue_family_index); + ring = radv_queue_family_to_ring(queue->vk.queue_family_index); bool hang_occurred = radv_gpu_hang_occured(queue, ring); bool vm_fault_occurred = false; @@ -883,38 +883,6 @@ radv_trap_handler_finish(struct radv_device *device) } } -static struct radv_shader_variant * -radv_get_faulty_shader(struct radv_device *device, uint64_t faulty_pc) -{ - struct radv_shader_variant *shader = NULL; - - mtx_lock(&device->shader_slab_mutex); - - list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs) - { -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshadow" -#endif - list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list) - { -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - uint64_t offset = align_u64(s->bo_offset + s->code_size, 256); - uint64_t va = radv_buffer_get_va(s->bo); - - if (faulty_pc >= va + s->bo_offset && faulty_pc < va + offset) { - mtx_unlock(&device->shader_slab_mutex); - return s; - } - } - } - mtx_unlock(&device->shader_slab_mutex); - - return shader; -} - static void radv_dump_faulty_shader(struct radv_device *device, uint64_t faulty_pc) { @@ -922,11 +890,11 @@ radv_dump_faulty_shader(struct radv_device *device, uint64_t faulty_pc) uint64_t start_addr, end_addr; uint32_t instr_offset; - shader = radv_get_faulty_shader(device, faulty_pc); + shader = radv_find_shader_variant(device, faulty_pc); if (!shader) return; - start_addr = radv_buffer_get_va(shader->bo) + shader->bo_offset; + start_addr = radv_shader_variant_get_va(shader); end_addr = start_addr + shader->code_size; instr_offset = faulty_pc - start_addr; @@ -999,12 +967,12 @@ radv_dump_sq_hw_regs(struct radv_device *device) void radv_check_trap_handler(struct radv_queue *queue) { - enum ring_type ring = radv_queue_family_to_ring(queue->queue_family_index); + enum ring_type ring = 
radv_queue_family_to_ring(queue->vk.queue_family_index); struct radv_device *device = queue->device; struct radeon_winsys *ws = device->ws; /* Wait for the context to be idle in a finite time. */ - ws->ctx_wait_idle(queue->hw_ctx, ring, queue->queue_idx); + ws->ctx_wait_idle(queue->hw_ctx, ring, queue->vk.index_in_family); /* Try to detect if the trap handler has been reached by the hw by * looking at ttmp0 which should be non-zero if a shader exception diff --git a/mesa 3D driver/src/amd/vulkan/radv_debug.h b/mesa 3D driver/src/amd/vulkan/radv_debug.h index 5a0f295860..5c0dd14220 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_debug.h +++ b/mesa 3D driver/src/amd/vulkan/radv_debug.h @@ -45,23 +45,25 @@ enum { RADV_DEBUG_NO_DYNAMIC_BOUNDS = 1ull << 14, RADV_DEBUG_NO_OUT_OF_ORDER = 1ull << 15, RADV_DEBUG_INFO = 1ull << 16, - RADV_DEBUG_ERRORS = 1ull << 17, - RADV_DEBUG_STARTUP = 1ull << 18, - RADV_DEBUG_CHECKIR = 1ull << 19, - RADV_DEBUG_NOBINNING = 1ull << 20, - RADV_DEBUG_NO_NGG = 1ull << 21, - RADV_DEBUG_DUMP_META_SHADERS = 1ull << 22, - RADV_DEBUG_NO_MEMORY_CACHE = 1ull << 23, - RADV_DEBUG_DISCARD_TO_DEMOTE = 1ull << 24, - RADV_DEBUG_LLVM = 1ull << 25, - RADV_DEBUG_FORCE_COMPRESS = 1ull << 26, - RADV_DEBUG_HANG = 1ull << 27, - RADV_DEBUG_IMG = 1ull << 28, - RADV_DEBUG_NO_UMR = 1ull << 29, - RADV_DEBUG_INVARIANT_GEOM = 1ull << 30, - RADV_DEBUG_NO_DISPLAY_DCC = 1ull << 31, - RADV_DEBUG_NO_TC_COMPAT_CMASK = 1ull << 32, - RADV_DEBUG_NO_VRS_FLAT_SHADING = 1ull << 33, + RADV_DEBUG_STARTUP = 1ull << 17, + RADV_DEBUG_CHECKIR = 1ull << 18, + RADV_DEBUG_NOBINNING = 1ull << 19, + RADV_DEBUG_NO_NGG = 1ull << 20, + RADV_DEBUG_DUMP_META_SHADERS = 1ull << 21, + RADV_DEBUG_NO_MEMORY_CACHE = 1ull << 22, + RADV_DEBUG_DISCARD_TO_DEMOTE = 1ull << 23, + RADV_DEBUG_LLVM = 1ull << 24, + RADV_DEBUG_FORCE_COMPRESS = 1ull << 25, + RADV_DEBUG_HANG = 1ull << 26, + RADV_DEBUG_IMG = 1ull << 27, + RADV_DEBUG_NO_UMR = 1ull << 28, + RADV_DEBUG_INVARIANT_GEOM = 1ull << 29, + RADV_DEBUG_NO_DISPLAY_DCC = 1ull << 30, + RADV_DEBUG_NO_TC_COMPAT_CMASK = 1ull << 31, + RADV_DEBUG_NO_VRS_FLAT_SHADING = 1ull << 32, + RADV_DEBUG_NO_ATOC_DITHERING = 1ull << 33, + RADV_DEBUG_NO_NGGC = 1ull << 34, + RADV_DEBUG_DUMP_PROLOGS = 1ull << 35, }; enum { @@ -75,6 +77,7 @@ enum { RADV_PERFTEST_SAM = 1u << 7, RADV_PERFTEST_RT = 1u << 8, RADV_PERFTEST_NGGC = 1u << 9, + RADV_PERFTEST_FORCE_EMULATE_RT = 1u << 10, }; bool radv_init_trace(struct radv_device *device); diff --git a/mesa 3D driver/src/amd/vulkan/radv_descriptor_set.c b/mesa 3D driver/src/amd/vulkan/radv_descriptor_set.c index 1ea9ce607d..1b1708c3c7 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_descriptor_set.c +++ b/mesa 3D driver/src/amd/vulkan/radv_descriptor_set.c @@ -67,6 +67,9 @@ radv_mutable_descriptor_type_size_alignment(const VkMutableDescriptorTypeListVAL align = 16; break; case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + size = 32; + align = 32; + break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: size = 64; align = 32; @@ -133,7 +136,7 @@ radv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCrea set_layout = vk_zalloc2(&device->vk.alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!set_layout) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &set_layout->base, VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT); @@ -163,7 +166,7 @@ radv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCrea if (result != VK_SUCCESS) { 
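 /* Tear down the partially initialized layout before reporting the error. */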
vk_object_base_finish(&set_layout->base); vk_free2(&device->vk.alloc, pAllocator, set_layout); - return vk_error(device->instance, result); + return vk_error(device, result); } set_layout->binding_count = num_bindings; @@ -219,6 +222,10 @@ radv_CreateDescriptorSetLayout(VkDevice _device, const VkDescriptorSetLayoutCrea alignment = 16; break; case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + set_layout->binding[b].size = 32; + binding_buffer_count = 1; + alignment = 32; + break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: /* main descriptor + fmask descriptor */ @@ -385,6 +392,9 @@ radv_GetDescriptorSetLayoutSupport(VkDevice device, descriptor_alignment = 16; break; case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + descriptor_size = 32; + descriptor_alignment = 32; + break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: descriptor_size = 64; @@ -470,7 +480,7 @@ radv_CreatePipelineLayout(VkDevice _device, const VkPipelineLayoutCreateInfo *pC layout = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*layout), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (layout == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_PIPELINE_LAYOUT); @@ -553,7 +563,7 @@ radv_descriptor_set_create(struct radv_device *device, struct radv_descriptor_po if (pool->host_memory_base) { if (pool->host_memory_end - pool->host_memory_ptr < mem_size) - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return VK_ERROR_OUT_OF_POOL_MEMORY; set = (struct radv_descriptor_set *)pool->host_memory_ptr; pool->host_memory_ptr += mem_size; @@ -562,7 +572,7 @@ radv_descriptor_set_create(struct radv_device *device, struct radv_descriptor_po set = vk_alloc2(&device->vk.alloc, NULL, mem_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!set) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } memset(set, 0, mem_size); @@ -590,7 +600,7 @@ radv_descriptor_set_create(struct radv_device *device, struct radv_descriptor_po if (!pool->host_memory_base && pool->entry_count == pool->max_entry_count) { vk_free2(&device->vk.alloc, NULL, set); - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return VK_ERROR_OUT_OF_POOL_MEMORY; } /* try to allocate linearly first, so that we don't spend @@ -619,7 +629,7 @@ radv_descriptor_set_create(struct radv_device *device, struct radv_descriptor_po if (pool->size - offset < layout_size) { vk_free2(&device->vk.alloc, NULL, set); - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return VK_ERROR_OUT_OF_POOL_MEMORY; } set->header.bo = pool->bo; set->header.mapped_ptr = (uint32_t *)(pool->mapped_ptr + offset); @@ -631,7 +641,7 @@ radv_descriptor_set_create(struct radv_device *device, struct radv_descriptor_po pool->entries[index].set = set; pool->entry_count++; } else - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return VK_ERROR_OUT_OF_POOL_MEMORY; if (layout->has_immutable_samplers) { for (unsigned i = 0; i < layout->binding_count; ++i) { @@ -739,11 +749,11 @@ radv_CreateDescriptorPool(VkDevice _device, const VkDescriptorPoolCreateInfo *pC case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: case VK_DESCRIPTOR_TYPE_SAMPLER: case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: /* 32 as we may need to align for images */ bo_size += 32 * 
pCreateInfo->pPoolSizes[i].descriptorCount; break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: bo_size += 64 * pCreateInfo->pPoolSizes[i].descriptorCount; break; @@ -787,7 +797,7 @@ radv_CreateDescriptorPool(VkDevice _device, const VkDescriptorPoolCreateInfo *pC pool = vk_alloc2(&device->vk.alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); memset(pool, 0, sizeof(*pool)); @@ -807,19 +817,19 @@ radv_CreateDescriptorPool(VkDevice _device, const VkDescriptorPoolCreateInfo *pC RADV_BO_PRIORITY_DESCRIPTOR, 0, &pool->bo); if (result != VK_SUCCESS) { radv_destroy_descriptor_pool(device, pAllocator, pool); - return vk_error(device->instance, result); + return vk_error(device, result); } pool->mapped_ptr = (uint8_t *)device->ws->buffer_map(pool->bo); if (!pool->mapped_ptr) { radv_destroy_descriptor_pool(device, pAllocator, pool); - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); } } else { pool->host_bo = vk_alloc2(&device->vk.alloc, pAllocator, bo_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!pool->host_bo) { radv_destroy_descriptor_pool(device, pAllocator, pool); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } pool->mapped_ptr = pool->host_bo; } @@ -966,6 +976,7 @@ write_buffer_descriptor(struct radv_device *device, struct radv_cmd_buffer *cmd_ if (buffer_info->range == VK_WHOLE_SIZE) range = buffer->size - buffer_info->offset; + assert(buffer->size > 0 && range > 0); /* robustBufferAccess is relaxed enough to allow this (in combination * with the alignment/size we return from vkGetBufferMemoryRequirements) @@ -1028,6 +1039,7 @@ write_dynamic_buffer_descriptor(struct radv_device *device, struct radv_descript if (buffer_info->range == VK_WHOLE_SIZE) size = buffer->size - buffer_info->offset; + assert(buffer->size > 0 && size > 0); /* robustBufferAccess is relaxed enough to allow this (in combination * with the alignment/size we return from vkGetBufferMemoryRequirements) @@ -1062,6 +1074,7 @@ write_image_descriptor(struct radv_device *device, struct radv_cmd_buffer *cmd_b } else { descriptor = &iview->descriptor; } + assert(size > 0); memcpy(dst, descriptor, size); @@ -1078,12 +1091,11 @@ write_combined_image_sampler_descriptor(struct radv_device *device, VkDescriptorType descriptor_type, const VkDescriptorImageInfo *image_info, bool has_sampler) { - RADV_FROM_HANDLE(radv_sampler, sampler, image_info->sampler); - write_image_descriptor(device, cmd_buffer, sampler_offset, dst, buffer_list, descriptor_type, image_info); /* copy over sampler state */ if (has_sampler) { + RADV_FROM_HANDLE(radv_sampler, sampler, image_info->sampler); memcpy(dst + sampler_offset / sizeof(*dst), sampler->state, 16); } } @@ -1101,7 +1113,7 @@ static void write_accel_struct(void *ptr, VkAccelerationStructureKHR _accel_struct) { RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, _accel_struct); - uint64_t va = radv_accel_struct_get_va(accel_struct); + uint64_t va = accel_struct ? 
radv_accel_struct_get_va(accel_struct) : 0; memcpy(ptr, &va, sizeof(va)); } @@ -1167,8 +1179,11 @@ radv_update_descriptor_sets(struct radv_device *device, struct radv_cmd_buffer * write_texel_buffer_descriptor(device, cmd_buffer, ptr, buffer_list, writeset->pTexelBufferView[j]); break; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + write_image_descriptor(device, cmd_buffer, 32, ptr, buffer_list, + writeset->descriptorType, writeset->pImageInfo + j); + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: write_image_descriptor(device, cmd_buffer, 64, ptr, buffer_list, writeset->descriptorType, writeset->pImageInfo + j); @@ -1298,7 +1313,7 @@ radv_CreateDescriptorUpdateTemplate(VkDevice _device, templ = vk_alloc2(&device->vk.alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!templ) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &templ->base, VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); @@ -1434,8 +1449,12 @@ radv_update_descriptor_set_with_template(struct radv_device *device, write_texel_buffer_descriptor(device, cmd_buffer, pDst, buffer_list, *(VkBufferView *)pSrc); break; - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + write_image_descriptor(device, cmd_buffer, 32, pDst, buffer_list, + templ->entry[i].descriptor_type, + (struct VkDescriptorImageInfo *)pSrc); + break; + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: write_image_descriptor(device, cmd_buffer, 64, pDst, buffer_list, templ->entry[i].descriptor_type, @@ -1494,7 +1513,7 @@ radv_CreateSamplerYcbcrConversion(VkDevice _device, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (conversion == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &conversion->base, VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION); diff --git a/mesa 3D driver/src/amd/vulkan/radv_device.c b/mesa 3D driver/src/amd/vulkan/radv_device.c index 63baee59f0..0aa03c9cd2 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_device.c +++ b/mesa 3D driver/src/amd/vulkan/radv_device.c @@ -58,12 +58,15 @@ typedef void *drmDevicePtr; #include "util/timespec.h" #include "util/u_atomic.h" #include "winsys/null/radv_null_winsys_public.h" -#include "ac_llvm_util.h" #include "git_sha1.h" #include "sid.h" #include "vk_format.h" #include "vulkan/vk_icd.h" +#ifdef LLVM_AVAILABLE +#include "ac_llvm_util.h" +#endif + /* The number of IBs per submit isn't infinite, it depends on the ring type * (ie. some initial setup needed for a submit) and the number of IBs (4 DW). * This limit is arbitrary but should be safe for now. 
Ideally, we should get @@ -99,11 +102,15 @@ radv_get_current_time(void) static uint64_t radv_get_absolute_timeout(uint64_t timeout) { - uint64_t current_time = radv_get_current_time(); + if (timeout == UINT64_MAX) { + return timeout; + } else { + uint64_t current_time = radv_get_current_time(); - timeout = MIN2(UINT64_MAX - current_time, timeout); + timeout = MIN2(UINT64_MAX - current_time, timeout); - return current_time + timeout; + return current_time + timeout; + } } static int @@ -116,8 +123,11 @@ radv_device_get_cache_uuid(enum radeon_family family, void *uuid) memset(uuid, 0, VK_UUID_SIZE); _mesa_sha1_init(&ctx); - if (!disk_cache_get_function_identifier(radv_device_get_cache_uuid, &ctx) || - !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx)) + if (!disk_cache_get_function_identifier(radv_device_get_cache_uuid, &ctx) +#ifdef LLVM_AVAILABLE + || !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx) +#endif + ) return -1; _mesa_sha1_update(&ctx, &family, sizeof(family)); @@ -301,7 +311,11 @@ radv_get_compiler_string(struct radv_physical_device *pdevice) return ""; } +#ifdef LLVM_AVAILABLE return " (LLVM " MESA_LLVM_VERSION_STRING ")"; +#else + unreachable("LLVM is not available"); +#endif } int @@ -392,8 +406,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device *ext = (struct vk_device_extension_table){ .KHR_8bit_storage = true, .KHR_16bit_storage = true, - .KHR_acceleration_structure = (device->instance->perftest_flags & RADV_PERFTEST_RT) && - device->rad_info.chip_class >= GFX10_3, + .KHR_acceleration_structure = !!(device->instance->perftest_flags & RADV_PERFTEST_RT), .KHR_bind_memory2 = true, .KHR_buffer_device_address = true, .KHR_copy_commands2 = true, @@ -411,6 +424,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device .KHR_external_memory_fd = true, .KHR_external_semaphore = true, .KHR_external_semaphore_fd = true, + .KHR_format_feature_flags2 = true, .KHR_fragment_shading_rate = device->rad_info.chip_class >= GFX10_3, .KHR_get_memory_requirements2 = true, .KHR_image_format_list = true, @@ -421,9 +435,12 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device .KHR_maintenance1 = true, .KHR_maintenance2 = true, .KHR_maintenance3 = true, + .KHR_maintenance4 = true, .KHR_multiview = true, .KHR_pipeline_executable_properties = true, + .KHR_pipeline_library = (device->instance->perftest_flags & RADV_PERFTEST_RT) && !device->use_llvm, .KHR_push_descriptor = true, + .KHR_ray_tracing_pipeline = (device->instance->perftest_flags & RADV_PERFTEST_RT) && !device->use_llvm, .KHR_relaxed_block_layout = true, .KHR_sampler_mirror_clamp_to_edge = true, .KHR_sampler_ycbcr_conversion = true, @@ -433,6 +450,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device .KHR_shader_draw_parameters = true, .KHR_shader_float16_int8 = true, .KHR_shader_float_controls = true, + .KHR_shader_integer_dot_product = true, .KHR_shader_non_semantic_info = true, .KHR_shader_subgroup_extended_types = true, .KHR_shader_subgroup_uniform_control_flow = true, @@ -486,6 +504,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device .EXT_pipeline_creation_cache_control = true, .EXT_pipeline_creation_feedback = true, .EXT_post_depth_coverage = device->rad_info.chip_class >= GFX10, + .EXT_primitive_topology_list_restart = true, .EXT_private_data = true, .EXT_provoking_vertex = true, .EXT_queue_family_foreign = true, @@ -494,7 +513,11 @@ 
radv_physical_device_get_supported_extensions(const struct radv_physical_device .EXT_sampler_filter_minmax = true, .EXT_scalar_block_layout = device->rad_info.chip_class >= GFX7, .EXT_shader_atomic_float = true, - .EXT_shader_atomic_float2 = !device->use_llvm, +#ifdef LLVM_AVAILABLE + .EXT_shader_atomic_float2 = !device->use_llvm || LLVM_VERSION_MAJOR >= 14, +#else + .EXT_shader_atomic_float2 = true, +#endif .EXT_shader_demote_to_helper_invocation = true, .EXT_shader_image_atomic_int64 = true, .EXT_shader_stencil_export = true, @@ -505,6 +528,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device .EXT_texel_buffer_alignment = true, .EXT_transform_feedback = true, .EXT_vertex_attribute_divisor = true, + .EXT_vertex_input_dynamic_state = !device->use_llvm, .EXT_ycbcr_image_arrays = true, .AMD_buffer_marker = true, .AMD_device_coherent_memory = true, @@ -553,31 +577,24 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm fd = open(path, O_RDWR | O_CLOEXEC); if (fd < 0) { - if (instance->debug_flags & RADV_DEBUG_STARTUP) - radv_logi("Could not open device '%s'", path); - - return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER); + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Could not open device %s: %m", path); } version = drmGetVersion(fd); if (!version) { close(fd); - if (instance->debug_flags & RADV_DEBUG_STARTUP) - radv_logi("Could not get the kernel driver version for device '%s'", path); - - return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, "failed to get version %s: %m", - path); + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Could not get the kernel driver version for device %s: %m", path); } if (strcmp(version->name, "amdgpu")) { drmFreeVersion(version); close(fd); - if (instance->debug_flags & RADV_DEBUG_STARTUP) - radv_logi("Device '%s' is not using the amdgpu kernel driver.", path); - - return VK_ERROR_INCOMPATIBLE_DRIVER; + return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER, + "Device '%s' is not using the AMDGPU kernel driver: %m", path); } drmFreeVersion(version); @@ -596,6 +613,8 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm struct vk_physical_device_dispatch_table dispatch_table; vk_physical_device_dispatch_table_from_entrypoints(&dispatch_table, &radv_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_physical_device_entrypoints, false); result = vk_physical_device_init(&device->vk, &instance->vk, NULL, &dispatch_table); if (result != VK_SUCCESS) { @@ -643,6 +662,13 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm device->ws->query_info(device->ws, &device->rad_info); device->use_llvm = instance->debug_flags & RADV_DEBUG_LLVM; +#ifndef LLVM_AVAILABLE + if (device->use_llvm) { + fprintf(stderr, "ERROR: LLVM compiler backend selected for radv, but LLVM support was not " + "enabled at build time.\n"); + abort(); + } +#endif snprintf(device->name, sizeof(device->name), "AMD RADV %s%s", device->rad_info.name, radv_get_compiler_string(device)); @@ -653,15 +679,12 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm goto fail_wsi; } - /* These flags affect shader compilation. */ - uint64_t shader_env_flags = (device->use_llvm ? 0 : 0x2); - /* The gpu id is already embedded in the uuid so we just pass "radv" * when creating the cache. 
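 * (The trailing 0 below is the driver-flags word; presumably nothing
 * backend-specific needs to affect the cache key beyond the uuid.)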
*/ char buf[VK_UUID_SIZE * 2 + 1]; disk_cache_format_hex_id(buf, device->cache_uuid, VK_UUID_SIZE * 2); - device->disk_cache = disk_cache_create(device->name, buf, shader_env_flags); + device->disk_cache = disk_cache_create(device->name, buf, 0); #endif if (device->rad_info.chip_class < GFX8 || device->rad_info.chip_class > GFX10) @@ -680,6 +703,13 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm device->rad_info.family != CHIP_NAVI14 && !(device->instance->debug_flags & RADV_DEBUG_NO_NGG); + device->use_ngg_culling = + device->use_ngg && + device->rad_info.max_render_backends > 1 && + (device->rad_info.chip_class >= GFX10_3 || + (device->instance->perftest_flags & RADV_PERFTEST_NGGC)) && + !(device->instance->debug_flags & RADV_DEBUG_NO_NGGC); + device->use_ngg_streamout = false; /* Determine the number of threads per wave for all stages. */ @@ -703,6 +733,8 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm radv_physical_device_get_supported_extensions(device, &device->vk.supported_extensions); + radv_get_nir_options(device); + #ifndef _WIN32 if (drm_device) { struct stat primary_stat = {0}, render_stat = {0}; @@ -797,7 +829,6 @@ static const struct debug_control radv_debug_options[] = { {"nodynamicbounds", RADV_DEBUG_NO_DYNAMIC_BOUNDS}, {"nooutoforder", RADV_DEBUG_NO_OUT_OF_ORDER}, {"info", RADV_DEBUG_INFO}, - {"errors", RADV_DEBUG_ERRORS}, {"startup", RADV_DEBUG_STARTUP}, {"checkir", RADV_DEBUG_CHECKIR}, {"nobinning", RADV_DEBUG_NOBINNING}, @@ -814,6 +845,9 @@ static const struct debug_control radv_debug_options[] = { {"nodisplaydcc", RADV_DEBUG_NO_DISPLAY_DCC}, {"notccompatcmask", RADV_DEBUG_NO_TC_COMPAT_CMASK}, {"novrsflatshading", RADV_DEBUG_NO_VRS_FLAT_SHADING}, + {"noatocdithering", RADV_DEBUG_NO_ATOC_DITHERING}, + {"nonggc", RADV_DEBUG_NO_NGGC}, + {"prologs", RADV_DEBUG_DUMP_PROLOGS}, {NULL, 0}}; const char * @@ -833,6 +867,7 @@ static const struct debug_control radv_perftest_options[] = {{"localbos", RADV_P {"sam", RADV_PERFTEST_SAM}, {"rt", RADV_PERFTEST_RT}, {"nggc", RADV_PERFTEST_NGGC}, + {"force_emulate_rt", RADV_PERFTEST_FORCE_EMULATE_RT}, {NULL, 0}}; const char * @@ -849,6 +884,7 @@ static const driOptionDescription radv_dri_options[] = { DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) + DRI_CONF_VK_XWAYLAND_WAIT_READY(true) DRI_CONF_RADV_REPORT_LLVM9_VERSION_STRING(false) DRI_CONF_RADV_ENABLE_MRT_OUTPUT_NAN_FIXUP(false) DRI_CONF_RADV_DISABLE_SHRINK_IMAGE_STORE(false) @@ -926,6 +962,7 @@ radv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, struct vk_instance_dispatch_table dispatch_table; vk_instance_dispatch_table_from_entrypoints(&dispatch_table, &radv_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints(&dispatch_table, &wsi_instance_entrypoints, false); result = vk_instance_init(&instance->vk, &radv_instance_extensions_supported, &dispatch_table, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { @@ -1241,72 +1278,12 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, vk_foreach_struct(ext, pFeatures->pNext) { + if (vk_get_physical_device_core_1_1_feature_ext(ext, &core_1_1)) + continue; + if (vk_get_physical_device_core_1_2_feature_ext(ext, &core_1_2)) + continue; + switch (ext->sType) { - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: { - VkPhysicalDeviceVariablePointersFeatures *features = (void *)ext; - CORE_FEATURE(1, 1, 
variablePointersStorageBuffer); - CORE_FEATURE(1, 1, variablePointers); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: { - VkPhysicalDeviceMultiviewFeatures *features = (VkPhysicalDeviceMultiviewFeatures *)ext; - CORE_FEATURE(1, 1, multiview); - CORE_FEATURE(1, 1, multiviewGeometryShader); - CORE_FEATURE(1, 1, multiviewTessellationShader); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: { - VkPhysicalDeviceShaderDrawParametersFeatures *features = - (VkPhysicalDeviceShaderDrawParametersFeatures *)ext; - CORE_FEATURE(1, 1, shaderDrawParameters); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { - VkPhysicalDeviceProtectedMemoryFeatures *features = - (VkPhysicalDeviceProtectedMemoryFeatures *)ext; - CORE_FEATURE(1, 1, protectedMemory); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { - VkPhysicalDevice16BitStorageFeatures *features = - (VkPhysicalDevice16BitStorageFeatures *)ext; - CORE_FEATURE(1, 1, storageBuffer16BitAccess); - CORE_FEATURE(1, 1, uniformAndStorageBuffer16BitAccess); - CORE_FEATURE(1, 1, storagePushConstant16); - CORE_FEATURE(1, 1, storageInputOutput16); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { - VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = - (VkPhysicalDeviceSamplerYcbcrConversionFeatures *)ext; - CORE_FEATURE(1, 1, samplerYcbcrConversion); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES: { - VkPhysicalDeviceDescriptorIndexingFeatures *features = - (VkPhysicalDeviceDescriptorIndexingFeatures *)ext; - CORE_FEATURE(1, 2, shaderInputAttachmentArrayDynamicIndexing); - CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayDynamicIndexing); - CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayDynamicIndexing); - CORE_FEATURE(1, 2, shaderUniformBufferArrayNonUniformIndexing); - CORE_FEATURE(1, 2, shaderSampledImageArrayNonUniformIndexing); - CORE_FEATURE(1, 2, shaderStorageBufferArrayNonUniformIndexing); - CORE_FEATURE(1, 2, shaderStorageImageArrayNonUniformIndexing); - CORE_FEATURE(1, 2, shaderInputAttachmentArrayNonUniformIndexing); - CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayNonUniformIndexing); - CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayNonUniformIndexing); - CORE_FEATURE(1, 2, descriptorBindingUniformBufferUpdateAfterBind); - CORE_FEATURE(1, 2, descriptorBindingSampledImageUpdateAfterBind); - CORE_FEATURE(1, 2, descriptorBindingStorageImageUpdateAfterBind); - CORE_FEATURE(1, 2, descriptorBindingStorageBufferUpdateAfterBind); - CORE_FEATURE(1, 2, descriptorBindingUniformTexelBufferUpdateAfterBind); - CORE_FEATURE(1, 2, descriptorBindingStorageTexelBufferUpdateAfterBind); - CORE_FEATURE(1, 2, descriptorBindingUpdateUnusedWhilePending); - CORE_FEATURE(1, 2, descriptorBindingPartiallyBound); - CORE_FEATURE(1, 2, descriptorBindingVariableDescriptorCount); - CORE_FEATURE(1, 2, runtimeDescriptorArray); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: { VkPhysicalDeviceConditionalRenderingFeaturesEXT *features = (VkPhysicalDeviceConditionalRenderingFeaturesEXT *)ext; @@ -1348,47 +1325,12 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, CORE_FEATURE(1, 2, bufferDeviceAddressMultiDevice); break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES: { - VkPhysicalDeviceBufferDeviceAddressFeatures *features = - (VkPhysicalDeviceBufferDeviceAddressFeatures *)ext; - CORE_FEATURE(1, 2, 
bufferDeviceAddress); - CORE_FEATURE(1, 2, bufferDeviceAddressCaptureReplay); - CORE_FEATURE(1, 2, bufferDeviceAddressMultiDevice); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: { VkPhysicalDeviceDepthClipEnableFeaturesEXT *features = (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext; features->depthClipEnable = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES: { - VkPhysicalDeviceHostQueryResetFeatures *features = - (VkPhysicalDeviceHostQueryResetFeatures *)ext; - CORE_FEATURE(1, 2, hostQueryReset); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES: { - VkPhysicalDevice8BitStorageFeatures *features = (VkPhysicalDevice8BitStorageFeatures *)ext; - CORE_FEATURE(1, 2, storageBuffer8BitAccess); - CORE_FEATURE(1, 2, uniformAndStorageBuffer8BitAccess); - CORE_FEATURE(1, 2, storagePushConstant8); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES: { - VkPhysicalDeviceShaderFloat16Int8Features *features = - (VkPhysicalDeviceShaderFloat16Int8Features *)ext; - CORE_FEATURE(1, 2, shaderFloat16); - CORE_FEATURE(1, 2, shaderInt8); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES: { - VkPhysicalDeviceShaderAtomicInt64Features *features = - (VkPhysicalDeviceShaderAtomicInt64Features *)ext; - CORE_FEATURE(1, 2, shaderBufferInt64Atomics); - CORE_FEATURE(1, 2, shaderSharedInt64Atomics); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT: { VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *features = (VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *)ext; @@ -1416,24 +1358,12 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->ycbcrImageArrays = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES: { - VkPhysicalDeviceUniformBufferStandardLayoutFeatures *features = - (VkPhysicalDeviceUniformBufferStandardLayoutFeatures *)ext; - CORE_FEATURE(1, 2, uniformBufferStandardLayout); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: { VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features = (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext; features->indexTypeUint8 = pdevice->rad_info.chip_class >= GFX8; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES: { - VkPhysicalDeviceImagelessFramebufferFeatures *features = - (VkPhysicalDeviceImagelessFramebufferFeatures *)ext; - CORE_FEATURE(1, 2, imagelessFramebuffer); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: { VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features = (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext; @@ -1453,12 +1383,6 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->texelBufferAlignment = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES: { - VkPhysicalDeviceTimelineSemaphoreFeatures *features = - (VkPhysicalDeviceTimelineSemaphoreFeatures *)ext; - CORE_FEATURE(1, 2, timelineSemaphore); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: { VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *features = (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *)ext; @@ -1472,26 +1396,6 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->deviceCoherentMemory = pdevice->rad_info.has_l2_uncached; break; } - case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES: { - VkPhysicalDeviceShaderSubgroupExtendedTypesFeatures *features = - (VkPhysicalDeviceShaderSubgroupExtendedTypesFeatures *)ext; - CORE_FEATURE(1, 2, shaderSubgroupExtendedTypes); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES_KHR: { - VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *features = - (VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *)ext; - CORE_FEATURE(1, 2, separateDepthStencilLayouts); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: { - radv_get_physical_device_features_1_1(pdevice, (void *)ext); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES: { - radv_get_physical_device_features_1_2(pdevice, (void *)ext); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: { VkPhysicalDeviceLineRasterizationFeaturesEXT *features = (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext; @@ -1539,14 +1443,6 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->pipelineCreationCacheControl = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR: { - VkPhysicalDeviceVulkanMemoryModelFeaturesKHR *features = - (VkPhysicalDeviceVulkanMemoryModelFeaturesKHR *)ext; - CORE_FEATURE(1, 2, vulkanMemoryModel); - CORE_FEATURE(1, 2, vulkanMemoryModelDeviceScope); - CORE_FEATURE(1, 2, vulkanMemoryModelAvailabilityVisibilityChains); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: { VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features = (VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *)ext; @@ -1676,27 +1572,65 @@ radv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT: { VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT *features = (VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT *)ext; - bool has_shader_float_minmax = pdevice->rad_info.chip_class != GFX8 && - pdevice->rad_info.chip_class != GFX9; + bool has_shader_buffer_float_minmax = ((pdevice->rad_info.chip_class == GFX6 || + pdevice->rad_info.chip_class == GFX7) && + !pdevice->use_llvm) || + pdevice->rad_info.chip_class >= GFX10; + bool has_shader_image_float_minmax = pdevice->rad_info.chip_class != GFX8 && + pdevice->rad_info.chip_class != GFX9; features->shaderBufferFloat16Atomics = false; features->shaderBufferFloat16AtomicAdd = false; features->shaderBufferFloat16AtomicMinMax = false; - features->shaderBufferFloat32AtomicMinMax = has_shader_float_minmax; - features->shaderBufferFloat64AtomicMinMax = has_shader_float_minmax; + features->shaderBufferFloat32AtomicMinMax = has_shader_buffer_float_minmax; + features->shaderBufferFloat64AtomicMinMax = has_shader_buffer_float_minmax; features->shaderSharedFloat16Atomics = false; features->shaderSharedFloat16AtomicAdd = false; features->shaderSharedFloat16AtomicMinMax = false; features->shaderSharedFloat32AtomicMinMax = true; features->shaderSharedFloat64AtomicMinMax = true; - features->shaderImageFloat32AtomicMinMax = has_shader_float_minmax; - features->sparseImageFloat32AtomicMinMax = has_shader_float_minmax; + features->shaderImageFloat32AtomicMinMax = has_shader_image_float_minmax; + features->sparseImageFloat32AtomicMinMax = has_shader_image_float_minmax; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIMITIVE_TOPOLOGY_LIST_RESTART_FEATURES_EXT: { + 
VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *features = + (VkPhysicalDevicePrimitiveTopologyListRestartFeaturesEXT *)ext; + features->primitiveTopologyListRestart = true; + features->primitiveTopologyPatchListRestart = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR: { + VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *features = + (VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *)ext; + features->shaderIntegerDotProduct = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_PIPELINE_FEATURES_KHR: { + VkPhysicalDeviceRayTracingPipelineFeaturesKHR *features = + (VkPhysicalDeviceRayTracingPipelineFeaturesKHR *)ext; + features->rayTracingPipeline = true; + features->rayTracingPipelineShaderGroupHandleCaptureReplay = false; + features->rayTracingPipelineShaderGroupHandleCaptureReplayMixed = false; + features->rayTracingPipelineTraceRaysIndirect = false; + features->rayTraversalPrimitiveCulling = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES_KHR: { + VkPhysicalDeviceMaintenance4FeaturesKHR *features = + (VkPhysicalDeviceMaintenance4FeaturesKHR *)ext; + features->maintenance4 = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_INPUT_DYNAMIC_STATE_FEATURES_EXT: { + VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT *features = + (VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT *)ext; + features->vertexInputDynamicState = true; break; } default: break; } } -#undef CORE_FEATURE } static size_t @@ -1754,7 +1688,7 @@ radv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE, .maxMemoryAllocationCount = UINT32_MAX, .maxSamplerAllocationCount = 64 * 1024, - .bufferImageGranularity = 64, /* A cache line */ + .bufferImageGranularity = 1, .sparseAddressSpaceSize = RADV_MAX_MEMORY_ALLOCATION_SIZE, /* buffer max size */ .maxBoundDescriptorSets = MAX_SETS, .maxPerStageDescriptorSamplers = max_descriptor_set_size, @@ -2031,15 +1965,13 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, }; radv_get_physical_device_properties_1_2(pdevice, &core_1_2); -#define CORE_RENAMED_PROPERTY(major, minor, ext_property, core_property) \ - memcpy(&properties->ext_property, &core_##major##_##minor.core_property, \ - sizeof(core_##major##_##minor.core_property)) - -#define CORE_PROPERTY(major, minor, property) \ - CORE_RENAMED_PROPERTY(major, minor, property, property) - vk_foreach_struct(ext, pProperties->pNext) { + if (vk_get_physical_device_core_1_1_property_ext(ext, &core_1_1)) + continue; + if (vk_get_physical_device_core_1_2_property_ext(ext, &core_1_2)) + continue; + switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: { VkPhysicalDevicePushDescriptorPropertiesKHR *properties = @@ -2047,27 +1979,6 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { - VkPhysicalDeviceIDProperties *properties = (VkPhysicalDeviceIDProperties *)ext; - CORE_PROPERTY(1, 1, deviceUUID); - CORE_PROPERTY(1, 1, driverUUID); - CORE_PROPERTY(1, 1, deviceLUID); - CORE_PROPERTY(1, 1, deviceLUIDValid); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: { - VkPhysicalDeviceMultiviewProperties *properties = - (VkPhysicalDeviceMultiviewProperties *)ext; - CORE_PROPERTY(1, 1, maxMultiviewViewCount); - CORE_PROPERTY(1, 1, 
maxMultiviewInstanceIndex); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { - VkPhysicalDevicePointClippingProperties *properties = - (VkPhysicalDevicePointClippingProperties *)ext; - CORE_PROPERTY(1, 1, pointClippingBehavior); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DISCARD_RECTANGLE_PROPERTIES_EXT: { VkPhysicalDeviceDiscardRectanglePropertiesEXT *properties = (VkPhysicalDeviceDiscardRectanglePropertiesEXT *)ext; @@ -2080,28 +1991,6 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->minImportedHostPointerAlignment = 4096; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: { - VkPhysicalDeviceSubgroupProperties *properties = (VkPhysicalDeviceSubgroupProperties *)ext; - CORE_PROPERTY(1, 1, subgroupSize); - CORE_RENAMED_PROPERTY(1, 1, supportedStages, subgroupSupportedStages); - CORE_RENAMED_PROPERTY(1, 1, supportedOperations, subgroupSupportedOperations); - CORE_RENAMED_PROPERTY(1, 1, quadOperationsInAllStages, subgroupQuadOperationsInAllStages); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: { - VkPhysicalDeviceMaintenance3Properties *properties = - (VkPhysicalDeviceMaintenance3Properties *)ext; - CORE_PROPERTY(1, 1, maxPerSetDescriptors); - CORE_PROPERTY(1, 1, maxMemoryAllocationSize); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES: { - VkPhysicalDeviceSamplerFilterMinmaxProperties *properties = - (VkPhysicalDeviceSamplerFilterMinmaxProperties *)ext; - CORE_PROPERTY(1, 2, filterMinmaxImageComponentMapping); - CORE_PROPERTY(1, 2, filterMinmaxSingleComponentFormats); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_PROPERTIES_AMD: { VkPhysicalDeviceShaderCorePropertiesAMD *properties = (VkPhysicalDeviceShaderCorePropertiesAMD *)ext; @@ -2141,40 +2030,6 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->maxVertexAttribDivisor = UINT32_MAX; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES: { - VkPhysicalDeviceDescriptorIndexingProperties *properties = - (VkPhysicalDeviceDescriptorIndexingProperties *)ext; - CORE_PROPERTY(1, 2, maxUpdateAfterBindDescriptorsInAllPools); - CORE_PROPERTY(1, 2, shaderUniformBufferArrayNonUniformIndexingNative); - CORE_PROPERTY(1, 2, shaderSampledImageArrayNonUniformIndexingNative); - CORE_PROPERTY(1, 2, shaderStorageBufferArrayNonUniformIndexingNative); - CORE_PROPERTY(1, 2, shaderStorageImageArrayNonUniformIndexingNative); - CORE_PROPERTY(1, 2, shaderInputAttachmentArrayNonUniformIndexingNative); - CORE_PROPERTY(1, 2, robustBufferAccessUpdateAfterBind); - CORE_PROPERTY(1, 2, quadDivergentImplicitLod); - CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSamplers); - CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindUniformBuffers); - CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageBuffers); - CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSampledImages); - CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageImages); - CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindInputAttachments); - CORE_PROPERTY(1, 2, maxPerStageUpdateAfterBindResources); - CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSamplers); - CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffers); - CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffersDynamic); - CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffers); - CORE_PROPERTY(1, 2, 
maxDescriptorSetUpdateAfterBindStorageBuffersDynamic); - CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSampledImages); - CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageImages); - CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindInputAttachments); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: { - VkPhysicalDeviceProtectedMemoryProperties *properties = - (VkPhysicalDeviceProtectedMemoryProperties *)ext; - CORE_PROPERTY(1, 1, protectedNoFault); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT: { VkPhysicalDeviceConservativeRasterizationPropertiesEXT *properties = (VkPhysicalDeviceConservativeRasterizationPropertiesEXT *)ext; @@ -2200,14 +2055,6 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, break; } #endif - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES: { - VkPhysicalDeviceDriverProperties *properties = (VkPhysicalDeviceDriverProperties *)ext; - CORE_PROPERTY(1, 2, driverID); - CORE_PROPERTY(1, 2, driverName); - CORE_PROPERTY(1, 2, driverInfo); - CORE_PROPERTY(1, 2, conformanceVersion); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: { VkPhysicalDeviceTransformFeedbackPropertiesEXT *properties = (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext; @@ -2247,15 +2094,6 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->variableSampleLocations = false; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES: { - VkPhysicalDeviceDepthStencilResolveProperties *properties = - (VkPhysicalDeviceDepthStencilResolveProperties *)ext; - CORE_PROPERTY(1, 2, supportedDepthResolveModes); - CORE_PROPERTY(1, 2, supportedStencilResolveModes); - CORE_PROPERTY(1, 2, independentResolveNone); - CORE_PROPERTY(1, 2, independentResolve); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: { VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *properties = (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *)ext; @@ -2265,34 +2103,6 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, properties->uniformTexelBufferOffsetSingleTexelAlignment = true; break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES: { - VkPhysicalDeviceFloatControlsProperties *properties = - (VkPhysicalDeviceFloatControlsProperties *)ext; - CORE_PROPERTY(1, 2, denormBehaviorIndependence); - CORE_PROPERTY(1, 2, roundingModeIndependence); - CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat16); - CORE_PROPERTY(1, 2, shaderDenormPreserveFloat16); - CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat16); - CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat16); - CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat16); - CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat32); - CORE_PROPERTY(1, 2, shaderDenormPreserveFloat32); - CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat32); - CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat32); - CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat32); - CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat64); - CORE_PROPERTY(1, 2, shaderDenormPreserveFloat64); - CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat64); - CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat64); - CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat64); - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES: { - VkPhysicalDeviceTimelineSemaphoreProperties *properties = - (VkPhysicalDeviceTimelineSemaphoreProperties *)ext; - 
CORE_PROPERTY(1, 2, maxTimelineSemaphoreValueDifference); - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: { VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *props = (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *)ext; @@ -2308,12 +2118,6 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, } break; } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES: - radv_get_physical_device_properties_1_1(pdevice, (void *)ext); - break; - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES: - radv_get_physical_device_properties_1_2(pdevice, (void *)ext); - break; case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: { VkPhysicalDeviceLineRasterizationPropertiesEXT *props = (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext; @@ -2404,6 +2208,64 @@ radv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, props->maxMultiDrawCount = 2048; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR: { + VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *props = + (VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *)ext; + + bool accel = pdevice->rad_info.has_accelerated_dot_product; + + props->integerDotProduct8BitUnsignedAccelerated = accel; + props->integerDotProduct8BitSignedAccelerated = accel; + props->integerDotProduct8BitMixedSignednessAccelerated = false; + props->integerDotProduct4x8BitPackedUnsignedAccelerated = accel; + props->integerDotProduct4x8BitPackedSignedAccelerated = accel; + props->integerDotProduct4x8BitPackedMixedSignednessAccelerated = false; + props->integerDotProduct16BitUnsignedAccelerated = accel; + props->integerDotProduct16BitSignedAccelerated = accel; + props->integerDotProduct16BitMixedSignednessAccelerated = false; + props->integerDotProduct32BitUnsignedAccelerated = false; + props->integerDotProduct32BitSignedAccelerated = false; + props->integerDotProduct32BitMixedSignednessAccelerated = false; + props->integerDotProduct64BitUnsignedAccelerated = false; + props->integerDotProduct64BitSignedAccelerated = false; + props->integerDotProduct64BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = accel; + props->integerDotProductAccumulatingSaturating8BitSignedAccelerated = accel; + props->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = accel; + props->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = accel; + props->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = + false; + props->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = accel; + props->integerDotProductAccumulatingSaturating16BitSignedAccelerated = accel; + props->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false; + props->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false; + props->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false; + props->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false; + props->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false; + props->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_PIPELINE_PROPERTIES_KHR: { + 
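/*
 * Editor's note (illustrative, not part of the original patch): the handle
 * size and alignment reported in this case drive how applications lay out
 * their shader binding tables. Assuming a record that appends
 * `user_data_size` bytes of inline data after the group handle, a typical
 * SBT record stride would be computed roughly as:
 *
 *    uint32_t stride = align(props->shaderGroupHandleSize + user_data_size,
 *                            props->shaderGroupHandleAlignment);
 *
 * where align() rounds up to the next multiple, and the SBT base address
 * must additionally satisfy shaderGroupBaseAlignment.
 */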
VkPhysicalDeviceRayTracingPipelinePropertiesKHR *props = + (VkPhysicalDeviceRayTracingPipelinePropertiesKHR *)ext; + props->shaderGroupHandleSize = RADV_RT_HANDLE_SIZE; + props->maxRayRecursionDepth = 31; /* Minimum allowed for DXR. */ + props->maxShaderGroupStride = 16384; /* dummy */ + props->shaderGroupBaseAlignment = 16; + props->shaderGroupHandleCaptureReplaySize = 16; + props->maxRayDispatchInvocationCount = 1024 * 1024 * 64; + props->shaderGroupHandleAlignment = 16; + props->maxRayHitAttributeSize = RADV_MAX_HIT_ATTRIB_SIZE; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_PROPERTIES_KHR: { + VkPhysicalDeviceMaintenance4PropertiesKHR *properties = + (VkPhysicalDeviceMaintenance4PropertiesKHR *)ext; + properties->maxBufferSize = RADV_MAX_MEMORY_ALLOCATION_SIZE; + break; + } default: break; } @@ -2693,26 +2555,25 @@ radv_get_queue_global_priority(const VkDeviceQueueGlobalPriorityCreateInfoEXT *p } static int -radv_queue_init(struct radv_device *device, struct radv_queue *queue, uint32_t queue_family_index, - int idx, VkDeviceQueueCreateFlags flags, +radv_queue_init(struct radv_device *device, struct radv_queue *queue, + int idx, const VkDeviceQueueCreateInfo *create_info, const VkDeviceQueueGlobalPriorityCreateInfoEXT *global_priority) { queue->device = device; - queue->queue_family_index = queue_family_index; - queue->queue_idx = idx; queue->priority = radv_get_queue_global_priority(global_priority); - queue->flags = flags; queue->hw_ctx = device->hw_ctx[queue->priority]; - vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, idx); + if (result != VK_SUCCESS) + return result; list_inithead(&queue->pending_submissions); mtx_init(&queue->pending_mutex, mtx_plain); mtx_init(&queue->thread_mutex, mtx_plain); if (u_cnd_monotonic_init(&queue->thread_cond)) { - vk_object_base_finish(&queue->base); - return vk_error(device->instance, VK_ERROR_INITIALIZATION_FAILED); + vk_queue_finish(&queue->vk); + return vk_error(device, VK_ERROR_INITIALIZATION_FAILED); } queue->cond_created = true; @@ -2760,7 +2621,7 @@ radv_queue_finish(struct radv_queue *queue) if (queue->compute_scratch_bo) queue->device->ws->buffer_destroy(queue->device->ws, queue->compute_scratch_bo); - vk_object_base_finish(&queue->base); + vk_queue_finish(&queue->vk); } static void @@ -2770,24 +2631,6 @@ radv_device_init_gs_info(struct radv_device *device) device->physical_device->rad_info.family); } -static VkResult -check_physical_device_features(VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceFeatures *features) -{ - RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); - VkPhysicalDeviceFeatures supported_features; - radv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); - VkBool32 *supported_feature = (VkBool32 *)&supported_features; - VkBool32 *enabled_feature = (VkBool32 *)features; - unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); - for (uint32_t i = 0; i < num_features; i++) { - if (enabled_feature[i] && !supported_feature[i]) - return vk_error(physical_device->instance, VK_ERROR_FEATURE_NOT_PRESENT); - } - - return VK_SUCCESS; -} - static VkResult radv_device_init_border_color(struct radv_device *device) { @@ -2799,15 +2642,15 @@ radv_device_init_border_color(struct radv_device *device) RADV_BO_PRIORITY_SHADER, 0, &device->border_color_data.bo); if (result != VK_SUCCESS) - return vk_error(device->physical_device->instance, result); + return 
vk_error(device, result); result = device->ws->buffer_make_resident(device->ws, device->border_color_data.bo, true); if (result != VK_SUCCESS) - return vk_error(device->physical_device->instance, result); + return vk_error(device, result); device->border_color_data.colors_gpu_ptr = device->ws->buffer_map(device->border_color_data.bo); if (!device->border_color_data.colors_gpu_ptr) - return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); mtx_init(&device->border_color_data.mutex, mtx_plain); return VK_SUCCESS; @@ -2824,15 +2667,100 @@ radv_device_finish_border_color(struct radv_device *device) } } +static VkResult +radv_device_init_vs_prologs(struct radv_device *device) +{ + u_rwlock_init(&device->vs_prologs_lock); + device->vs_prologs = _mesa_hash_table_create(NULL, &radv_hash_vs_prolog, &radv_cmp_vs_prolog); + if (!device->vs_prologs) + return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + + /* don't pre-compile prologs if we want to print them */ + if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS) + return VK_SUCCESS; + + struct radv_vs_input_state state; + state.nontrivial_divisors = 0; + memset(state.offsets, 0, sizeof(state.offsets)); + state.alpha_adjust_lo = 0; + state.alpha_adjust_hi = 0; + memset(state.formats, 0, sizeof(state.formats)); + + struct radv_vs_prolog_key key; + key.state = &state; + key.misaligned_mask = 0; + key.as_ls = false; + key.is_ngg = device->physical_device->use_ngg; + key.next_stage = MESA_SHADER_VERTEX; + key.wave32 = device->physical_device->ge_wave_size == 32; + + for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) { + state.attribute_mask = BITFIELD_MASK(i); + state.instance_rate_inputs = 0; + + key.num_attributes = i; + + device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key); + if (!device->simple_vs_prologs[i - 1]) + return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + unsigned idx = 0; + for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) { + state.attribute_mask = BITFIELD_MASK(num_attributes); + + for (unsigned i = 0; i < num_attributes; i++) + state.divisors[i] = 1; + + for (unsigned count = 1; count <= num_attributes; count++) { + for (unsigned start = 0; start <= (num_attributes - count); start++) { + state.instance_rate_inputs = u_bit_consecutive(start, count); + + key.num_attributes = num_attributes; + + struct radv_shader_prolog *prolog = radv_create_vs_prolog(device, &key); + if (!prolog) + return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + + assert(idx == + radv_instance_rate_prolog_index(num_attributes, state.instance_rate_inputs)); + device->instance_rate_vs_prologs[idx++] = prolog; + } + } + } + assert(idx == ARRAY_SIZE(device->instance_rate_vs_prologs)); + + return VK_SUCCESS; +} + +static void +radv_device_finish_vs_prologs(struct radv_device *device) +{ + if (device->vs_prologs) { + hash_table_foreach(device->vs_prologs, entry) + { + free((void *)entry->key); + radv_prolog_destroy(device, entry->data); + } + _mesa_hash_table_destroy(device->vs_prologs, NULL); + } + + for (unsigned i = 0; i < ARRAY_SIZE(device->simple_vs_prologs); i++) + radv_prolog_destroy(device, device->simple_vs_prologs[i]); + + for (unsigned i = 0; i < ARRAY_SIZE(device->instance_rate_vs_prologs); i++) + radv_prolog_destroy(device, device->instance_rate_vs_prologs[i]); +} + VkResult -radv_device_init_vrs_image(struct radv_device 
*device) +radv_device_init_vrs_state(struct radv_device *device) { /* FIXME: 4k depth buffers should be large enough for now but we might want to adjust this - * dynamically at some point. Also, it's probably better to use S8_UINT but no HTILE support yet. + * dynamically at some point. */ uint32_t width = 4096, height = 4096; - VkMemoryRequirements mem_req; VkDeviceMemory mem; + VkBuffer buffer; VkResult result; VkImage image; @@ -2857,11 +2785,30 @@ radv_device_init_vrs_image(struct radv_device *device) if (result != VK_SUCCESS) return result; - radv_GetImageMemoryRequirements(radv_device_to_handle(device), image, &mem_req); + VkBufferCreateInfo buffer_create_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = radv_image_from_handle(image)->planes[0].surface.meta_size, + .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + result = radv_CreateBuffer(radv_device_to_handle(device), &buffer_create_info, + &device->meta_state.alloc, &buffer); + if (result != VK_SUCCESS) + goto fail_create; + + VkBufferMemoryRequirementsInfo2 info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, + .buffer = buffer, + }; + VkMemoryRequirements2 mem_req = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + }; + radv_GetBufferMemoryRequirements2(radv_device_to_handle(device), &info, &mem_req); VkMemoryAllocateInfo alloc_info = { .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .allocationSize = mem_req.size, + .allocationSize = mem_req.memoryRequirements.size, }; result = radv_AllocateMemory(radv_device_to_handle(device), &alloc_info, @@ -2869,11 +2816,19 @@ radv_device_init_vrs_image(struct radv_device *device) if (result != VK_SUCCESS) goto fail_alloc; - result = radv_BindImageMemory(radv_device_to_handle(device), image, mem, 0); + VkBindBufferMemoryInfo bind_info = { + .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, + .buffer = buffer, + .memory = mem, + .memoryOffset = 0 + }; + + result = radv_BindBufferMemory2(radv_device_to_handle(device), 1, &bind_info); if (result != VK_SUCCESS) goto fail_bind; device->vrs.image = radv_image_from_handle(image); + device->vrs.buffer = radv_buffer_from_handle(buffer); device->vrs.mem = radv_device_memory_from_handle(mem); return VK_SUCCESS; @@ -2881,6 +2836,8 @@ radv_device_init_vrs_image(struct radv_device *device) fail_bind: radv_FreeMemory(radv_device_to_handle(device), mem, &device->meta_state.alloc); fail_alloc: + radv_DestroyBuffer(radv_device_to_handle(device), buffer, &device->meta_state.alloc); +fail_create: radv_DestroyImage(radv_device_to_handle(device), image, &device->meta_state.alloc); return result; @@ -2891,6 +2848,8 @@ radv_device_finish_vrs_image(struct radv_device *device) { radv_FreeMemory(radv_device_to_handle(device), radv_device_memory_to_handle(device->vrs.mem), &device->meta_state.alloc); + radv_DestroyBuffer(radv_device_to_handle(device), radv_buffer_to_handle(device->vrs.buffer), + &device->meta_state.alloc); radv_DestroyImage(radv_device_to_handle(device), radv_image_to_handle(device->vrs.image), &device->meta_state.alloc); } @@ -2905,8 +2864,7 @@ _radv_device_set_lost(struct radv_device *device, const char *file, int line, co va_start(ap, msg); err = - __vk_errorv(device->physical_device->instance, device, VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT, - VK_ERROR_DEVICE_LOST, file, line, msg, ap); + __vk_errorv(device, VK_ERROR_DEVICE_LOST, file, line, msg, ap); va_end(ap); return err; @@ -2925,15 +2883,12 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, 
const VkDeviceCreateInfo *pCr bool robust_buffer_access2 = false; bool overallocation_disallowed = false; bool custom_border_colors = false; - bool vrs_enabled = false; bool attachment_vrs_enabled = false; + bool image_float32_atomics = false; + bool vs_prologs = false; /* Check enabled features */ if (pCreateInfo->pEnabledFeatures) { - result = check_physical_device_features(physicalDevice, pCreateInfo->pEnabledFeatures); - if (result != VK_SUCCESS) - return result; - if (pCreateInfo->pEnabledFeatures->robustBufferAccess) robust_buffer_access = true; } @@ -2943,10 +2898,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr switch (ext->sType) { case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2: { const VkPhysicalDeviceFeatures2 *features = (const void *)ext; - result = check_physical_device_features(physicalDevice, &features->features); - if (result != VK_SUCCESS) - return result; - if (features->features.robustBufferAccess) robust_buffer_access = true; break; @@ -2967,8 +2918,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_FEATURES_KHR: { const VkPhysicalDeviceFragmentShadingRateFeaturesKHR *vrs = (const void *)ext; attachment_vrs_enabled = vrs->attachmentFragmentShadingRate; - vrs_enabled = vrs->pipelineFragmentShadingRate || vrs->primitiveFragmentShadingRate || - attachment_vrs_enabled; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: { @@ -2977,6 +2926,26 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr robust_buffer_access2 = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_FEATURES_EXT: { + const VkPhysicalDeviceShaderAtomicFloatFeaturesEXT *features = (const void *)ext; + if (features->shaderImageFloat32Atomics || + features->sparseImageFloat32Atomics) + image_float32_atomics = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT: { + const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT *features = (const void *)ext; + if (features->shaderImageFloat32AtomicMinMax || + features->sparseImageFloat32AtomicMinMax) + image_float32_atomics = true; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_INPUT_DYNAMIC_STATE_FEATURES_EXT: { + const VkPhysicalDeviceVertexInputDynamicStateFeaturesEXT *features = (const void *)ext; + if (features->vertexInputDynamicState) + vs_prologs = true; + break; + } default: break; } @@ -2995,6 +2964,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr } else { vk_device_dispatch_table_from_entrypoints(&dispatch_table, &radv_device_entrypoints, true); } + vk_device_dispatch_table_from_entrypoints(&dispatch_table, &wsi_device_entrypoints, false); result = vk_device_init(&device->vk, &physical_device->vk, &dispatch_table, pCreateInfo, pAllocator); @@ -3023,16 +2993,11 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr device->robust_buffer_access = robust_buffer_access || robust_buffer_access2; device->robust_buffer_access2 = robust_buffer_access2; - device->adjust_frag_coord_z = - (vrs_enabled || device->vk.enabled_extensions.KHR_fragment_shading_rate || - device->force_vrs != RADV_FORCE_VRS_NONE) && - (device->physical_device->rad_info.family == CHIP_SIENNA_CICHLID || - device->physical_device->rad_info.family == CHIP_NAVY_FLOUNDER || - device->physical_device->rad_info.family == CHIP_VANGOGH); device->attachment_vrs_enabled = 
attachment_vrs_enabled; - mtx_init(&device->shader_slab_mutex, mtx_plain); - list_inithead(&device->shader_slabs); + device->image_float32_atomics = image_float32_atomics; + + radv_init_shader_arenas(device); device->overallocation_disallowed = overallocation_disallowed; mtx_init(&device->overallocation_mutex, mtx_plain); @@ -3071,8 +3036,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr device->queue_count[qfi] = queue_create->queueCount; for (unsigned q = 0; q < queue_create->queueCount; q++) { - result = radv_queue_init(device, &device->queues[qfi][q], qfi, q, queue_create->flags, - global_priority); + result = radv_queue_init(device, &device->queues[qfi][q], q, queue_create, global_priority); if (result != VK_SUCCESS) goto fail; } @@ -3111,19 +3075,6 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr device->tess_offchip_block_dw_size = device->physical_device->rad_info.family == CHIP_HAWAII ? 4096 : 8192; - if (getenv("RADV_TRACE_FILE")) { - fprintf( - stderr, - "***********************************************************************************\n"); - fprintf( - stderr, - "* WARNING: RADV_TRACE_FILE= is deprecated and replaced by RADV_DEBUG=hang *\n"); - fprintf( - stderr, - "***********************************************************************************\n"); - abort(); - } - if (device->instance->debug_flags & RADV_DEBUG_HANG) { /* Enable GPU hangs detection and dump logs if a GPU hang is * detected. @@ -3198,6 +3149,13 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr "(valid values are 2x2, 2x1 and 1x2)\n"); } + device->adjust_frag_coord_z = + (device->vk.enabled_extensions.KHR_fragment_shading_rate || + device->force_vrs != RADV_FORCE_VRS_NONE) && + (device->physical_device->rad_info.family == CHIP_SIENNA_CICHLID || + device->physical_device->rad_info.family == CHIP_NAVY_FLOUNDER || + device->physical_device->rad_info.family == CHIP_VANGOGH); + device->keep_shader_info = keep_shader_info; result = radv_device_init_meta(device); if (result != VK_SUCCESS) @@ -3212,6 +3170,12 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr goto fail; } + if (vs_prologs) { + result = radv_device_init_vs_prologs(device); + if (result != VK_SUCCESS) + goto fail; + } + for (int family = 0; family < RADV_MAX_QUEUE_FAMILIES; ++family) { device->empty_cs[family] = device->ws->cs_create(device->ws, family); if (!device->empty_cs[family]) @@ -3278,6 +3242,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr if (device->gfx_init) device->ws->buffer_destroy(device->ws, device->gfx_init); + radv_device_finish_vs_prologs(device); radv_device_finish_border_color(device); for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) { @@ -3308,6 +3273,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) if (device->gfx_init) device->ws->buffer_destroy(device->ws, device->gfx_init); + radv_device_finish_vs_prologs(device); radv_device_finish_border_color(device); radv_device_finish_vrs_image(device); @@ -3333,7 +3299,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) radv_trap_handler_finish(device); radv_finish_trace(device); - radv_destroy_shader_slabs(device); + radv_destroy_shader_arenas(device); u_cnd_monotonic_destroy(&device->timeline_cond); @@ -3369,41 +3335,6 @@ radv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, uint32_t *p return vk_error(NULL, 
VK_ERROR_LAYER_NOT_PRESENT); } -void -radv_GetDeviceQueue2(VkDevice _device, const VkDeviceQueueInfo2 *pQueueInfo, VkQueue *pQueue) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - struct radv_queue *queue; - - queue = &device->queues[pQueueInfo->queueFamilyIndex][pQueueInfo->queueIndex]; - if (pQueueInfo->flags != queue->flags) { - /* From the Vulkan 1.1.70 spec: - * - * "The queue returned by vkGetDeviceQueue2 must have the same - * flags value from this structure as that used at device - * creation time in a VkDeviceQueueCreateInfo instance. If no - * matching flags were specified at device creation time then - * pQueue will return VK_NULL_HANDLE." - */ - *pQueue = VK_NULL_HANDLE; - return; - } - - *pQueue = radv_queue_to_handle(queue); -} - -void -radv_GetDeviceQueue(VkDevice _device, uint32_t queueFamilyIndex, uint32_t queueIndex, - VkQueue *pQueue) -{ - const VkDeviceQueueInfo2 info = - (VkDeviceQueueInfo2){.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_INFO_2, - .queueFamilyIndex = queueFamilyIndex, - .queueIndex = queueIndex}; - - radv_GetDeviceQueue2(_device, &info, pQueue); -} - static void fill_geom_tess_rings(struct radv_queue *queue, uint32_t *map, bool add_sample_positions, uint32_t esgs_ring_size, struct radeon_winsys_bo *esgs_ring_bo, @@ -3676,7 +3607,7 @@ radv_emit_graphics_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs, uint32_t size_per_wave, uint32_t waves, struct radeon_winsys_bo *scratch_bo) { - if (queue->queue_family_index != RADV_QUEUE_GENERAL) + if (queue->vk.queue_family_index != RADV_QUEUE_GENERAL) return; if (!scratch_bo) @@ -3981,7 +3912,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave, enum rgp_flush_bits sqtt_flush_bits = 0; struct radeon_cmdbuf *cs = NULL; cs = queue->device->ws->cs_create(queue->device->ws, - queue->queue_family_index ? RING_COMPUTE : RING_GFX); + queue->vk.queue_family_index ? RING_COMPUTE : RING_GFX); if (!cs) { result = VK_ERROR_OUT_OF_HOST_MEMORY; goto fail; @@ -3993,7 +3924,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave, radv_cs_add_buffer(queue->device->ws, cs, scratch_bo); /* Emit initial configuration. */ - switch (queue->queue_family_index) { + switch (queue->vk.queue_family_index) { case RADV_QUEUE_GENERAL: radv_init_graphics_state(cs, queue); break; @@ -4028,9 +3959,9 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave, if (i == 0) { si_cs_emit_cache_flush( cs, queue->device->physical_device->rad_info.chip_class, NULL, 0, - queue->queue_family_index == RING_COMPUTE && + queue->vk.queue_family_index == RING_COMPUTE && queue->device->physical_device->rad_info.chip_class >= GFX7, - (queue->queue_family_index == RADV_QUEUE_COMPUTE + (queue->vk.queue_family_index == RADV_QUEUE_COMPUTE ? 
RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) | RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE | @@ -4038,7 +3969,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave, &sqtt_flush_bits, 0); } else if (i == 1) { si_cs_emit_cache_flush(cs, queue->device->physical_device->rad_info.chip_class, NULL, 0, - queue->queue_family_index == RING_COMPUTE && + queue->vk.queue_family_index == RING_COMPUTE && queue->device->physical_device->rad_info.chip_class >= GFX7, RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2 | @@ -4146,7 +4077,7 @@ radv_get_preamble_cs(struct radv_queue *queue, uint32_t scratch_size_per_wave, if (gds_oa_bo && gds_oa_bo != queue->gds_oa_bo) queue->device->ws->buffer_destroy(queue->device->ws, gds_oa_bo); - return vk_error(queue->device->instance, result); + return vk_error(queue, result); } static VkResult @@ -4184,7 +4115,7 @@ radv_alloc_sem_counts(struct radv_device *device, struct radv_winsys_sem_counts (sizeof(*counts->syncobj) + sizeof(*counts->points)) * counts->timeline_syncobj_count); if (!counts->points) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); counts->syncobj = (uint32_t *)(counts->points + counts->timeline_syncobj_count); } @@ -4763,8 +4694,8 @@ radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission, } if (!submission->cmd_buffer_count) { - result = queue->device->ws->cs_submit(ctx, queue->queue_idx, - &queue->device->empty_cs[queue->queue_family_index], 1, + result = queue->device->ws->cs_submit(ctx, queue->vk.index_in_family, + &queue->device->empty_cs[queue->vk.queue_family_index], 1, NULL, NULL, &sem_info, false); if (result != VK_SUCCESS) goto fail; @@ -4794,7 +4725,7 @@ radv_queue_submit_deferred(struct radv_deferred_queue_submission *submission, sem_info.cs_emit_wait = j == 0; sem_info.cs_emit_signal = j + advance == submission->cmd_buffer_count; - result = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j, advance, + result = queue->device->ws->cs_submit(ctx, queue->vk.index_in_family, cs_array + j, advance, initial_preamble, continue_preamble_cs, &sem_info, can_patch); if (result != VK_SUCCESS) { @@ -4879,7 +4810,7 @@ wait_for_submission_timelines_available(struct radv_deferred_queue_submission *s uint64_t *points = malloc((sizeof(uint64_t) + sizeof(uint32_t)) * syncobj_count); if (!points) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); uint32_t *syncobj = (uint32_t *)(points + syncobj_count); @@ -4969,7 +4900,7 @@ radv_queue_trigger_submission(struct radv_deferred_queue_submission *submission, ret = thrd_create(&queue->submission_thread, radv_queue_submission_thread_run, queue); if (ret) { mtx_unlock(&queue->thread_mutex); - return vk_errorf(queue->device->instance, VK_ERROR_DEVICE_LOST, + return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "Failed to start submission thread"); } queue->thread_running = true; @@ -5015,7 +4946,8 @@ radv_queue_internal_submit(struct radv_queue *queue, struct radeon_cmdbuf *cs) return false; result = - queue->device->ws->cs_submit(ctx, queue->queue_idx, &cs, 1, NULL, NULL, &sem_info, false); + queue->device->ws->cs_submit(ctx, queue->vk.index_in_family, &cs, 1, + NULL, NULL, &sem_info, false); radv_free_sem_info(&sem_info); if (result != VK_SUCCESS) return false; @@ -5104,7 
+5036,7 @@ radv_QueueSubmit(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo *pSubm static const char * radv_get_queue_family_name(struct radv_queue *queue) { - switch (queue->queue_family_index) { + switch (queue->vk.queue_family_index) { case RADV_QUEUE_GENERAL: return "graphics"; case RADV_QUEUE_COMPUTE: @@ -5131,7 +5063,8 @@ radv_QueueWaitIdle(VkQueue _queue) mtx_unlock(&queue->pending_mutex); if (!queue->device->ws->ctx_wait_idle( - queue->hw_ctx, radv_queue_family_to_ring(queue->queue_family_index), queue->queue_idx)) { + queue->hw_ctx, radv_queue_family_to_ring(queue->vk.queue_family_index), + queue->vk.index_in_family)) { return radv_device_set_lost(queue->device, "Failed to wait for a '%s' queue " "to be idle. GPU hang ?", @@ -5141,22 +5074,6 @@ radv_QueueWaitIdle(VkQueue _queue) return VK_SUCCESS; } -VkResult -radv_DeviceWaitIdle(VkDevice _device) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - - for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) { - for (unsigned q = 0; q < device->queue_count[i]; q++) { - VkResult result = radv_QueueWaitIdle(radv_queue_to_handle(&device->queues[i][q])); - - if (result != VK_SUCCESS) - return result; - } - } - return VK_SUCCESS; -} - VkResult radv_EnumerateInstanceExtensionProperties(const char *pLayerName, uint32_t *pPropertyCount, VkExtensionProperties *pProperties) @@ -5239,6 +5156,22 @@ radv_get_memory_fd(struct radv_device *device, struct radv_device_memory *memory return device->ws->buffer_get_fd(device->ws, memory->bo, pFD); } +void +radv_device_memory_init(struct radv_device_memory *mem, struct radv_device *device, + struct radeon_winsys_bo *bo) +{ + memset(mem, 0, sizeof(*mem)); + vk_object_base_init(&device->vk, &mem->base, VK_OBJECT_TYPE_DEVICE_MEMORY); + + mem->bo = bo; +} + +void +radv_device_memory_finish(struct radv_device_memory *mem) +{ + vk_object_base_finish(&mem->base); +} + void radv_free_memory(struct radv_device *device, const VkAllocationCallbacks *pAllocator, struct radv_device_memory *mem) @@ -5264,7 +5197,7 @@ radv_free_memory(struct radv_device *device, const VkAllocationCallbacks *pAlloc mem->bo = NULL; } - vk_object_base_finish(&mem->base); + radv_device_memory_finish(mem); vk_free2(&device->vk.alloc, pAllocator, mem); } @@ -5302,11 +5235,11 @@ radv_alloc_memory(struct radv_device *device, const VkMemoryAllocateInfo *pAlloc } mem = - vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*mem), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*mem), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (mem == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - vk_object_base_init(&device->vk, &mem->base, VK_OBJECT_TYPE_DEVICE_MEMORY); + radv_device_memory_init(mem, device, NULL); if (wsi_info) { if(wsi_info->implicit_sync) @@ -5347,7 +5280,6 @@ radv_alloc_memory(struct radv_device *device, const VkMemoryAllocateInfo *pAlloc (int)(priority_float * RADV_BO_PRIORITY_APPLICATION_MAX)); mem->user_ptr = NULL; - mem->bo = NULL; #if RADV_SUPPORT_ANDROID_HARDWARE_BUFFER mem->android_hardware_buffer = NULL; @@ -5508,7 +5440,7 @@ radv_MapMemory(VkDevice _device, VkDeviceMemory _memory, VkDeviceSize offset, Vk return VK_SUCCESS; } - return vk_error(device->instance, VK_ERROR_MEMORY_MAP_FAILED); + return vk_error(device, VK_ERROR_MEMORY_MAP_FAILED); } void @@ -5538,30 +5470,23 @@ radv_InvalidateMappedMemoryRanges(VkDevice _device, uint32_t memoryRangeCount, return VK_SUCCESS; } -void 
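/*
 * Editor's note (sketch, not part of the original patch): the legacy
 * vkGetBufferMemoryRequirements entry point removed below is subsumed by
 * the *2 variant kept in this file; callers query the same data through
 * the extensible struct form, e.g. (hypothetical variables, no error
 * handling):
 *
 *    VkBufferMemoryRequirementsInfo2 info = {
 *       .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
 *       .buffer = buf,
 *    };
 *    VkMemoryRequirements2 reqs = { .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 };
 *    vkGetBufferMemoryRequirements2(dev, &info, &reqs);
 */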
-radv_GetBufferMemoryRequirements(VkDevice _device, VkBuffer _buffer, - VkMemoryRequirements *pMemoryRequirements) +static void +radv_get_buffer_memory_requirements(struct radv_device *device, + VkDeviceSize size, + VkBufferCreateFlags flags, + VkMemoryRequirements2 *pMemoryRequirements) { - RADV_FROM_HANDLE(radv_device, device, _device); - RADV_FROM_HANDLE(radv_buffer, buffer, _buffer); - - pMemoryRequirements->memoryTypeBits = + pMemoryRequirements->memoryRequirements.memoryTypeBits = (1u << device->physical_device->memory_properties.memoryTypeCount) - 1; - if (buffer->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) - pMemoryRequirements->alignment = 4096; + if (flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) + pMemoryRequirements->memoryRequirements.alignment = 4096; else - pMemoryRequirements->alignment = 16; + pMemoryRequirements->memoryRequirements.alignment = 16; - pMemoryRequirements->size = align64(buffer->size, pMemoryRequirements->alignment); -} + pMemoryRequirements->memoryRequirements.size = + align64(size, pMemoryRequirements->memoryRequirements.alignment); -void -radv_GetBufferMemoryRequirements2(VkDevice device, const VkBufferMemoryRequirementsInfo2 *pInfo, - VkMemoryRequirements2 *pMemoryRequirements) -{ - radv_GetBufferMemoryRequirements(device, pInfo->buffer, - &pMemoryRequirements->memoryRequirements); vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { @@ -5578,27 +5503,39 @@ radv_GetBufferMemoryRequirements2(VkDevice device, const VkBufferMemoryRequireme } void -radv_GetImageMemoryRequirements(VkDevice _device, VkImage _image, - VkMemoryRequirements *pMemoryRequirements) +radv_GetBufferMemoryRequirements2(VkDevice _device, const VkBufferMemoryRequirementsInfo2 *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) { RADV_FROM_HANDLE(radv_device, device, _device); - RADV_FROM_HANDLE(radv_image, image, _image); + RADV_FROM_HANDLE(radv_buffer, buffer, pInfo->buffer); - pMemoryRequirements->memoryTypeBits = - (1u << device->physical_device->memory_properties.memoryTypeCount) - 1; - - pMemoryRequirements->size = image->size; - pMemoryRequirements->alignment = image->alignment; + radv_get_buffer_memory_requirements(device, buffer->size, buffer->flags, pMemoryRequirements); } void -radv_GetImageMemoryRequirements2(VkDevice device, const VkImageMemoryRequirementsInfo2 *pInfo, +radv_GetDeviceBufferMemoryRequirementsKHR(VkDevice _device, + const VkDeviceBufferMemoryRequirementsKHR* pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + + radv_get_buffer_memory_requirements(device, pInfo->pCreateInfo->size, pInfo->pCreateInfo->flags, + pMemoryRequirements); +} + +void +radv_GetImageMemoryRequirements2(VkDevice _device, const VkImageMemoryRequirementsInfo2 *pInfo, VkMemoryRequirements2 *pMemoryRequirements) { - radv_GetImageMemoryRequirements(device, pInfo->image, &pMemoryRequirements->memoryRequirements); - + RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_image, image, pInfo->image); + pMemoryRequirements->memoryRequirements.memoryTypeBits = + (1u << device->physical_device->memory_properties.memoryTypeCount) - 1; + + pMemoryRequirements->memoryRequirements.size = image->size; + pMemoryRequirements->memoryRequirements.alignment = image->alignment; + vk_foreach_struct(ext, pMemoryRequirements->pNext) { switch (ext->sType) { @@ -5615,6 +5552,31 @@ radv_GetImageMemoryRequirements2(VkDevice device, const VkImageMemoryRequirement } } +void +radv_GetDeviceImageMemoryRequirementsKHR(VkDevice 
device, + const VkDeviceImageMemoryRequirementsKHR *pInfo, + VkMemoryRequirements2 *pMemoryRequirements) +{ + UNUSED VkResult result; + VkImage image; + + /* Determining the image size/alignment requires creating a surface, which is complicated without + * creating an image. + * TODO: Avoid creating an image. + */ + result = radv_CreateImage(device, pInfo->pCreateInfo, NULL, &image); + assert(result == VK_SUCCESS); + + VkImageMemoryRequirementsInfo2 info2 = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2, + .image = image, + }; + + radv_GetImageMemoryRequirements2(device, &info2, pMemoryRequirements); + + radv_DestroyImage(device, image, NULL); +} + void radv_GetDeviceMemoryCommitment(VkDevice device, VkDeviceMemory memory, VkDeviceSize *pCommittedMemoryInBytes) @@ -5634,12 +5596,18 @@ radv_BindBufferMemory2(VkDevice _device, uint32_t bindInfoCount, if (mem) { if (mem->alloc_size) { - VkMemoryRequirements req; + VkBufferMemoryRequirementsInfo2 info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, + .buffer = pBindInfos[i].buffer, + }; + VkMemoryRequirements2 reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + }; - radv_GetBufferMemoryRequirements(_device, pBindInfos[i].buffer, &req); + radv_GetBufferMemoryRequirements2(_device, &info, &reqs); - if (pBindInfos[i].memoryOffset + req.size > mem->alloc_size) { - return vk_errorf(device->instance, VK_ERROR_UNKNOWN, + if (pBindInfos[i].memoryOffset + reqs.memoryRequirements.size > mem->alloc_size) { + return vk_errorf(device, VK_ERROR_UNKNOWN, "Device memory object too small for the buffer.\n"); } } @@ -5653,18 +5621,6 @@ radv_BindBufferMemory2(VkDevice _device, uint32_t bindInfoCount, return VK_SUCCESS; } -VkResult -radv_BindBufferMemory(VkDevice device, VkBuffer buffer, VkDeviceMemory memory, - VkDeviceSize memoryOffset) -{ - const VkBindBufferMemoryInfo info = {.sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, - .buffer = buffer, - .memory = memory, - .memoryOffset = memoryOffset}; - - return radv_BindBufferMemory2(device, 1, &info); -} - VkResult radv_BindImageMemory2(VkDevice _device, uint32_t bindInfoCount, const VkBindImageMemoryInfo *pBindInfos) @@ -5677,12 +5633,18 @@ radv_BindImageMemory2(VkDevice _device, uint32_t bindInfoCount, if (mem) { if (mem->alloc_size) { - VkMemoryRequirements req; + VkImageMemoryRequirementsInfo2 info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2, + .image = pBindInfos[i].image, + }; + VkMemoryRequirements2 reqs = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + }; - radv_GetImageMemoryRequirements(_device, pBindInfos[i].image, &req); + radv_GetImageMemoryRequirements2(_device, &info, &reqs); - if (pBindInfos[i].memoryOffset + req.size > mem->alloc_size) { - return vk_errorf(device->instance, VK_ERROR_UNKNOWN, + if (pBindInfos[i].memoryOffset + reqs.memoryRequirements.size > mem->alloc_size) { + return vk_errorf(device, VK_ERROR_UNKNOWN, "Device memory object too small for the image.\n"); } } @@ -5697,18 +5659,6 @@ radv_BindImageMemory2(VkDevice _device, uint32_t bindInfoCount, return VK_SUCCESS; } -VkResult -radv_BindImageMemory(VkDevice device, VkImage image, VkDeviceMemory memory, - VkDeviceSize memoryOffset) -{ - const VkBindImageMemoryInfo info = {.sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO, - .image = image, - .memory = memory, - .memoryOffset = memoryOffset}; - - return radv_BindImageMemory2(device, 1, &info); -} - static bool radv_sparse_bind_has_effects(const VkBindSparseInfo *info) { @@ -5807,7
radv_CreateFence(VkDevice _device, const VkFenceCreateInfo *pCreateInfo, fence = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*fence), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!fence) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &fence->base, VK_OBJECT_TYPE_FENCE); @@ -5819,7 +5769,7 @@ radv_CreateFence(VkDevice _device, const VkFenceCreateInfo *pCreateInfo, ret = device->ws->create_syncobj(device->ws, create_signaled, &fence->permanent.syncobj); if (ret) { radv_destroy_fence(device, pAllocator, fence); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } *pFence = radv_fence_to_handle(fence); @@ -5853,7 +5803,7 @@ radv_WaitForFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences handles = malloc(sizeof(uint32_t) * fenceCount); if (!handles) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); for (uint32_t i = 0; i < fenceCount; ++i) { RADV_FROM_HANDLE(radv_fence, fence, pFences[i]); @@ -6123,7 +6073,7 @@ radv_CreateSemaphore(VkDevice _device, const VkSemaphoreCreateInfo *pCreateInfo, struct radv_semaphore *sem = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*sem), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!sem) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &sem->base, VK_OBJECT_TYPE_SEMAPHORE); @@ -6135,7 +6085,7 @@ radv_CreateSemaphore(VkDevice _device, const VkSemaphoreCreateInfo *pCreateInfo, int ret = device->ws->create_syncobj(device->ws, false, &sem->permanent.syncobj); if (ret) { radv_destroy_semaphore(device, pAllocator, sem); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } device->ws->signal_syncobj(device->ws, sem->permanent.syncobj, initial_value); sem->permanent.timeline_syncobj.max_point = initial_value; @@ -6147,7 +6097,7 @@ radv_CreateSemaphore(VkDevice _device, const VkSemaphoreCreateInfo *pCreateInfo, int ret = device->ws->create_syncobj(device->ws, false, &sem->permanent.syncobj); if (ret) { radv_destroy_semaphore(device, pAllocator, sem); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } sem->permanent.kind = RADV_SEMAPHORE_SYNCOBJ; } @@ -6243,13 +6193,13 @@ radv_WaitSemaphores(VkDevice _device, const VkSemaphoreWaitInfo *pWaitInfo, uint return radv_wait_timelines(device, pWaitInfo, abs_timeout); if (pWaitInfo->semaphoreCount > UINT32_MAX / sizeof(uint32_t)) - return vk_errorf(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY, + return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY, "semaphoreCount integer overflow"); bool wait_all = !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR); uint32_t *handles = malloc(sizeof(*handles) * pWaitInfo->semaphoreCount); if (!handles) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); for (uint32_t i = 0; i < pWaitInfo->semaphoreCount; ++i) { RADV_FROM_HANDLE(radv_semaphore, semaphore, pWaitInfo->pSemaphores[i]); @@ -6328,7 +6278,7 @@ radv_CreateEvent(VkDevice _device, const VkEventCreateInfo *pCreateInfo, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!event) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, 
VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &event->base, VK_OBJECT_TYPE_EVENT); @@ -6338,13 +6288,13 @@ radv_CreateEvent(VkDevice _device, const VkEventCreateInfo *pCreateInfo, RADV_BO_PRIORITY_FENCE, 0, &event->bo); if (result != VK_SUCCESS) { radv_destroy_event(device, pAllocator, event); - return vk_error(device->instance, result); + return vk_error(device, result); } event->map = (uint64_t *)device->ws->buffer_map(event->bo); if (!event->map) { radv_destroy_event(device, pAllocator, event); - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); } *pEvent = radv_event_to_handle(event); @@ -6396,6 +6346,26 @@ radv_ResetEvent(VkDevice _device, VkEvent _event) return VK_SUCCESS; } +void +radv_buffer_init(struct radv_buffer *buffer, struct radv_device *device, + struct radeon_winsys_bo *bo, uint64_t size, + uint64_t offset) +{ + vk_object_base_init(&device->vk, &buffer->base, VK_OBJECT_TYPE_BUFFER); + + buffer->usage = 0; + buffer->flags = 0; + buffer->bo = bo; + buffer->size = size; + buffer->offset = offset; +} + +void +radv_buffer_finish(struct radv_buffer *buffer) +{ + vk_object_base_finish(&buffer->base); +} + static void radv_destroy_buffer(struct radv_device *device, const VkAllocationCallbacks *pAllocator, struct radv_buffer *buffer) @@ -6403,7 +6373,7 @@ radv_destroy_buffer(struct radv_device *device, const VkAllocationCallbacks *pAl if ((buffer->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) && buffer->bo) device->ws->buffer_destroy(device->ws, buffer->bo); - vk_object_base_finish(&buffer->base); + radv_buffer_finish(buffer); vk_free2(&device->vk.alloc, pAllocator, buffer); } @@ -6422,14 +6392,11 @@ radv_CreateBuffer(VkDevice _device, const VkBufferCreateInfo *pCreateInfo, buffer = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - vk_object_base_init(&device->vk, &buffer->base, VK_OBJECT_TYPE_BUFFER); + radv_buffer_init(buffer, device, NULL, pCreateInfo->size, 0); - buffer->size = pCreateInfo->size; buffer->usage = pCreateInfo->usage; - buffer->bo = NULL; - buffer->offset = 0; buffer->flags = pCreateInfo->flags; buffer->shareable = @@ -6451,7 +6418,7 @@ radv_CreateBuffer(VkDevice _device, const VkBufferCreateInfo *pCreateInfo, replay_address, &buffer->bo); if (result != VK_SUCCESS) { radv_destroy_buffer(device, pAllocator, buffer); - return vk_error(device->instance, result); + return vk_error(device, result); } } @@ -6705,8 +6672,8 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff ntype = radv_translate_color_numformat(iview->vk_format, desc, vk_format_get_first_non_void_channel(iview->vk_format)); format = radv_translate_colorformat(iview->vk_format); - if (format == V_028C70_COLOR_INVALID || ntype == ~0u) - radv_finishme("Illegal color\n"); + assert(format != V_028C70_COLOR_INVALID); + swap = radv_translate_colorswap(iview->vk_format, false); endian = radv_colorformat_endian_swap(format); @@ -6848,6 +6815,31 @@ radv_calc_decompress_on_z_planes(struct radv_device *device, struct radv_image_v return max_zplanes; } +void +radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer, + struct radv_ds_buffer_info *ds) +{ + const struct radeon_surf *surf = &image->planes[0].surface; + + assert(image->vk_format == VK_FORMAT_D16_UNORM); + memset(ds, 0, 
sizeof(*ds)); + + ds->pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16); + + ds->db_z_info = S_028038_FORMAT(V_028040_Z_16) | + S_028038_SW_MODE(surf->u.gfx9.swizzle_mode) | + S_028038_ZRANGE_PRECISION(1) | + S_028038_TILE_SURFACE_ENABLE(1); + ds->db_stencil_info = S_02803C_FORMAT(V_028044_STENCIL_INVALID); + + ds->db_depth_size = S_02801C_X_MAX(image->info.width - 1) | + S_02801C_Y_MAX(image->info.height - 1); + + ds->db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8; + ds->db_htile_surface = S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) | + S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING); +} + void radv_initialise_ds_surface(struct radv_device *device, struct radv_ds_buffer_info *ds, struct radv_image_view *iview) @@ -7046,7 +7038,7 @@ radv_CreateFramebuffer(VkDevice _device, const VkFramebufferCreateInfo *pCreateI framebuffer = vk_alloc2(&device->vk.alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (framebuffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &framebuffer->base, VK_OBJECT_TYPE_FRAMEBUFFER); @@ -7355,7 +7347,7 @@ radv_CreateSampler(VkDevice _device, const VkSamplerCreateInfo *pCreateInfo, sampler = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*sampler), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!sampler) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &sampler->base, VK_OBJECT_TYPE_SAMPLER); @@ -7437,7 +7429,7 @@ radv_GetMemoryFdKHR(VkDevice _device, const VkMemoryGetFdInfoKHR *pGetFdInfo, in bool ret = radv_get_memory_fd(device, memory, pFD); if (ret == false) - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); return VK_SUCCESS; } @@ -7497,7 +7489,7 @@ radv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBi enum radeon_bo_domain domains; enum radeon_bo_flag flags; if (!device->ws->buffer_get_flags_from_fd(device->ws, fd, &domains, &flags)) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); pMemoryFdProperties->memoryTypeBits = radv_compute_valid_memory_types(device->physical_device, domains, flags); @@ -7511,7 +7503,7 @@ radv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBi * * So opaque handle types fall into the default "unsupported" case. 
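 *
 * Editor's note: for the dma-buf case handled above, an application-side
 * query would look like this (illustrative only, error handling omitted):
 *
 *    VkMemoryFdPropertiesKHR fd_props = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR,
 *    };
 *    vkGetMemoryFdPropertiesKHR(dev, VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
 *                               fd, &fd_props);
 *    // fd_props.memoryTypeBits then constrains the memoryTypeIndex passed
 *    // to vkAllocateMemory when importing the fd.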
*/ - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } @@ -7521,7 +7513,7 @@ radv_import_opaque_fd(struct radv_device *device, int fd, uint32_t *syncobj) uint32_t syncobj_handle = 0; int ret = device->ws->import_syncobj(device->ws, fd, &syncobj_handle); if (ret != 0) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); if (*syncobj) device->ws->destroy_syncobj(device->ws, *syncobj); @@ -7543,7 +7535,7 @@ radv_import_sync_fd(struct radv_device *device, int fd, uint32_t *syncobj) int ret = device->ws->create_syncobj(device->ws, create_signaled, &syncobj_handle); if (ret) { - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } else { if (fd == -1) @@ -7553,7 +7545,7 @@ radv_import_sync_fd(struct radv_device *device, int fd, uint32_t *syncobj) if (fd != -1) { int ret = device->ws->import_syncobj_from_sync_file(device->ws, syncobj_handle, fd); if (ret) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); close(fd); } @@ -7630,12 +7622,12 @@ radv_GetSemaphoreFdKHR(VkDevice _device, const VkSemaphoreGetFdInfoKHR *pGetFdIn case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: ret = device->ws->export_syncobj(device->ws, syncobj_handle, pFd); if (ret) - return vk_error(device->instance, VK_ERROR_TOO_MANY_OBJECTS); + return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS); break; case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: ret = device->ws->export_syncobj_to_sync_file(device->ws, syncobj_handle, pFd); if (ret) - return vk_error(device->instance, VK_ERROR_TOO_MANY_OBJECTS); + return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS); if (sem->temporary.kind != RADV_SEMAPHORE_NONE) { radv_destroy_semaphore_part(device, &sem->temporary); @@ -7749,12 +7741,12 @@ radv_GetFenceFdKHR(VkDevice _device, const VkFenceGetFdInfoKHR *pGetFdInfo, int case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: ret = device->ws->export_syncobj(device->ws, part->syncobj, pFd); if (ret) - return vk_error(device->instance, VK_ERROR_TOO_MANY_OBJECTS); + return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS); break; case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: ret = device->ws->export_syncobj_to_sync_file(device->ws, part->syncobj, pFd); if (ret) - return vk_error(device->instance, VK_ERROR_TOO_MANY_OBJECTS); + return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS); if (part == &fence->temporary) { radv_destroy_fence_part(device, part); diff --git a/mesa 3D driver/src/amd/vulkan/radv_formats.c b/mesa 3D driver/src/amd/vulkan/radv_formats.c index 1ad9ca1987..8d355d3a28 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_formats.c +++ b/mesa 3D driver/src/amd/vulkan/radv_formats.c @@ -36,6 +36,7 @@ #include "util/format_srgb.h" #include "util/half_float.h" #include "vulkan/util/vk_format.h" +#include "vulkan/util/vk_enum_defines.h" uint32_t radv_translate_buffer_dataformat(const struct util_format_description *desc, int first_non_void) @@ -146,6 +147,58 @@ radv_translate_buffer_numformat(const struct util_format_description *desc, int } } +void +radv_translate_vertex_format(const struct radv_physical_device *pdevice, VkFormat format, + const struct util_format_description *desc, unsigned *dfmt, + unsigned *nfmt, bool *post_shuffle, + enum radv_vs_input_alpha_adjust *alpha_adjust) +{ + assert(desc->channel[0].type != 
UTIL_FORMAT_TYPE_VOID); + *nfmt = radv_translate_buffer_numformat(desc, 0); + *dfmt = radv_translate_buffer_dataformat(desc, 0); + + *alpha_adjust = ALPHA_ADJUST_NONE; + if (pdevice->rad_info.chip_class <= GFX8 && pdevice->rad_info.family != CHIP_STONEY) { + switch (format) { + case VK_FORMAT_A2R10G10B10_SNORM_PACK32: + case VK_FORMAT_A2B10G10R10_SNORM_PACK32: + *alpha_adjust = ALPHA_ADJUST_SNORM; + break; + case VK_FORMAT_A2R10G10B10_SSCALED_PACK32: + case VK_FORMAT_A2B10G10R10_SSCALED_PACK32: + *alpha_adjust = ALPHA_ADJUST_SSCALED; + break; + case VK_FORMAT_A2R10G10B10_SINT_PACK32: + case VK_FORMAT_A2B10G10R10_SINT_PACK32: + *alpha_adjust = ALPHA_ADJUST_SINT; + break; + default: + break; + } + } + + switch (format) { + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_B8G8R8A8_SNORM: + case VK_FORMAT_B8G8R8A8_USCALED: + case VK_FORMAT_B8G8R8A8_SSCALED: + case VK_FORMAT_B8G8R8A8_UINT: + case VK_FORMAT_B8G8R8A8_SINT: + case VK_FORMAT_B8G8R8A8_SRGB: + case VK_FORMAT_A2R10G10B10_UNORM_PACK32: + case VK_FORMAT_A2R10G10B10_SNORM_PACK32: + case VK_FORMAT_A2R10G10B10_USCALED_PACK32: + case VK_FORMAT_A2R10G10B10_SSCALED_PACK32: + case VK_FORMAT_A2R10G10B10_UINT_PACK32: + case VK_FORMAT_A2R10G10B10_SINT_PACK32: + *post_shuffle = true; + break; + default: + *post_shuffle = false; + break; + } +} + uint32_t radv_translate_tex_dataformat(VkFormat format, const struct util_format_description *desc, int first_non_void) @@ -545,6 +598,8 @@ radv_is_storage_image_format_supported(struct radv_physical_device *physical_dev case V_008F14_IMG_DATA_FORMAT_4_4_4_4: /* TODO: FMASK formats. */ return true; + case V_008F14_IMG_DATA_FORMAT_5_9_9_9: + return physical_device->rad_info.chip_class >= GFX10_3; default: return false; } @@ -603,9 +658,9 @@ radv_is_filter_minmax_format_supported(VkFormat format) /* From the Vulkan spec 1.1.71: * * "The following formats must support the - * VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT feature with + * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT_KHR feature with * VK_IMAGE_TILING_OPTIMAL, if they support - * VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT." + * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR." */ /* TODO: enable more formats. */ switch (format) { @@ -638,9 +693,9 @@ radv_device_supports_etc(struct radv_physical_device *physical_device) static void radv_physical_device_get_format_properties(struct radv_physical_device *physical_device, - VkFormat format, VkFormatProperties *out_properties) + VkFormat format, VkFormatProperties3KHR *out_properties) { - VkFormatFeatureFlags linear = 0, tiled = 0, buffer = 0; + VkFormatFeatureFlags2KHR linear = 0, tiled = 0, buffer = 0; const struct util_format_description *desc = vk_format_description(format); bool blendable; bool scaled = false; @@ -661,14 +716,15 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical } if (vk_format_get_plane_count(format) > 1 || desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - uint32_t tiling = VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT | - VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | - VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT | - VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT; + uint64_t tiling = VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT_KHR | + VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT_KHR | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR | + VK_FORMAT_FEATURE_2_COSITED_CHROMA_SAMPLES_BIT_KHR | + VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT_KHR; /* The subsampled formats have no support for linear filters. 
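 * Editor's note: applications can observe this with a format query, e.g.
 * for a subsampled 4:2:2 format (illustrative only):
 *
 *    VkFormatProperties2 p2 = { .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2 };
 *    vkGetPhysicalDeviceFormatProperties2(pdev, VK_FORMAT_G8B8G8R8_422_UNORM, &p2);
 *    // ..._SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT will be absent,
 *    // so the matching VkSamplerYcbcrConversion must use VK_FILTER_NEAREST
 *    // for chromaFilter.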
*/ if (desc->layout != UTIL_FORMAT_LAYOUT_SUBSAMPLED) { - tiling |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT; + tiling |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT_KHR; } /* Fails for unknown reasons with linear tiling & subsampled formats. */ @@ -680,35 +736,41 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical } if (radv_is_storage_image_format_supported(physical_device, format)) { - tiled |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; - linear |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; + tiled |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT_KHR | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT_KHR | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT_KHR; + linear |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT_KHR | + VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT_KHR | + VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT_KHR; } if (radv_is_buffer_format_supported(format, &scaled)) { if (format != VK_FORMAT_R64_UINT && format != VK_FORMAT_R64_SINT) { - buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; + buffer |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT_KHR; if (!scaled) - buffer |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; + buffer |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT_KHR; } - buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; + buffer |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT_KHR; } if (vk_format_is_depth_or_stencil(format)) { if (radv_is_zs_format_supported(format)) { - tiled |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; - tiled |= VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; - tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + tiled |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT_KHR | VK_FORMAT_FEATURE_2_BLIT_DST_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT_KHR | VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT_KHR; if (radv_is_filter_minmax_format_supported(format)) - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT_KHR; - if (vk_format_has_depth(format)) - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + if (vk_format_has_depth(format)) { + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT_KHR | + VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT_KHR; + } /* Don't support blitting surfaces with depth/stencil. 
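 * (Only BLIT_DST is dropped below; a combined depth/stencil image remains
 * a valid blit source, it just cannot be a blit destination.)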
*/ if (vk_format_has_depth(format) && vk_format_has_stencil(format)) - tiled &= ~VK_FORMAT_FEATURE_BLIT_DST_BIT; + tiled &= ~VK_FORMAT_FEATURE_2_BLIT_DST_BIT_KHR; /* Don't support linear depth surfaces */ linear = 0; @@ -716,33 +778,33 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical } else { bool linear_sampling; if (radv_is_sampler_format_supported(format, &linear_sampling)) { - linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_BLIT_SRC_BIT; - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_BLIT_SRC_BIT; + linear |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR | VK_FORMAT_FEATURE_2_BLIT_SRC_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR | VK_FORMAT_FEATURE_2_BLIT_SRC_BIT_KHR; if (radv_is_filter_minmax_format_supported(format)) - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT_KHR; if (linear_sampling) { - linear |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; - tiled |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + linear |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT_KHR; } /* Don't support blitting for R32G32B32 formats. */ if (format == VK_FORMAT_R32G32B32_SFLOAT || format == VK_FORMAT_R32G32B32_UINT || format == VK_FORMAT_R32G32B32_SINT) { - linear &= ~VK_FORMAT_FEATURE_BLIT_SRC_BIT; + linear &= ~VK_FORMAT_FEATURE_2_BLIT_SRC_BIT_KHR; } } if (radv_is_colorbuffer_format_supported(physical_device, format, &blendable)) { - linear |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; - tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + linear |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT_KHR | VK_FORMAT_FEATURE_2_BLIT_DST_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT_KHR | VK_FORMAT_FEATURE_2_BLIT_DST_BIT_KHR; if (blendable) { - linear |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; - tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + linear |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT_KHR; } } if (tiled && !scaled) { - tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + tiled |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT_KHR | VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT_KHR; } /* Tiled formatting does not support NPOT pixel sizes */ @@ -751,13 +813,13 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical } if (linear && !scaled) { - linear |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; + linear |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT_KHR | VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT_KHR; } if (radv_is_atomic_format_supported(format)) { - buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; - linear |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; - tiled |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; + buffer |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT_KHR; + linear |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT_KHR; } switch (format) { @@ -768,7 +830,7 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical case VK_FORMAT_A2R10G10B10_SINT_PACK32: case VK_FORMAT_A2B10G10R10_SINT_PACK32: buffer &= - ~(VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT | VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT); + 
~(VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT_KHR | VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT_KHR); linear = 0; tiled = 0; break; @@ -777,11 +839,16 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical } switch (format) { + case VK_FORMAT_R32G32_SFLOAT: case VK_FORMAT_R32G32B32_SFLOAT: case VK_FORMAT_R32G32B32A32_SFLOAT: + case VK_FORMAT_R16G16_SFLOAT: case VK_FORMAT_R16G16B16_SFLOAT: case VK_FORMAT_R16G16B16A16_SFLOAT: - buffer |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR; + case VK_FORMAT_R16G16_SNORM: + case VK_FORMAT_R16G16B16A16_SNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + buffer |= VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR; break; default: break; @@ -792,13 +859,13 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical /* From the Vulkan spec 1.2.163: * - * "VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR must be supported for the + * "VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR must be supported for the * following formats if the attachmentFragmentShadingRate feature is supported:" * * - VK_FORMAT_R8_UINT */ if (format == VK_FORMAT_R8_UINT) { - tiled |= VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR; + tiled |= VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR; } out_properties->linearTilingFeatures = linear; @@ -1111,25 +1178,16 @@ radv_format_pack_clear_color(VkFormat format, uint32_t clear_vals[2], VkClearCol return true; } -void -radv_GetPhysicalDeviceFormatProperties(VkPhysicalDevice physicalDevice, VkFormat format, - VkFormatProperties *pFormatProperties) -{ - RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice); - - radv_physical_device_get_format_properties(physical_device, format, pFormatProperties); -} - static const struct ac_modifier_options radv_modifier_options = { .dcc = true, .dcc_retile = true, }; -static VkFormatFeatureFlags +static VkFormatFeatureFlags2KHR radv_get_modifier_flags(struct radv_physical_device *dev, VkFormat format, uint64_t modifier, - const VkFormatProperties *props) + const VkFormatProperties3KHR *props) { - VkFormatFeatureFlags features; + VkFormatFeatureFlags2KHR features; if (vk_format_is_compressed(format) || vk_format_is_depth_or_stencil(format)) return 0; @@ -1140,7 +1198,11 @@ radv_get_modifier_flags(struct radv_physical_device *dev, VkFormat format, uint6 features = props->optimalTilingFeatures; if (ac_modifier_has_dcc(modifier)) { - features &= ~VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; + /* Only disable support for STORAGE_IMAGE on modifiers that + * do not support DCC image stores. 
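+ * In pseudo-form, STORAGE_IMAGE survives only when
+ *   ac_modifier_supports_dcc_image_stores(modifier) &&
+ *   !radv_is_atomic_format_supported(format),
+ * i.e. image atomics are assumed not to work on DCC-compressed surfaces.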
+ */ + if (!ac_modifier_supports_dcc_image_stores(modifier) || radv_is_atomic_format_supported(format)) + features &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT_KHR; if (dev->instance->debug_flags & (RADV_DEBUG_NO_DCC | RADV_DEBUG_NO_DISPLAY_DCC)) return 0; @@ -1149,12 +1211,17 @@ radv_get_modifier_flags(struct radv_physical_device *dev, VkFormat format, uint6 return features; } +static VkFormatFeatureFlags +features2_to_features(VkFormatFeatureFlags2KHR features2) +{ + return features2 & VK_ALL_FORMAT_FEATURE_FLAG_BITS; +} + static void radv_list_drm_format_modifiers(struct radv_physical_device *dev, VkFormat format, - VkFormatProperties2 *pFormatProperties) + const VkFormatProperties3KHR *format_props, + VkDrmFormatModifierPropertiesListEXT *mod_list) { - VkDrmFormatModifierPropertiesListEXT *mod_list = - vk_find_struct(pFormatProperties, DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT); unsigned mod_count; if (!mod_list) @@ -1185,8 +1252,69 @@ radv_list_drm_format_modifiers(struct radv_physical_device *dev, VkFormat format mod_list->drmFormatModifierCount = 0; for (unsigned i = 0; i < mod_count; ++i) { - VkFormatFeatureFlags features = - radv_get_modifier_flags(dev, format, mods[i], &pFormatProperties->formatProperties); + VkFormatFeatureFlags2KHR features = + radv_get_modifier_flags(dev, format, mods[i], format_props); + unsigned planes = vk_format_get_plane_count(format); + if (planes == 1) { + if (ac_modifier_has_dcc_retile(mods[i])) + planes = 3; + else if (ac_modifier_has_dcc(mods[i])) + planes = 2; + } + + if (!features) + continue; + + mod_list->pDrmFormatModifierProperties[mod_list->drmFormatModifierCount].drmFormatModifier = + mods[i]; + mod_list->pDrmFormatModifierProperties[mod_list->drmFormatModifierCount] + .drmFormatModifierPlaneCount = planes; + mod_list->pDrmFormatModifierProperties[mod_list->drmFormatModifierCount] + .drmFormatModifierTilingFeatures = features2_to_features(features); + + ++mod_list->drmFormatModifierCount; + } + + free(mods); +} + +static void +radv_list_drm_format_modifiers_2(struct radv_physical_device *dev, VkFormat format, + const VkFormatProperties3KHR *format_props, + VkDrmFormatModifierPropertiesList2EXT *mod_list) +{ + unsigned mod_count; + + if (!mod_list) + return; + + if (vk_format_is_compressed(format) || vk_format_is_depth_or_stencil(format)) { + mod_list->drmFormatModifierCount = 0; + return; + } + + ac_get_supported_modifiers(&dev->rad_info, &radv_modifier_options, + vk_format_to_pipe_format(format), &mod_count, NULL); + if (!mod_list->pDrmFormatModifierProperties) { + mod_list->drmFormatModifierCount = mod_count; + return; + } + + mod_count = MIN2(mod_count, mod_list->drmFormatModifierCount); + + uint64_t *mods = malloc(mod_count * sizeof(uint64_t)); + if (!mods) { + /* We can't return an error here ... 
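+ * vkGetPhysicalDeviceFormatProperties2 returns void, so the only way to
+ * report the failed allocation is an empty modifier list.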
+ mod_list->drmFormatModifierCount = 0;
+ return;
+ }
+ ac_get_supported_modifiers(&dev->rad_info, &radv_modifier_options,
+ vk_format_to_pipe_format(format), &mod_count, mods);
+
+ mod_list->drmFormatModifierCount = 0;
+ for (unsigned i = 0; i < mod_count; ++i) {
+ VkFormatFeatureFlags2KHR features =
+ radv_get_modifier_flags(dev, format, mods[i], format_props);
 unsigned planes = vk_format_get_plane_count(format);
 if (planes == 1) {
 if (ac_modifier_has_dcc_retile(mods[i]))
@@ -1216,6 +1344,8 @@ radv_check_modifier_support(struct radv_physical_device *dev,
 const VkPhysicalDeviceImageFormatInfo2 *info,
 VkImageFormatProperties *props, VkFormat format, uint64_t modifier)
 {
+ uint32_t max_width, max_height;
+
 if (info->type != VK_IMAGE_TYPE_2D)
 return VK_ERROR_FORMAT_NOT_SUPPORTED;
@@ -1276,6 +1406,11 @@ radv_check_modifier_support(struct radv_physical_device *dev,
 props->maxMipLevels = 1;
 props->maxArrayLayers = 1;
 }
+
+ ac_modifier_max_extent(&dev->rad_info, modifier, &max_width, &max_height);
+ props->maxExtent.width = MIN2(props->maxExtent.width, max_width);
+ props->maxExtent.height = MIN2(props->maxExtent.height, max_height);
+
 /* We don't support MSAA for modifiers */
 props->sampleCounts &= VK_SAMPLE_COUNT_1_BIT;
 return VK_SUCCESS;
@@ -1286,11 +1421,31 @@ radv_GetPhysicalDeviceFormatProperties2(VkPhysicalDevice physicalDevice, VkForma
 VkFormatProperties2 *pFormatProperties)
 {
 RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
+ VkFormatProperties3KHR format_props;
- radv_physical_device_get_format_properties(physical_device, format,
- &pFormatProperties->formatProperties);
+ radv_physical_device_get_format_properties(physical_device, format, &format_props);
- radv_list_drm_format_modifiers(physical_device, format, pFormatProperties);
+ pFormatProperties->formatProperties.linearTilingFeatures =
+ features2_to_features(format_props.linearTilingFeatures);
+ pFormatProperties->formatProperties.optimalTilingFeatures =
+ features2_to_features(format_props.optimalTilingFeatures);
+ pFormatProperties->formatProperties.bufferFeatures =
+ features2_to_features(format_props.bufferFeatures);
+
+ VkFormatProperties3KHR *format_props_extended =
+ vk_find_struct(pFormatProperties, FORMAT_PROPERTIES_3_KHR);
+ if (format_props_extended) {
+ format_props_extended->linearTilingFeatures = format_props.linearTilingFeatures;
+ format_props_extended->optimalTilingFeatures = format_props.optimalTilingFeatures;
+ format_props_extended->bufferFeatures = format_props.bufferFeatures;
+ }
+
+ radv_list_drm_format_modifiers(
+ physical_device, format, &format_props,
+ vk_find_struct(pFormatProperties, DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT));
+ radv_list_drm_format_modifiers_2(
+ physical_device, format, &format_props,
+ vk_find_struct(pFormatProperties, DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT));
 }
 static VkResult
@@ -1299,8 +1454,8 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device,
 VkImageFormatProperties *pImageFormatProperties)
 {
- VkFormatProperties format_props;
- VkFormatFeatureFlags format_feature_flags;
+ VkFormatProperties3KHR format_props;
+ VkFormatFeatureFlags2KHR format_feature_flags;
 VkExtent3D maxExtent;
 uint32_t maxMipLevels;
 uint32_t maxArraySize;
@@ -1369,8 +1524,8 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device,
 }
 if (tiling == VK_IMAGE_TILING_OPTIMAL && info->type == VK_IMAGE_TYPE_2D &&
- (format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
+ (format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT_KHR |
+ VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT_KHR)) &&
 !(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
 !(info->usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR)) {
 sampleCounts |= VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT | VK_SAMPLE_COUNT_8_BIT;
@@ -1394,49 +1549,49 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device,
 vk_format_get_blocksizebits(format) == 128 && vk_format_is_compressed(format) &&
 (info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) &&
 ((info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT) ||
- (info->usage & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT))) {
+ (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT))) {
 goto unsupported;
 }
 if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT_KHR)) {
 goto unsupported;
 }
 }
 if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT_KHR)) {
 goto unsupported;
 }
 }
 if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT_KHR)) {
 goto unsupported;
 }
 }
 if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT_KHR)) {
 goto unsupported;
 }
 }
 if (info->usage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_SRC_BIT)) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT_KHR)) {
 goto unsupported;
 }
 }
 if (info->usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_TRANSFER_DST_BIT)) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT_KHR)) {
 goto unsupported;
 }
 }
 if (info->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
- if (!(format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT))) {
+ if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT_KHR |
+ VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT_KHR))) {
 goto unsupported;
 }
 }
@@ -1489,27 +1644,6 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device,
 return result;
 }
-VkResult
-radv_GetPhysicalDeviceImageFormatProperties(VkPhysicalDevice physicalDevice, VkFormat format,
- VkImageType type, VkImageTiling tiling,
- VkImageUsageFlags usage, VkImageCreateFlags createFlags,
- VkImageFormatProperties *pImageFormatProperties)
-{
- RADV_FROM_HANDLE(radv_physical_device, physical_device, physicalDevice);
-
- const VkPhysicalDeviceImageFormatInfo2 info = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
- .pNext = NULL,
- .format = format,
- .type = type,
- .tiling = tiling,
- .usage = usage,
- .flags = createFlags,
- };
-
- return radv_get_image_format_properties(physical_device, &info, format, pImageFormatProperties);
-}
-
 static void
 get_external_image_format_properties(struct radv_physical_device *physical_device,
 const VkPhysicalDeviceImageFormatInfo2 *pImageFormatInfo,
@@ -1668,7 +1802,7 @@ radv_GetPhysicalDeviceImageFormatProperties2(VkPhysicalDevice
physicalDevice,
 * vkGetPhysicalDeviceImageFormatProperties2 returns
 * VK_ERROR_FORMAT_NOT_SUPPORTED.
 */
- result = vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ result = vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
 "unsupported VkExternalMemoryTypeFlagBitsKHR 0x%x", external_info->handleType);
 goto fail;
@@ -1761,41 +1895,6 @@ radv_GetPhysicalDeviceSparseImageFormatProperties2(
 };
 }
-void
-radv_GetPhysicalDeviceSparseImageFormatProperties(VkPhysicalDevice physicalDevice, VkFormat format,
- VkImageType type, uint32_t samples,
- VkImageUsageFlags usage, VkImageTiling tiling,
- uint32_t *pNumProperties,
- VkSparseImageFormatProperties *pProperties)
-{
- const VkPhysicalDeviceSparseImageFormatInfo2 info = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SPARSE_IMAGE_FORMAT_INFO_2,
- .format = format,
- .type = type,
- .samples = samples,
- .usage = usage,
- .tiling = tiling};
-
- if (!pProperties) {
- radv_GetPhysicalDeviceSparseImageFormatProperties2(physicalDevice, &info, pNumProperties,
- NULL);
- return;
- }
-
- VkSparseImageFormatProperties2 props[4];
- uint32_t prop_cnt = MIN2(ARRAY_SIZE(props), *pNumProperties);
-
- memset(props, 0, sizeof(props));
- for (unsigned i = 0; i < ARRAY_SIZE(props); ++i)
- props[i].sType = VK_STRUCTURE_TYPE_SPARSE_IMAGE_FORMAT_PROPERTIES_2;
-
- radv_GetPhysicalDeviceSparseImageFormatProperties2(physicalDevice, &info, &prop_cnt, props);
-
- for (unsigned i = 0; i < prop_cnt; ++i)
- pProperties[i] = props[i].properties;
- *pNumProperties = prop_cnt;
-}
-
 void
 radv_GetImageSparseMemoryRequirements2(VkDevice _device,
 const VkImageSparseMemoryRequirementsInfo2 *pInfo,
@@ -1847,31 +1946,30 @@ radv_GetImageSparseMemoryRequirements2(VkDevice _device,
 }
 void
-radv_GetImageSparseMemoryRequirements(VkDevice device, VkImage image,
- uint32_t *pSparseMemoryRequirementCount,
- VkSparseImageMemoryRequirements *pSparseMemoryRequirements)
+radv_GetDeviceImageSparseMemoryRequirementsKHR(VkDevice device,
+ const VkDeviceImageMemoryRequirementsKHR* pInfo,
+ uint32_t *pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements)
 {
- const VkImageSparseMemoryRequirementsInfo2 info = {
+ UNUSED VkResult result;
+ VkImage image;
+
+ /* Determining the image size/alignment requires creating a surface, which is complicated without
+ * creating an image.
+ * TODO: Avoid creating an image.
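+ *
+ * A sketch of the resulting flow, using the calls below:
+ *   radv_CreateImage(device, pInfo->pCreateInfo, NULL, &image);
+ *   radv_GetImageSparseMemoryRequirements2(device, &info2, ...);
+ *   radv_DestroyImage(device, image, NULL);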
+ */ + result = radv_CreateImage(device, pInfo->pCreateInfo, NULL, &image); + assert(result == VK_SUCCESS); + + VkImageSparseMemoryRequirementsInfo2 info2 = { .sType = VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2, - .image = image}; + .image = image, + }; - if (!pSparseMemoryRequirements) { - radv_GetImageSparseMemoryRequirements2(device, &info, pSparseMemoryRequirementCount, NULL); - return; - } + radv_GetImageSparseMemoryRequirements2(device, &info2, pSparseMemoryRequirementCount, + pSparseMemoryRequirements); - VkSparseImageMemoryRequirements2 reqs[4]; - uint32_t reqs_cnt = MIN2(ARRAY_SIZE(reqs), *pSparseMemoryRequirementCount); - - memset(reqs, 0, sizeof(reqs)); - for (unsigned i = 0; i < ARRAY_SIZE(reqs); ++i) - reqs[i].sType = VK_STRUCTURE_TYPE_SPARSE_IMAGE_MEMORY_REQUIREMENTS_2; - - radv_GetImageSparseMemoryRequirements2(device, &info, &reqs_cnt, reqs); - - for (unsigned i = 0; i < reqs_cnt; ++i) - pSparseMemoryRequirements[i] = reqs[i].memoryRequirements; - *pSparseMemoryRequirementCount = reqs_cnt; + radv_DestroyImage(device, image, NULL); } void diff --git a/mesa 3D driver/src/amd/vulkan/radv_image.c b/mesa 3D driver/src/amd/vulkan/radv_image.c index 9d00efe084..5430f94794 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_image.c +++ b/mesa 3D driver/src/amd/vulkan/radv_image.c @@ -124,8 +124,8 @@ radv_surface_has_scanout(struct radv_device *device, const struct radv_image_cre } static bool -radv_image_use_fast_clear_for_image(const struct radv_device *device, - const struct radv_image *image) +radv_image_use_fast_clear_for_image_early(const struct radv_device *device, + const struct radv_image *image) { if (device->instance->debug_flags & RADV_DEBUG_FORCE_COMPRESS) return true; @@ -139,7 +139,17 @@ radv_image_use_fast_clear_for_image(const struct radv_device *device, return false; } - return image->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT && + return !!(image->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); +} + +static bool +radv_image_use_fast_clear_for_image(const struct radv_device *device, + const struct radv_image *image) +{ + if (device->instance->debug_flags & RADV_DEBUG_FORCE_COMPRESS) + return true; + + return radv_image_use_fast_clear_for_image_early(device, image) && (image->exclusive || /* Enable DCC for concurrent images if stores are * supported because that means we can keep DCC compressed on @@ -186,9 +196,19 @@ radv_are_formats_dcc_compatible(const struct radv_physical_device *pdev, const v } static bool -radv_formats_is_atomic_allowed(const void *pNext, VkFormat format, VkImageCreateFlags flags) +radv_format_is_atomic_allowed(struct radv_device *device, VkFormat format) { - if (radv_is_atomic_format_supported(format)) + if (format == VK_FORMAT_R32_SFLOAT && !device->image_float32_atomics) + return false; + + return radv_is_atomic_format_supported(format); +} + +static bool +radv_formats_is_atomic_allowed(struct radv_device *device, const void *pNext, VkFormat format, + VkImageCreateFlags flags) +{ + if (radv_format_is_atomic_allowed(device, format)) return true; if (flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) { @@ -199,7 +219,7 @@ radv_formats_is_atomic_allowed(const void *pNext, VkFormat format, VkImageCreate /* We have to ignore the existence of the list if viewFormatCount = 0 */ if (format_list && format_list->viewFormatCount) { for (unsigned i = 0; i < format_list->viewFormatCount; ++i) { - if (radv_is_atomic_format_supported(format_list->pViewFormats[i])) + if (radv_format_is_atomic_allowed(device, format_list->pViewFormats[i])) return true; } 
} @@ -209,9 +229,9 @@ radv_formats_is_atomic_allowed(const void *pNext, VkFormat format, VkImageCreate } static bool -radv_use_dcc_for_image(struct radv_device *device, struct radv_image *image, - const VkImageCreateInfo *pCreateInfo, VkFormat format, - bool *sign_reinterpret) +radv_use_dcc_for_image_early(struct radv_device *device, struct radv_image *image, + const VkImageCreateInfo *pCreateInfo, VkFormat format, + bool *sign_reinterpret) { /* DCC (Delta Color Compression) is only available for GFX8+. */ if (device->physical_device->rad_info.chip_class < GFX8) @@ -231,8 +251,8 @@ radv_use_dcc_for_image(struct radv_device *device, struct radv_image *image, * decompressing a lot anyway we might as well not have DCC. */ if ((pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT) && - (!radv_image_use_dcc_image_stores(device, image) || - radv_formats_is_atomic_allowed(pCreateInfo->pNext, format, pCreateInfo->flags))) + (device->physical_device->rad_info.chip_class < GFX10 || + radv_formats_is_atomic_allowed(device, pCreateInfo->pNext, format, pCreateInfo->flags))) return false; /* Do not enable DCC for fragment shading rate attachments. */ @@ -245,7 +265,7 @@ radv_use_dcc_for_image(struct radv_device *device, struct radv_image *image, if (vk_format_is_subsampled(format) || vk_format_get_plane_count(format) > 1) return false; - if (!radv_image_use_fast_clear_for_image(device, image) && + if (!radv_image_use_fast_clear_for_image_early(device, image) && image->tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) return false; @@ -268,6 +288,26 @@ radv_use_dcc_for_image(struct radv_device *device, struct radv_image *image, pCreateInfo->flags, sign_reinterpret); } +static bool +radv_use_dcc_for_image_late(struct radv_device *device, struct radv_image *image) +{ + if (!radv_image_has_dcc(image)) + return false; + + if (image->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) + return true; + + if (!radv_image_use_fast_clear_for_image(device, image)) + return false; + + /* TODO: Fix storage images with DCC without DCC image stores. + * Disabling it for now. */ + if ((image->usage & VK_IMAGE_USAGE_STORAGE_BIT) && !radv_image_use_dcc_image_stores(device, image)) + return false; + + return true; +} + /* * Whether to enable image stores with DCC compression for this image. 
If * this function returns false the image subresource should be decompressed @@ -281,7 +321,8 @@ radv_use_dcc_for_image(struct radv_device *device, struct radv_image *image, bool radv_image_use_dcc_image_stores(const struct radv_device *device, const struct radv_image *image) { - return device->physical_device->rad_info.chip_class >= GFX10; + return ac_surface_supports_dcc_image_stores(device->physical_device->rad_info.chip_class, + &image->planes[0].surface); } /* @@ -542,8 +583,8 @@ radv_get_surface_flags(struct radv_device *device, struct radv_image *image, uns vk_format_get_blocksizebits(image_format) == 128 && vk_format_is_compressed(image_format)) flags |= RADEON_SURF_NO_RENDER_TARGET; - if (!radv_use_dcc_for_image(device, image, pCreateInfo, image_format, - &image->dcc_sign_reinterpret)) + if (!radv_use_dcc_for_image_early(device, image, pCreateInfo, image_format, + &image->dcc_sign_reinterpret)) flags |= RADEON_SURF_DISABLE_DCC; if (!radv_use_fmask_for_image(device, image)) @@ -1288,7 +1329,8 @@ radv_image_alloc_values(const struct radv_device *device, struct radv_image *ima image->size += 8 * image->info.levels; } - if (radv_image_has_dcc(image) || radv_image_has_cmask(image) || radv_image_has_htile(image)) { + if ((radv_image_has_dcc(image) && !image->support_comp_to_single) || + radv_image_has_cmask(image) || radv_image_has_htile(image)) { image->clear_value_offset = image->size; image->size += 8 * image->info.levels; } @@ -1423,10 +1465,6 @@ radv_image_use_comp_to_single(const struct radv_device *device, const struct rad if (!radv_image_has_dcc(image)) return false; - /* TODO: DCC fast clears with MSAA aren't fully supported. */ - if (image->info.samples > 1) - return false; - /* It seems 8bpp and 16bpp require RB+ to work. */ unsigned bytes_per_pixel = vk_format_get_blocksize(image->vk_format); if (bytes_per_pixel <= 2 && !device->physical_device->rad_info.rbplus_allowed) @@ -1500,6 +1538,11 @@ radv_image_create_layout(struct radv_device *device, struct radv_image_create_in device->ws->surface_init(device->ws, &info, &image->planes[plane].surface); + if (plane == 0) { + if (!radv_use_dcc_for_image_late(device, image)) + ac_surface_zero_dcc_fields(&image->planes[0].surface); + } + if (create_info.bo_metadata && !mod_info && !ac_surface_set_umd_metadata(&device->physical_device->rad_info, &image->planes[plane].surface, image_info.storage_samples, @@ -1669,7 +1712,7 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ image = vk_zalloc2(&device->vk.alloc, alloc, image_struct_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!image) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &image->base, VK_OBJECT_TYPE_IMAGE); @@ -1746,7 +1789,7 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ RADEON_FLAG_VIRTUAL, RADV_BO_PRIORITY_VIRTUAL, 0, &image->bo); if (result != VK_SUCCESS) { radv_destroy_image(device, alloc, image); - return vk_error(device->instance, result); + return vk_error(device, result); } } @@ -1793,7 +1836,7 @@ radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_devic vk_format_get_plane_width(image->vk_format, plane_id, iview->extent.width), vk_format_get_plane_height(image->vk_format, plane_id, iview->extent.height), iview->extent.depth, descriptor->plane_descriptors[descriptor_plane_id], - descriptor_plane_id ? 
NULL : descriptor->fmask_descriptor); + descriptor_plane_id || is_storage_image ? NULL : descriptor->fmask_descriptor); const struct legacy_surf_level *base_level_info = NULL; if (device->physical_device->rad_info.chip_class <= GFX9) { @@ -1887,6 +1930,8 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; uint32_t plane_count = 1; + vk_object_base_init(&device->vk, &iview->base, VK_OBJECT_TYPE_IMAGE_VIEW); + switch (image->type) { case VK_IMAGE_TYPE_1D: case VK_IMAGE_TYPE_2D: @@ -2022,6 +2067,12 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, } } +void +radv_image_view_finish(struct radv_image_view *iview) +{ + vk_object_base_finish(&iview->base); +} + bool radv_layout_is_htile_compressed(const struct radv_device *device, const struct radv_image *image, VkImageLayout layout, bool in_render_loop, unsigned queue_mask) @@ -2032,7 +2083,8 @@ radv_layout_is_htile_compressed(const struct radv_device *device, const struct r case VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR: return radv_image_has_htile(image); case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: - return radv_image_has_htile(image) && queue_mask == (1u << RADV_QUEUE_GENERAL); + return radv_image_is_tc_compat_htile(image) || + (radv_image_has_htile(image) && queue_mask == (1u << RADV_QUEUE_GENERAL)); case VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR: case VK_IMAGE_LAYOUT_GENERAL: /* It should be safe to enable TC-compat HTILE with @@ -2047,9 +2099,7 @@ radv_layout_is_htile_compressed(const struct radv_device *device, const struct r */ if (radv_image_is_tc_compat_htile(image) && queue_mask & (1u << RADV_QUEUE_GENERAL) && !in_render_loop && !device->instance->disable_tc_compat_htile_in_general) { - /* GFX10+ supports compressed writes to HTILE. */ - return device->physical_device->rad_info.chip_class >= GFX10 || - !(image->usage & VK_IMAGE_USAGE_STORAGE_BIT); + return true; } else { return false; } @@ -2270,9 +2320,7 @@ radv_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, view = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*view), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (view == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - vk_object_base_init(&device->vk, &view->base, VK_OBJECT_TYPE_IMAGE_VIEW); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); radv_image_view_init(view, device, pCreateInfo, NULL); @@ -2290,7 +2338,7 @@ radv_DestroyImageView(VkDevice _device, VkImageView _iview, const VkAllocationCa if (!iview) return; - vk_object_base_finish(&iview->base); + radv_image_view_finish(iview); vk_free2(&device->vk.alloc, pAllocator, iview); } @@ -2300,6 +2348,8 @@ radv_buffer_view_init(struct radv_buffer_view *view, struct radv_device *device, { RADV_FROM_HANDLE(radv_buffer, buffer, pCreateInfo->buffer); + vk_object_base_init(&device->vk, &view->base, VK_OBJECT_TYPE_BUFFER_VIEW); + view->bo = buffer->bo; view->range = pCreateInfo->range == VK_WHOLE_SIZE ? 
buffer->size - pCreateInfo->offset : pCreateInfo->range;
@@ -2309,6 +2359,12 @@ radv_buffer_view_init(struct radv_buffer_view *view, struct radv_device *device,
 view->state);
 }
+void
+radv_buffer_view_finish(struct radv_buffer_view *view)
+{
+ vk_object_base_finish(&view->base);
+}
+
 VkResult
 radv_CreateBufferView(VkDevice _device, const VkBufferViewCreateInfo *pCreateInfo,
 const VkAllocationCallbacks *pAllocator, VkBufferView *pView)
@@ -2319,9 +2375,7 @@ radv_CreateBufferView(VkDevice _device, const VkBufferViewCreateInf
 view = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*view), 8,
 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 if (!view)
- return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
-
- vk_object_base_init(&device->vk, &view->base, VK_OBJECT_TYPE_BUFFER_VIEW);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 radv_buffer_view_init(view, device, pCreateInfo);
@@ -2340,6 +2394,6 @@ radv_DestroyBufferView(VkDevice _device, VkBufferView bufferView,
 if (!view)
 return;
- vk_object_base_finish(&view->base);
+ radv_buffer_view_finish(view);
 vk_free2(&device->vk.alloc, pAllocator, view);
 }
diff --git a/mesa 3D driver/src/amd/vulkan/radv_llvm_helper.cpp b/mesa 3D driver/src/amd/vulkan/radv_llvm_helper.cpp
index 1e0708170d..0341bfcc41 100644
--- a/mesa 3D driver/src/amd/vulkan/radv_llvm_helper.cpp
+++ b/mesa 3D driver/src/amd/vulkan/radv_llvm_helper.cpp
@@ -20,16 +20,15 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
+#include "radv_llvm_helper.h"
 #include "ac_llvm_util.h"
-#include "radv_shader_helper.h"
 #include <list>
 class radv_llvm_per_thread_info {
 public:
 radv_llvm_per_thread_info(enum radeon_family arg_family,
 enum ac_target_machine_options arg_tm_options, unsigned arg_wave_size)
- : family(arg_family), tm_options(arg_tm_options), wave_size(arg_wave_size), passes(NULL),
- passes_wave32(NULL)
+ : family(arg_family), tm_options(arg_tm_options), wave_size(arg_wave_size), passes(NULL)
 {
 }
@@ -52,8 +51,7 @@ class radv_llvm_per_thread_info {
 bool compile_to_memory_buffer(LLVMModuleRef module, char **pelf_buffer, size_t *pelf_size)
 {
- struct ac_compiler_passes *p = wave_size == 32 ? passes_wave32 : passes;
- return ac_compile_module_to_elf(p, module, pelf_buffer, pelf_size);
+ return ac_compile_module_to_elf(passes, module, pelf_buffer, pelf_size);
 }
 bool is_same(enum radeon_family arg_family, enum ac_target_machine_options arg_tm_options,
@@ -70,7 +68,6 @@ class radv_llvm_per_thread_info {
 enum ac_target_machine_options tm_options;
 unsigned wave_size;
 struct ac_compiler_passes *passes;
- struct ac_compiler_passes *passes_wave32;
 };
 /* We have to store a linked list per thread due to the possibility of multiple GPUs being required. */
diff --git a/mesa 3D driver/src/amd/vulkan/radv_llvm_helper.h b/mesa 3D driver/src/amd/vulkan/radv_llvm_helper.h
new file mode 100644
index 0000000000..e57d72dd9d
--- /dev/null
+++ b/mesa 3D driver/src/amd/vulkan/radv_llvm_helper.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2018 Red Hat.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifndef RADV_SHADER_HELPER_H +#define RADV_SHADER_HELPER_H + +#include "ac_llvm_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +bool radv_init_llvm_compiler(struct ac_llvm_compiler *info, enum radeon_family family, + enum ac_target_machine_options tm_options, unsigned wave_size); + +bool radv_compile_to_elf(struct ac_llvm_compiler *info, LLVMModuleRef module, char **pelf_buffer, + size_t *pelf_size); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta.c b/mesa 3D driver/src/amd/vulkan/radv_meta.c index 5475e37341..8e2a9180d0 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta.c @@ -691,3 +691,19 @@ radv_meta_load_descriptor(nir_builder *b, unsigned desc_set, unsigned binding) .binding = binding); return nir_channels(b, rsrc, 0x3); } + +nir_ssa_def * +get_global_ids(nir_builder *b, unsigned num_components) +{ + unsigned mask = BITFIELD_MASK(num_components); + + nir_ssa_def *local_ids = nir_channels(b, nir_load_local_invocation_id(b), mask); + nir_ssa_def *block_ids = nir_channels(b, nir_load_workgroup_id(b, 32), mask); + nir_ssa_def *block_size = nir_channels( + b, + nir_imm_ivec4(b, b->shader->info.workgroup_size[0], b->shader->info.workgroup_size[1], + b->shader->info.workgroup_size[2], 0), + mask); + + return nir_iadd(b, nir_imul(b, block_ids, block_size), local_ids); +} diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta.h b/mesa 3D driver/src/amd/vulkan/radv_meta.h index 54ce4fb160..cfc5a5faff 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta.h +++ b/mesa 3D driver/src/amd/vulkan/radv_meta.h @@ -205,9 +205,9 @@ void radv_meta_image_to_image_cs(struct radv_cmd_buffer *cmd_buffer, void radv_meta_clear_image_cs(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_blit2d_surf *dst, const VkClearColorValue *clear_color); -void radv_decompress_depth_stencil(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - const VkImageSubresourceRange *subresourceRange, - struct radv_sample_locations_state *sample_locs); +void radv_expand_depth_stencil(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, + const VkImageSubresourceRange *subresourceRange, + struct radv_sample_locations_state *sample_locs); void radv_resummarize_depth_stencil(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *subresourceRange, struct 
radv_sample_locations_state *sample_locs);
@@ -220,7 +220,8 @@ void radv_retile_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *imag
 void radv_expand_fmask_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
 const VkImageSubresourceRange *subresourceRange);
 void radv_copy_vrs_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *vrs_image,
- VkExtent2D *extent, struct radv_image *dst_image);
+ VkExtent2D *extent, struct radv_image *dst_image,
+ struct radv_buffer *htile_buffer, bool read_htile_value);
 void radv_meta_resolve_compute_image(struct radv_cmd_buffer *cmd_buffer,
 struct radv_image *src_image, VkFormat src_format,
@@ -248,6 +249,9 @@ uint32_t radv_clear_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *i
 uint32_t radv_clear_htile(struct radv_cmd_buffer *cmd_buffer, const struct radv_image *image,
 const VkImageSubresourceRange *range, uint32_t value);
+void radv_update_buffer_cp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, const void *data,
+ uint64_t size);
+
 /**
 * Return whether the bound pipeline is the FMASK decompress pass.
 */
@@ -287,6 +291,8 @@ void radv_meta_build_resolve_shader_core(nir_builder *b, bool is_integer, int sa
 nir_ssa_def *radv_meta_load_descriptor(nir_builder *b, unsigned desc_set, unsigned binding);
+nir_ssa_def *get_global_ids(nir_builder *b, unsigned num_components);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_blit.c b/mesa 3D driver/src/amd/vulkan/radv_meta_blit.c
index 03a4912f03..0c13e15829 100644
--- a/mesa 3D driver/src/amd/vulkan/radv_meta_blit.c
+++ b/mesa 3D driver/src/amd/vulkan/radv_meta_blit.c
@@ -613,6 +613,9 @@ blit_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *src_image,
 meta_emit_blit(cmd_buffer, src_image, &src_iview, src_image_layout, src_offset_0, src_offset_1,
 dst_image, &dst_iview, dst_image_layout, dst_offset_0, dst_offset_1, dst_box,
 sampler);
+
+ radv_image_view_finish(&dst_iview);
+ radv_image_view_finish(&src_iview);
 }
 /* Restore conditional rendering. */
diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_blit2d.c b/mesa 3D driver/src/amd/vulkan/radv_meta_blit2d.c
index b6ac95be41..8ae8df2bc4 100644
--- a/mesa 3D driver/src/amd/vulkan/radv_meta_blit2d.c
+++ b/mesa 3D driver/src/amd/vulkan/radv_meta_blit2d.c
@@ -395,6 +395,13 @@ radv_meta_blit2d_normal_dst(struct radv_cmd_buffer *cmd_buffer,
 */
 radv_DestroyFramebuffer(radv_device_to_handle(device), dst_temps.fb,
 &cmd_buffer->pool->alloc);
+
+ if (src_type == BLIT2D_SRC_TYPE_BUFFER)
+ radv_buffer_view_finish(&src_temps.bview);
+ else
+ radv_image_view_finish(&src_temps.iview);
+
+ radv_image_view_finish(&dst_temps.iview);
 }
 }
 }
@@ -1314,6 +1321,10 @@ radv_device_init_meta_blit2d_state(struct radv_device *device, bool on_demand)
 if (src == BLIT2D_SRC_TYPE_BUFFER && log2_samples > 0)
 continue;
+ /* There are no multisampled 3D images.
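+ * (The Vulkan spec restricts VK_IMAGE_TYPE_3D images to
+ * VK_SAMPLE_COUNT_1_BIT, so this combination can never be requested.)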
*/ + if (src == BLIT2D_SRC_TYPE_IMAGE_3D && log2_samples > 0) + continue; + result = meta_blit2d_create_pipe_layout(device, src, log2_samples); if (result != VK_SUCCESS) goto fail; diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_buffer.c b/mesa 3D driver/src/amd/vulkan/radv_meta_buffer.c index b7a1e239bb..b66bb57a1a 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_buffer.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_buffer.c @@ -12,13 +12,7 @@ build_buffer_fill_shader(struct radv_device *dev) b.shader->info.workgroup_size[1] = 1; b.shader->info.workgroup_size[2] = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 1); nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16)); offset = nir_channel(&b, offset, 0); @@ -42,13 +36,7 @@ build_buffer_copy_shader(struct radv_device *dev) b.shader->info.workgroup_size[1] = 1; b.shader->info.workgroup_size[2] = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 1); nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16)); offset = nir_channel(&b, offset, 0); @@ -214,12 +202,13 @@ fill_buffer_shader(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo * struct radv_device *device = cmd_buffer->device; uint64_t block_count = round_up_u64(size, 1024); struct radv_meta_saved_state saved_state; + struct radv_buffer dst_buffer; radv_meta_save( &saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); - struct radv_buffer dst_buffer = {.bo = bo, .offset = offset, .size = size}; + radv_buffer_init(&dst_buffer, cmd_buffer->device, bo, size, offset); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, device->meta_state.buffer.fill_pipeline); @@ -244,6 +233,8 @@ fill_buffer_shader(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo * radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1); + radv_buffer_finish(&dst_buffer); + radv_meta_restore(&saved_state, cmd_buffer); } @@ -255,13 +246,13 @@ copy_buffer_shader(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo * struct radv_device *device = cmd_buffer->device; uint64_t block_count = round_up_u64(size, 1024); struct radv_meta_saved_state saved_state; + struct radv_buffer src_buffer, dst_buffer; radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS); - struct radv_buffer dst_buffer = {.bo = dst_bo, .offset = dst_offset, .size = size}; - - struct radv_buffer src_buffer = {.bo = src_bo, .offset = src_offset, .size = size}; + radv_buffer_init(&src_buffer, cmd_buffer->device, src_bo, size, src_offset); + radv_buffer_init(&dst_buffer, cmd_buffer->device, dst_bo, size, dst_offset); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, 
device->meta_state.buffer.copy_pipeline); @@ -290,6 +281,9 @@ copy_buffer_shader(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo * radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1); + radv_buffer_finish(&src_buffer); + radv_buffer_finish(&dst_buffer); + radv_meta_restore(&saved_state, cmd_buffer); } @@ -408,14 +402,35 @@ radv_CmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, const VkCopyBufferInfo2KHR } } +void +radv_update_buffer_cp(struct radv_cmd_buffer *cmd_buffer, uint64_t va, const void *data, + uint64_t size) +{ + uint64_t words = size / 4; + bool mec = radv_cmd_buffer_uses_mec(cmd_buffer); + + assert(size < RADV_BUFFER_UPDATE_THRESHOLD); + + si_emit_cache_flush(cmd_buffer); + radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, words + 4); + + radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + words, 0)); + radeon_emit(cmd_buffer->cs, S_370_DST_SEL(mec ? V_370_MEM : V_370_MEM_GRBM) | + S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); + radeon_emit(cmd_buffer->cs, va); + radeon_emit(cmd_buffer->cs, va >> 32); + radeon_emit_array(cmd_buffer->cs, data, words); + + if (unlikely(cmd_buffer->device->trace_bo)) + radv_cmd_buffer_trace_emit(cmd_buffer); +} + void radv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize dataSize, const void *pData) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer); - bool mec = radv_cmd_buffer_uses_mec(cmd_buffer); - uint64_t words = dataSize / 4; uint64_t va = radv_buffer_get_va(dst_buffer->bo); va += dstOffset + dst_buffer->offset; @@ -426,21 +441,8 @@ radv_CmdUpdateBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer, VkDevice return; if (dataSize < RADV_BUFFER_UPDATE_THRESHOLD) { - si_emit_cache_flush(cmd_buffer); - radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo); - - radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, words + 4); - - radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + words, 0)); - radeon_emit(cmd_buffer->cs, S_370_DST_SEL(mec ? 
V_370_MEM : V_370_MEM_GRBM) | - S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); - radeon_emit(cmd_buffer->cs, va); - radeon_emit(cmd_buffer->cs, va >> 32); - radeon_emit_array(cmd_buffer->cs, pData, words); - - if (unlikely(cmd_buffer->device->trace_bo)) - radv_cmd_buffer_trace_emit(cmd_buffer); + radv_update_buffer_cp(cmd_buffer, va, pData, dataSize); } else { uint32_t buf_offset; radv_cmd_buffer_upload_data(cmd_buffer, dataSize, pData, &buf_offset); diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_bufimage.c b/mesa 3D driver/src/amd/vulkan/radv_meta_bufimage.c index ea3bec3323..60884d0342 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_bufimage.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_bufimage.c @@ -47,17 +47,11 @@ build_nir_itob_compute_shader(struct radv_device *dev, bool is_3d) input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, is_3d ? 3 : 2); nir_ssa_def *offset = nir_load_push_constant(&b, is_3d ? 3 : 2, 32, nir_imm_int(&b, 0), .range = 16); @@ -235,17 +229,11 @@ build_nir_btoi_compute_shader(struct radv_device *dev, bool is_3d) input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, is_3d ? 3 : 2); nir_ssa_def *offset = nir_load_push_constant(&b, is_3d ? 
3 : 2, 32, nir_imm_int(&b, 0), .range = 16); @@ -254,19 +242,17 @@ build_nir_btoi_compute_shader(struct radv_device *dev, bool is_3d) nir_ssa_def *pos_x = nir_channel(&b, global_id, 0); nir_ssa_def *pos_y = nir_channel(&b, global_id, 1); - nir_ssa_def *tmp = nir_imul(&b, pos_y, stride); - tmp = nir_iadd(&b, tmp, pos_x); + nir_ssa_def *buf_coord = nir_imul(&b, pos_y, stride); + buf_coord = nir_iadd(&b, buf_coord, pos_x); - nir_ssa_def *buf_coord = nir_vec4(&b, tmp, tmp, tmp, tmp); - - nir_ssa_def *img_coord = nir_iadd(&b, global_id, offset); + nir_ssa_def *coord = nir_iadd(&b, global_id, offset); nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa; nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3); tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; tex->op = nir_texop_txf; tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(nir_channels(&b, buf_coord, 1)); + tex->src[0].src = nir_src_for_ssa(buf_coord); tex->src[1].src_type = nir_tex_src_lod; tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0)); tex->src[2].src_type = nir_tex_src_texture_deref; @@ -279,6 +265,12 @@ build_nir_btoi_compute_shader(struct radv_device *dev, bool is_3d) nir_builder_instr_insert(&b, &tex->instr); nir_ssa_def *outval = &tex->dest.ssa; + + nir_ssa_def *img_coord = nir_vec4(&b, nir_channel(&b, coord, 0), + nir_channel(&b, coord, 1), + is_3d ? nir_channel(&b, coord, 2) : nir_ssa_undef(&b, 1, 32), + nir_ssa_undef(&b, 1, 32)); + nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, img_coord, nir_ssa_undef(&b, 1, 32), outval, nir_imm_int(&b, 0), .image_dim = dim); @@ -417,17 +409,11 @@ build_nir_btoi_r32g32b32_compute_shader(struct radv_device *dev) input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 2); nir_ssa_def *offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 16); nir_ssa_def *pitch = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 16); @@ -436,10 +422,8 @@ build_nir_btoi_r32g32b32_compute_shader(struct radv_device *dev) nir_ssa_def *pos_x = nir_channel(&b, global_id, 0); nir_ssa_def *pos_y = nir_channel(&b, global_id, 1); - nir_ssa_def *tmp = nir_imul(&b, pos_y, stride); - tmp = nir_iadd(&b, tmp, pos_x); - - nir_ssa_def *buf_coord = nir_vec4(&b, tmp, tmp, tmp, tmp); + nir_ssa_def *buf_coord = nir_imul(&b, pos_y, stride); + buf_coord = nir_iadd(&b, buf_coord, pos_x); nir_ssa_def *img_coord = nir_iadd(&b, global_id, offset); @@ -453,7 +437,7 @@ build_nir_btoi_r32g32b32_compute_shader(struct radv_device *dev) tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; tex->op = nir_texop_txf; tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(nir_channels(&b, buf_coord, 1)); + tex->src[0].src = nir_src_for_ssa(buf_coord); tex->src[1].src_type = nir_tex_src_lod; tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0)); tex->src[2].src_type = 
nir_tex_src_texture_deref; @@ -579,17 +563,11 @@ build_nir_itoi_compute_shader(struct radv_device *dev, bool is_3d, int samples) input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, is_3d ? 3 : 2); nir_ssa_def *src_offset = nir_load_push_constant(&b, is_3d ? 3 : 2, 32, nir_imm_int(&b, 0), .range = 24); @@ -626,9 +604,14 @@ build_nir_itoi_compute_shader(struct radv_device *dev, bool is_3d, int samples) nir_builder_instr_insert(&b, &tex->instr); } + nir_ssa_def *img_coord = nir_vec4(&b, nir_channel(&b, dst_coord, 0), + nir_channel(&b, dst_coord, 1), + is_3d ? nir_channel(&b, dst_coord, 2) : nir_ssa_undef(&b, 1, 32), + nir_ssa_undef(&b, 1, 32)); + for (uint32_t i = 0; i < samples; i++) { nir_ssa_def *outval = &tex_instr[i]->dest.ssa; - nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, dst_coord, + nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, img_coord, nir_imm_int(&b, i), outval, nir_imm_int(&b, 0), .image_dim = dim); } @@ -781,17 +764,11 @@ build_nir_itoi_r32g32b32_compute_shader(struct radv_device *dev) input_img->data.binding = 0; nir_variable *output_img = - nir_variable_create(b.shader, nir_var_uniform, img_type, "output_img"); + nir_variable_create(b.shader, nir_var_image, img_type, "output_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 2); nir_ssa_def *src_offset = nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 0), .range = 24); nir_ssa_def *dst_offset = nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 12), .range = 24); @@ -813,17 +790,13 @@ build_nir_itoi_r32g32b32_compute_shader(struct radv_device *dev) for (int chan = 0; chan < 3; chan++) { /* src */ nir_ssa_def *src_local_pos = nir_iadd(&b, src_global_pos, nir_imm_int(&b, chan)); - - nir_ssa_def *src_coord = - nir_vec4(&b, src_local_pos, src_local_pos, src_local_pos, src_local_pos); - nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa; nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3); tex->sampler_dim = GLSL_SAMPLER_DIM_BUF; tex->op = nir_texop_txf; tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(nir_channels(&b, src_coord, 1)); + tex->src[0].src = nir_src_for_ssa(src_local_pos); tex->src[1].src_type = nir_tex_src_lod; tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0)); tex->src[2].src_type = nir_tex_src_texture_deref; @@ -947,28 +920,20 @@ build_nir_cleari_compute_shader(struct radv_device *dev, bool is_3d, int samples 
b.shader->info.workgroup_size[1] = 8; b.shader->info.workgroup_size[2] = 1; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 0; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 2); nir_ssa_def *clear_val = nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0), .range = 20); nir_ssa_def *layer = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 16), .range = 20); - nir_ssa_def *global_z = nir_iadd(&b, nir_channel(&b, global_id, 2), layer); - nir_ssa_def *comps[4]; comps[0] = nir_channel(&b, global_id, 0); comps[1] = nir_channel(&b, global_id, 1); - comps[2] = global_z; - comps[3] = nir_imm_int(&b, 0); + comps[2] = layer; + comps[3] = nir_ssa_undef(&b, 1, 32); global_id = nir_vec(&b, comps, 4); for (uint32_t i = 0; i < samples; i++) { @@ -1113,17 +1078,11 @@ build_nir_cleari_r32g32b32_compute_shader(struct radv_device *dev) b.shader->info.workgroup_size[1] = 8; b.shader->info.workgroup_size[2] = 1; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 0; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 2); nir_ssa_def *clear_val = nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 0), .range = 16); nir_ssa_def *stride = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 12), .range = 16); @@ -1288,18 +1247,22 @@ radv_device_init_meta_bufimage_state(struct radv_device *device) static void create_iview(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_blit2d_surf *surf, - struct radv_image_view *iview) + struct radv_image_view *iview, VkFormat format, VkImageAspectFlagBits aspects) { VkImageViewType view_type = cmd_buffer->device->physical_device->rad_info.chip_class < GFX9 ? 
VK_IMAGE_VIEW_TYPE_2D : radv_meta_get_view_type(surf->image); + + if (format == VK_FORMAT_UNDEFINED) + format = surf->format; + radv_image_view_init(iview, cmd_buffer->device, &(VkImageViewCreateInfo){ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = radv_image_to_handle(surf->image), .viewType = view_type, - .format = surf->format, - .subresourceRange = {.aspectMask = surf->aspect_mask, + .format = format, + .subresourceRange = {.aspectMask = aspects, .baseMipLevel = surf->level, .levelCount = 1, .baseArrayLayer = surf->layer, @@ -1330,7 +1293,9 @@ create_buffer_from_image(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_bl VkBufferUsageFlagBits usage, VkBuffer *buffer) { struct radv_device *device = cmd_buffer->device; - struct radv_device_memory mem = {.bo = surf->image->bo}; + struct radv_device_memory mem; + + radv_device_memory_init(&mem, device, surf->image->bo); radv_CreateBuffer(radv_device_to_handle(device), &(VkBufferCreateInfo){ @@ -1349,6 +1314,8 @@ create_buffer_from_image(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_bl .memory = radv_device_memory_to_handle(&mem), .memoryOffset = surf->image->offset, }}); + + radv_device_memory_finish(&mem); } static void @@ -1440,7 +1407,7 @@ radv_meta_image_to_buffer(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_b struct radv_image_view src_view; struct radv_buffer_view dst_view; - create_iview(cmd_buffer, src, &src_view); + create_iview(cmd_buffer, src, &src_view, VK_FORMAT_UNDEFINED, src->aspect_mask); create_bview(cmd_buffer, dst->buffer, dst->offset, dst->format, &dst_view); itob_bind_descriptors(cmd_buffer, &src_view, &dst_view); @@ -1458,6 +1425,9 @@ radv_meta_image_to_buffer(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_b radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); } + + radv_image_view_finish(&src_view); + radv_buffer_view_finish(&dst_view); } static void @@ -1533,6 +1503,8 @@ radv_meta_buffer_to_image_cs_r32g32b32(struct radv_cmd_buffer *cmd_buffer, radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); } + radv_buffer_view_finish(&src_view); + radv_buffer_view_finish(&dst_view); radv_DestroyBuffer(radv_device_to_handle(device), buffer, NULL); } @@ -1586,7 +1558,7 @@ radv_meta_buffer_to_image_cs(struct radv_cmd_buffer *cmd_buffer, } create_bview(cmd_buffer, src->buffer, src->offset, src->format, &src_view); - create_iview(cmd_buffer, dst, &dst_view); + create_iview(cmd_buffer, dst, &dst_view, VK_FORMAT_UNDEFINED, dst->aspect_mask); btoi_bind_descriptors(cmd_buffer, &src_view, &dst_view); if (device->physical_device->rad_info.chip_class >= GFX9 && dst->image->type == VK_IMAGE_TYPE_3D) @@ -1607,6 +1579,9 @@ radv_meta_buffer_to_image_cs(struct radv_cmd_buffer *cmd_buffer, radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); } + + radv_image_view_finish(&dst_view); + radv_buffer_view_finish(&src_view); } static void @@ -1685,6 +1660,8 @@ radv_meta_image_to_image_cs_r32g32b32(struct radv_cmd_buffer *cmd_buffer, radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); } + radv_buffer_view_finish(&src_view); + radv_buffer_view_finish(&dst_view); radv_DestroyBuffer(radv_device_to_handle(device), src_buffer, NULL); radv_DestroyBuffer(radv_device_to_handle(device), dst_buffer, NULL); } @@ -1741,27 +1718,39 @@ radv_meta_image_to_image_cs(struct radv_cmd_buffer *cmd_buffer, struct radv_meta return; } - create_iview(cmd_buffer, src, &src_view); - create_iview(cmd_buffer, dst, &dst_view); + u_foreach_bit(i, dst->aspect_mask) { 
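/* Editorial note (annotation, not patch content): the image-to-image compute
 * copy is now issued once per aspect, because an image view bound as a shader
 * descriptor can only select a single depth or stencil aspect.
 * vk_format_depth_only()/vk_format_stencil_only() derive the single-aspect
 * view format in the loop body below; for color images the loop runs once
 * with depth_format == 0 (VK_FORMAT_UNDEFINED), so create_iview() falls back
 * to surf->format as shown in the hunk above. */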
+ unsigned aspect_mask = 1u << i; + VkFormat depth_format = 0; + if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) + depth_format = vk_format_stencil_only(dst->image->vk_format); + else if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) + depth_format = vk_format_depth_only(dst->image->vk_format); - itoi_bind_descriptors(cmd_buffer, &src_view, &dst_view); + create_iview(cmd_buffer, src, &src_view, depth_format, aspect_mask); + create_iview(cmd_buffer, dst, &dst_view, depth_format, aspect_mask); - VkPipeline pipeline = cmd_buffer->device->meta_state.itoi.pipeline[samples_log2]; - if (device->physical_device->rad_info.chip_class >= GFX9 && - (src->image->type == VK_IMAGE_TYPE_3D || dst->image->type == VK_IMAGE_TYPE_3D)) - pipeline = cmd_buffer->device->meta_state.itoi.pipeline_3d; - radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, - pipeline); + itoi_bind_descriptors(cmd_buffer, &src_view, &dst_view); - for (unsigned r = 0; r < num_rects; ++r) { - unsigned push_constants[6] = { - rects[r].src_x, rects[r].src_y, src->layer, rects[r].dst_x, rects[r].dst_y, dst->layer, - }; - radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), - device->meta_state.itoi.img_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, - 24, push_constants); + VkPipeline pipeline = cmd_buffer->device->meta_state.itoi.pipeline[samples_log2]; + if (device->physical_device->rad_info.chip_class >= GFX9 && + (src->image->type == VK_IMAGE_TYPE_3D || dst->image->type == VK_IMAGE_TYPE_3D)) + pipeline = cmd_buffer->device->meta_state.itoi.pipeline_3d; + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); - radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); + for (unsigned r = 0; r < num_rects; ++r) { + unsigned push_constants[6] = { + rects[r].src_x, rects[r].src_y, src->layer, rects[r].dst_x, rects[r].dst_y, dst->layer, + }; + radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), + device->meta_state.itoi.img_p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, + 24, push_constants); + + radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); + } + + radv_image_view_finish(&src_view); + radv_image_view_finish(&dst_view); } } @@ -1823,6 +1812,7 @@ radv_meta_clear_image_cs_r32g32b32(struct radv_cmd_buffer *cmd_buffer, radv_unaligned_dispatch(cmd_buffer, dst->image->info.width, dst->image->info.height, 1); + radv_buffer_view_finish(&dst_view); radv_DestroyBuffer(radv_device_to_handle(device), buffer, NULL); } @@ -1866,7 +1856,7 @@ radv_meta_clear_image_cs(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_bl return; } - create_iview(cmd_buffer, dst, &dst_iview); + create_iview(cmd_buffer, dst, &dst_iview, VK_FORMAT_UNDEFINED, dst->aspect_mask); cleari_bind_descriptors(cmd_buffer, &dst_iview); VkPipeline pipeline = cmd_buffer->device->meta_state.cleari.pipeline[samples_log2]; @@ -1889,4 +1879,6 @@ radv_meta_clear_image_cs(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_bl push_constants); radv_unaligned_dispatch(cmd_buffer, dst->image->info.width, dst->image->info.height, 1); + + radv_image_view_finish(&dst_iview); } diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_clear.c b/mesa 3D driver/src/amd/vulkan/radv_meta_clear.c index fdb1a9f900..8a3ff022ec 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_clear.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_clear.c @@ -29,7 +29,7 @@ #include "util/format_rgb9e5.h" #include "vk_format.h" -enum { DEPTH_CLEAR_SLOW, DEPTH_CLEAR_FAST_EXPCLEAR, 
DEPTH_CLEAR_FAST_NO_EXPCLEAR }; +enum { DEPTH_CLEAR_SLOW, DEPTH_CLEAR_FAST }; static void build_color_shaders(struct nir_shader **out_vs, struct nir_shader **out_fs, uint32_t frag_output) @@ -318,8 +318,10 @@ finish_meta_clear_dcc_comp_to_single_state(struct radv_device *device) { struct radv_meta_state *state = &device->meta_state; - radv_DestroyPipeline(radv_device_to_handle(device), state->clear_dcc_comp_to_single_pipeline, - &state->alloc); + for (uint32_t i = 0; i < 2; i++) { + radv_DestroyPipeline(radv_device_to_handle(device), + state->clear_dcc_comp_to_single_pipeline[i], &state->alloc); + } radv_DestroyPipelineLayout(radv_device_to_handle(device), state->clear_dcc_comp_to_single_p_layout, &state->alloc); radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), state->clear_dcc_comp_to_single_ds_layout, @@ -399,11 +401,7 @@ emit_color_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearAttachment *cl samples_log2 = ffs(samples) - 1; fs_key = radv_format_meta_fs_key(device, format); - - if (fs_key == -1) { - radv_finishme("color clears incomplete"); - return; - } + assert(fs_key != -1); if (device->meta_state.clear[samples_log2].render_pass[fs_key] == VK_NULL_HANDLE) { VkResult ret = @@ -426,10 +424,7 @@ emit_color_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearAttachment *cl } pipeline = device->meta_state.clear[samples_log2].color_pipelines[fs_key]; - if (!pipeline) { - radv_finishme("color clears incomplete"); - return; - } + assert(samples_log2 < ARRAY_SIZE(device->meta_state.clear)); assert(pipeline); assert(clear_att->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT); @@ -466,7 +461,7 @@ emit_color_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearAttachment *cl radv_CmdDraw(cmd_buffer_h, 3, clear_rect->layerCount, 0, clear_rect->baseArrayLayer); } - radv_cmd_buffer_set_subpass(cmd_buffer, subpass); + radv_cmd_buffer_restore_subpass(cmd_buffer, subpass); } static void @@ -636,11 +631,9 @@ create_depthstencil_pipeline(struct radv_device *device, VkImageAspectFlags aspe if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) { extra.db_depth_clear = index == DEPTH_CLEAR_SLOW ? false : true; - extra.db_depth_disable_expclear = index == DEPTH_CLEAR_FAST_NO_EXPCLEAR ? true : false; } if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) { extra.db_stencil_clear = index == DEPTH_CLEAR_SLOW ? false : true; - extra.db_stencil_disable_expclear = index == DEPTH_CLEAR_FAST_NO_EXPCLEAR ? true : false; } result = create_pipeline(device, radv_render_pass_from_handle(render_pass), samples, vs_nir, fs_nir, @@ -688,15 +681,9 @@ pick_depthstencil_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_ bool fast = depth_view_can_fast_clear(cmd_buffer, iview, aspects, layout, in_render_loop, clear_rect, clear_value); bool unrestricted = cmd_buffer->device->vk.enabled_extensions.EXT_depth_range_unrestricted; - int index = DEPTH_CLEAR_SLOW; + int index = fast ? 
DEPTH_CLEAR_FAST : DEPTH_CLEAR_SLOW; VkPipeline *pipeline; - if (fast) { - /* we don't know the previous clear values, so we always have - * the NO_EXPCLEAR path */ - index = DEPTH_CLEAR_FAST_NO_EXPCLEAR; - } - switch (aspects) { case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: pipeline = unrestricted @@ -827,7 +814,7 @@ emit_depthstencil_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearAttachm radv_CmdSetStencilReference(cmd_buffer_h, VK_STENCIL_FACE_FRONT_BIT, prev_reference); } - radv_cmd_buffer_set_subpass(cmd_buffer, subpass); + radv_cmd_buffer_restore_subpass(cmd_buffer, subpass); } static uint32_t @@ -839,12 +826,13 @@ clear_htile_mask(struct radv_cmd_buffer *cmd_buffer, const struct radv_image *im struct radv_meta_state *state = &device->meta_state; uint64_t block_count = round_up_u64(size, 1024); struct radv_meta_saved_state saved_state; + struct radv_buffer dst_buffer; radv_meta_save( &saved_state, cmd_buffer, RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); - struct radv_buffer dst_buffer = {.bo = bo, .offset = offset, .size = size}; + radv_buffer_init(&dst_buffer, device, bo, size, offset); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, state->clear_htile_mask_pipeline); @@ -872,9 +860,11 @@ clear_htile_mask(struct radv_cmd_buffer *cmd_buffer, const struct radv_image *im radv_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1); + radv_buffer_finish(&dst_buffer); + radv_meta_restore(&saved_state, cmd_buffer); - return RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | + return RADV_CMD_FLAG_CS_PARTIAL_FLUSH | radv_src_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, image); } @@ -1067,13 +1057,7 @@ build_clear_htile_mask_shader() b.shader->info.workgroup_size[1] = 1; b.shader->info.workgroup_size[2] = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 1); nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16)); offset = nir_channel(&b, offset, 0); @@ -1162,24 +1146,23 @@ init_meta_clear_htile_mask_state(struct radv_device *device) return result; } +/* Clear DCC using comp-to-single by storing the clear value at the beginning of every 256B block. + * For MSAA images, clearing the first sample should be enough as long as CMASK is also cleared. + */ static nir_shader * -build_clear_dcc_comp_to_single_shader() +build_clear_dcc_comp_to_single_shader(bool is_msaa) { - const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_FLOAT); + enum glsl_sampler_dim dim = is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D; + const struct glsl_type *img_type = glsl_image_type(dim, true, GLSL_TYPE_FLOAT); nir_builder b = - nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "meta_clear_dcc_comp_to_single"); + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "meta_clear_dcc_comp_to_single-%s", + is_msaa ? 
"multisampled" : "singlesampled"); b.shader->info.workgroup_size[0] = 8; b.shader->info.workgroup_size[1] = 8; b.shader->info.workgroup_size[2] = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - nir_ssa_def *layer_id = nir_channel(&b, wg_id, 2); + nir_ssa_def *global_id = get_global_ids(&b, 3); /* Load the dimensions in pixels of a block that gets compressed to one DCC byte. */ nir_ssa_def *dcc_block_size = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8); @@ -1189,10 +1172,10 @@ build_clear_dcc_comp_to_single_shader() coord = nir_imul(&b, coord, dcc_block_size); coord = nir_vec4(&b, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), - layer_id, + nir_channel(&b, global_id, 2), nir_ssa_undef(&b, 1, 32)); - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 0; @@ -1205,19 +1188,20 @@ build_clear_dcc_comp_to_single_shader() nir_channel(&b, clear_values, 1)); /* Store the clear color values. */ + nir_ssa_def *sample_id = is_msaa ? nir_imm_int(&b, 0) : nir_ssa_undef(&b, 1, 32); nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, coord, - nir_imm_int(&b, 0), data, nir_imm_int(&b, 0), - .image_dim = GLSL_SAMPLER_DIM_2D, .image_array = true); + sample_id, data, nir_imm_int(&b, 0), + .image_dim = dim, .image_array = true); return b.shader; } static VkResult -create_dcc_comp_to_single_pipeline(struct radv_device *device, VkPipeline *pipeline) +create_dcc_comp_to_single_pipeline(struct radv_device *device, bool is_msaa, VkPipeline *pipeline) { struct radv_meta_state *state = &device->meta_state; VkResult result; - nir_shader *cs = build_clear_dcc_comp_to_single_shader(); + nir_shader *cs = build_clear_dcc_comp_to_single_shader(is_msaa); VkPipelineShaderStageCreateInfo shader_stage = { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, @@ -1283,9 +1267,12 @@ init_meta_clear_dcc_comp_to_single_state(struct radv_device *device) if (result != VK_SUCCESS) goto fail; - result = create_dcc_comp_to_single_pipeline(device, &state->clear_dcc_comp_to_single_pipeline); - if (result != VK_SUCCESS) - goto fail; + for (uint32_t i = 0; i < 2; i++) { + result = create_dcc_comp_to_single_pipeline(device, !!i, + &state->clear_dcc_comp_to_single_pipeline[i]); + if (result != VK_SUCCESS) + goto fail; + } fail: return result; @@ -1528,6 +1515,7 @@ radv_clear_dcc_comp_to_single(struct radv_cmd_buffer *cmd_buffer, unsigned bytes_per_pixel = vk_format_get_blocksize(image->vk_format); unsigned layer_count = radv_get_layerCount(image, range); struct radv_meta_saved_state saved_state; + bool is_msaa = image->info.samples > 1; struct radv_image_view iview; VkFormat format; @@ -1555,7 +1543,7 @@ radv_clear_dcc_comp_to_single(struct radv_cmd_buffer *cmd_buffer, &saved_state, cmd_buffer, RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS); - VkPipeline pipeline = device->meta_state.clear_dcc_comp_to_single_pipeline; + VkPipeline pipeline = device->meta_state.clear_dcc_comp_to_single_pipeline[is_msaa]; 
radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); @@ -1620,6 +1608,8 @@ radv_clear_dcc_comp_to_single(struct radv_cmd_buffer *cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, 0, 16, constants); radv_unaligned_dispatch(cmd_buffer, dcc_width, dcc_height, layer_count); + + radv_image_view_finish(&iview); } radv_meta_restore(&saved_state, cmd_buffer); @@ -2118,7 +2108,8 @@ radv_cmd_buffer_clear_subpass(struct radv_cmd_buffer *cmd_buffer) static void radv_clear_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout image_layout, const VkImageSubresourceRange *range, - VkFormat format, int level, int layer, const VkClearValue *clear_val) + VkFormat format, int level, unsigned layer_count, + const VkClearValue *clear_val) { VkDevice device_h = radv_device_to_handle(cmd_buffer->device); struct radv_image_view iview; @@ -2134,8 +2125,8 @@ radv_clear_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_image *im .subresourceRange = {.aspectMask = range->aspectMask, .baseMipLevel = range->baseMipLevel + level, .levelCount = 1, - .baseArrayLayer = range->baseArrayLayer + layer, - .layerCount = 1}, + .baseArrayLayer = range->baseArrayLayer, + .layerCount = layer_count}, }, NULL); @@ -2150,7 +2141,7 @@ radv_clear_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_image *im }, .width = width, .height = height, - .layers = 1}, + .layers = layer_count}, &cmd_buffer->pool->alloc, &fb); VkAttachmentDescription2 att_desc = { @@ -2255,12 +2246,13 @@ radv_clear_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_image *im .offset = {0, 0}, .extent = {width, height}, }, - .baseArrayLayer = range->baseArrayLayer, - .layerCount = 1, /* FINISHME: clear multi-layer framebuffer */ + .baseArrayLayer = 0, + .layerCount = layer_count, }; emit_clear(cmd_buffer, &clear_att, &clear_rect, NULL, NULL, 0, false); + radv_image_view_finish(&iview); radv_cmd_buffer_end_render_pass(cmd_buffer); radv_DestroyRenderPass(device_h, pass, &cmd_buffer->pool->alloc); radv_DestroyFramebuffer(device_h, fb, &cmd_buffer->pool->alloc); @@ -2275,6 +2267,7 @@ radv_fast_clear_range(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima const VkImageSubresourceRange *range, const VkClearValue *clear_val) { struct radv_image_view iview; + bool fast_cleared = false; radv_image_view_init(&iview, cmd_buffer->device, &(VkImageViewCreateInfo){ @@ -2318,18 +2311,19 @@ radv_fast_clear_range(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima clear_att.clearValue.color, 0)) { radv_fast_clear_color(cmd_buffer, &iview, &clear_att, clear_att.colorAttachment, NULL, NULL); - return true; + fast_cleared = true; } } else { if (radv_can_fast_clear_depth(cmd_buffer, &iview, image_layout, in_render_loop, range->aspectMask, &clear_rect, clear_att.clearValue.depthStencil, 0)) { radv_fast_clear_depth(cmd_buffer, &iview, &clear_att, NULL, NULL); - return true; + fast_cleared = true; } } - return false; + radv_image_view_finish(&iview); + return fast_cleared; } static void @@ -2394,9 +2388,9 @@ radv_cmd_clear_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *imag const uint32_t layer_count = image->type == VK_IMAGE_TYPE_3D ? 
radv_minify(image->info.depth, range->baseMipLevel + l) : radv_get_layerCount(image, range); - for (uint32_t s = 0; s < layer_count; ++s) { - if (cs) { + if (cs) { + for (uint32_t s = 0; s < layer_count; ++s) { struct radv_meta_blit2d_surf surf; surf.format = format; surf.image = image; @@ -2405,11 +2399,11 @@ radv_cmd_clear_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *imag surf.aspect_mask = range->aspectMask; surf.disable_compression = disable_compression; radv_meta_clear_image_cs(cmd_buffer, &surf, &internal_clear_value.color); - } else { - assert(!disable_compression); - radv_clear_image_layer(cmd_buffer, image, image_layout, range, format, l, s, - &internal_clear_value); } + } else { + assert(!disable_compression); + radv_clear_image_layer(cmd_buffer, image, image_layout, range, format, l, layer_count, + &internal_clear_value); } } } diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_copy.c b/mesa 3D driver/src/amd/vulkan/radv_meta_copy.c index a50818d1dd..22aa69bca1 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_copy.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_copy.c @@ -405,6 +405,33 @@ copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *src_image, old_predicating = cmd_buffer->state.predicating; cmd_buffer->state.predicating = false; + if (cs) { + /* For partial copies, HTILE should be decompressed before copying because the metadata is + * re-initialized to the uncompressed state after. + */ + uint32_t queue_mask = radv_image_queue_family_mask(dst_image, cmd_buffer->queue_family_index, + cmd_buffer->queue_family_index); + + if (radv_layout_is_htile_compressed(cmd_buffer->device, dst_image, dst_image_layout, + false, queue_mask) && + (region->dstOffset.x || region->dstOffset.y || region->dstOffset.z || + region->extent.width != dst_image->info.width || + region->extent.height != dst_image->info.height || + region->extent.depth != dst_image->info.depth)) { + u_foreach_bit(i, region->dstSubresource.aspectMask) { + unsigned aspect_mask = 1u << i; + radv_expand_depth_stencil(cmd_buffer, dst_image, + &(VkImageSubresourceRange){ + .aspectMask = aspect_mask, + .baseMipLevel = region->dstSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = region->dstSubresource.baseArrayLayer, + .layerCount = region->dstSubresource.layerCount, + }, NULL); + } + } + } + VkImageAspectFlags src_aspects[3] = {VK_IMAGE_ASPECT_PLANE_0_BIT, VK_IMAGE_ASPECT_PLANE_1_BIT, VK_IMAGE_ASPECT_PLANE_2_BIT}; VkImageAspectFlags dst_aspects[3] = {VK_IMAGE_ASPECT_PLANE_0_BIT, VK_IMAGE_ASPECT_PLANE_1_BIT, @@ -520,6 +547,30 @@ copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *src_image, } } + if (cs) { + /* Fixup HTILE after a copy on compute. 
*/ + uint32_t queue_mask = radv_image_queue_family_mask(dst_image, cmd_buffer->queue_family_index, + cmd_buffer->queue_family_index); + + if (radv_layout_is_htile_compressed(cmd_buffer->device, dst_image, dst_image_layout, + false, queue_mask)) { + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE; + + VkImageSubresourceRange range = { + .aspectMask = region->dstSubresource.aspectMask, + .baseMipLevel = region->dstSubresource.mipLevel, + .levelCount = 1, + .baseArrayLayer = region->dstSubresource.baseArrayLayer, + .layerCount = region->dstSubresource.layerCount, + }; + + uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, dst_image); + + cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, dst_image, &range, htile_value); + } + } + /* Restore conditional rendering. */ cmd_buffer->state.predicating = old_predicating; diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_copy_vrs_htile.c b/mesa 3D driver/src/amd/vulkan/radv_meta_copy_vrs_htile.c index adf98c7135..65d683e98e 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_copy_vrs_htile.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_copy_vrs_htile.c @@ -49,23 +49,17 @@ build_copy_vrs_htile_shader(struct radv_device *device, struct radeon_surf *surf b.shader->info.workgroup_size[1] = 8; b.shader->info.workgroup_size[2] = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - /* Get coordinates. */ - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - nir_ssa_def *coord = nir_channels(&b, global_id, 0x3); + nir_ssa_def *global_id = get_global_ids(&b, 2); /* Multiply the coordinates by the HTILE block size. */ - coord = nir_imul(&b, coord, nir_imm_ivec2(&b, 8, 8)); + nir_ssa_def *coord = nir_imul(&b, global_id, nir_imm_ivec2(&b, 8, 8)); /* Load constants. */ - nir_ssa_def *constants = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8); + nir_ssa_def *constants = nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 0), .range = 12); nir_ssa_def *htile_pitch = nir_channel(&b, constants, 0); nir_ssa_def *htile_slice_size = nir_channel(&b, constants, 1); + nir_ssa_def *read_htile_value = nir_channel(&b, constants, 2); /* Get the HTILE addr from coordinates. */ nir_ssa_def *zero = nir_imm_int(&b, 0); @@ -88,7 +82,7 @@ build_copy_vrs_htile_shader(struct radv_device *device, struct radeon_surf *surf tex->sampler_dim = GLSL_SAMPLER_DIM_2D; tex->op = nir_texop_txf; tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(nir_channels(&b, global_id, 0x3)); + tex->src[0].src = nir_src_for_ssa(global_id); tex->src[1].src_type = nir_tex_src_lod; tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0)); tex->src[2].src_type = nir_tex_src_texture_deref; @@ -119,17 +113,28 @@ build_copy_vrs_htile_shader(struct radv_device *device, struct radeon_surf *surf /* Load the HTILE buffer descriptor. */ nir_ssa_def *htile_buf = radv_meta_load_descriptor(&b, 0, 1); - /* Load the existing HTILE 32-bit value for this 8x8 pixels area. */ - nir_ssa_def *htile_value = nir_load_ssbo(&b, 1, 32, htile_buf, htile_addr, .align_mul = 4); + /* Load the HTILE value if requested, otherwise use the default value. 
*/ + nir_variable *htile_value = nir_local_variable_create(b.impl, glsl_int_type(), "htile_value"); - /* Clear the 4-bit VRS rates. */ - htile_value = nir_iand(&b, htile_value, nir_imm_int(&b, 0xfffff33f)); + nir_push_if(&b, nir_ieq(&b, read_htile_value, nir_imm_int(&b, 1))); + { + /* Load the existing HTILE 32-bit value for this 8x8 pixels area. */ + nir_ssa_def *input_value = nir_load_ssbo(&b, 1, 32, htile_buf, htile_addr, .align_mul = 4); + + /* Clear the 4-bit VRS rates. */ + nir_store_var(&b, htile_value, nir_iand(&b, input_value, nir_imm_int(&b, 0xfffff33f)), 0x1); + } + nir_push_else(&b, NULL); + { + nir_store_var(&b, htile_value, nir_imm_int(&b, 0xfffff33f), 0x1); + } + nir_pop_if(&b, NULL); /* Set the VRS rates loaded from the image. */ - htile_value = nir_ior(&b, htile_value, vrs_rates); + nir_ssa_def *output_value = nir_ior(&b, nir_load_var(&b, htile_value), vrs_rates); /* Store the updated HTILE 32-bit which contains the VRS rates. */ - nir_store_ssbo(&b, htile_value, htile_buf, htile_addr, .write_mask = 0x1, + nir_store_ssbo(&b, output_value, htile_buf, htile_addr, .write_mask = 0x1, .access = ACCESS_NON_READABLE, .align_mul = 4); return b.shader; @@ -174,7 +179,7 @@ radv_device_init_meta_copy_vrs_htile_state(struct radv_device *device, &(VkPushConstantRange){ VK_SHADER_STAGE_COMPUTE_BIT, 0, - 8, + 12, }, }; @@ -208,7 +213,8 @@ radv_device_init_meta_copy_vrs_htile_state(struct radv_device *device, void radv_copy_vrs_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *vrs_image, - VkExtent2D *extent, struct radv_image *dst_image) + VkExtent2D *extent, struct radv_image *dst_image, + struct radv_buffer *htile_buffer, bool read_htile_value) { struct radv_device *device = cmd_buffer->device; struct radv_meta_state *state = &device->meta_state; @@ -237,13 +243,6 @@ radv_copy_vrs_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *vrs_i radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, state->copy_vrs_htile_pipeline); - /* HTILE buffer */ - uint64_t htile_offset = dst_image->offset + dst_image->planes[0].surface.meta_offset; - uint64_t htile_size = dst_image->planes[0].surface.meta_slice_size; - struct radv_buffer htile_buffer = {.bo = dst_image->bo, - .offset = htile_offset, - .size = htile_size}; - radv_image_view_init(&vrs_iview, cmd_buffer->device, &(VkImageViewCreateInfo){ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, @@ -280,22 +279,25 @@ radv_copy_vrs_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *vrs_i .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&htile_buffer), + .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(htile_buffer), .offset = 0, - .range = htile_size}}}); + .range = htile_buffer->size}}}); - const unsigned constants[2] = { + const unsigned constants[3] = { dst_image->planes[0].surface.meta_pitch, dst_image->planes[0].surface.meta_slice_size, + read_htile_value, }; radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->copy_vrs_htile_p_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, 8, constants); + VK_SHADER_STAGE_COMPUTE_BIT, 0, 12, constants); uint32_t width = DIV_ROUND_UP(extent->width, 8); uint32_t height = DIV_ROUND_UP(extent->height, 8); radv_unaligned_dispatch(cmd_buffer, width, height, 1); + radv_image_view_finish(&vrs_iview); + radv_meta_restore(&saved_state, cmd_buffer); cmd_buffer->state.flush_bits |= diff --git a/mesa 
3D driver/src/amd/vulkan/radv_meta_dcc_retile.c b/mesa 3D driver/src/amd/vulkan/radv_meta_dcc_retile.c index 9fbd958114..1240015e8e 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_dcc_retile.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_dcc_retile.c @@ -27,26 +27,11 @@ #include "radv_meta.h" #include "radv_private.h" -static nir_ssa_def * -get_global_ids(nir_builder *b, unsigned num_components) -{ - unsigned mask = BITFIELD_MASK(num_components); - - nir_ssa_def *local_ids = nir_channels(b, nir_load_local_invocation_id(b), mask); - nir_ssa_def *block_ids = nir_channels(b, nir_load_workgroup_id(b, 32), mask); - nir_ssa_def *block_size = nir_channels( - b, - nir_imm_ivec4(b, b->shader->info.workgroup_size[0], b->shader->info.workgroup_size[1], - b->shader->info.workgroup_size[2], 0), - mask); - - return nir_iadd(b, nir_imul(b, block_ids, block_size), local_ids); -} - static nir_shader * build_dcc_retile_compute_shader(struct radv_device *dev, struct radeon_surf *surf) { - const struct glsl_type *buf_type = glsl_image_type(GLSL_SAMPLER_DIM_BUF, false, GLSL_TYPE_UINT); + enum glsl_sampler_dim dim = GLSL_SAMPLER_DIM_BUF; + const struct glsl_type *buf_type = glsl_image_type(dim, false, GLSL_TYPE_UINT); nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "dcc_retile_compute"); b.shader->info.workgroup_size[0] = 8; @@ -85,28 +70,14 @@ build_dcc_retile_compute_shader(struct radv_device *dev, struct radeon_surf *sur dst_dcc_pitch, dst_dcc_height, zero, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), zero, zero, zero); - nir_intrinsic_instr *dcc_val = - nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_load); - dcc_val->num_components = 1; - dcc_val->src[0] = nir_src_for_ssa(input_dcc_ref); - dcc_val->src[1] = nir_src_for_ssa(nir_vec4(&b, src, src, src, src)); - dcc_val->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); - dcc_val->src[3] = nir_src_for_ssa(nir_imm_int(&b, 0)); - nir_ssa_dest_init(&dcc_val->instr, &dcc_val->dest, 1, 32, "dcc_val"); - nir_intrinsic_set_image_dim(dcc_val, GLSL_SAMPLER_DIM_BUF); - nir_builder_instr_insert(&b, &dcc_val->instr); + nir_ssa_def *dcc_val = nir_image_deref_load(&b, 1, 32, input_dcc_ref, + nir_vec4(&b, src, src, src, src), + nir_ssa_undef(&b, 1, 32), nir_imm_int(&b, 0), + .image_dim = dim); - nir_intrinsic_instr *store = - nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_store); - store->num_components = 1; - store->src[0] = nir_src_for_ssa(output_dcc_ref); - store->src[1] = nir_src_for_ssa(nir_vec4(&b, dst, dst, dst, dst)); - store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32)); - store->src[3] = nir_src_for_ssa(&dcc_val->dest.ssa); - store->src[4] = nir_src_for_ssa(nir_imm_int(&b, 0)); - nir_intrinsic_set_image_dim(store, GLSL_SAMPLER_DIM_BUF); + nir_image_deref_store(&b, output_dcc_ref, nir_vec4(&b, dst, dst, dst, dst), + nir_ssa_undef(&b, 1, 32), dcc_val, nir_imm_int(&b, 0), .image_dim = dim); - nir_builder_instr_insert(&b, &store->instr); return b.shader; } @@ -115,7 +86,10 @@ radv_device_finish_meta_dcc_retile_state(struct radv_device *device) { struct radv_meta_state *state = &device->meta_state; - radv_DestroyPipeline(radv_device_to_handle(device), state->dcc_retile.pipeline, &state->alloc); + for (unsigned i = 0; i < ARRAY_SIZE(state->dcc_retile.pipeline); i++) { + radv_DestroyPipeline(radv_device_to_handle(device), state->dcc_retile.pipeline[i], + &state->alloc); + } radv_DestroyPipelineLayout(radv_device_to_handle(device), state->dcc_retile.p_layout, &state->alloc); 
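/* Editorial note (annotation, not patch content): dcc_retile.pipeline becomes
 * an array keyed by the surface's GFX9 swizzle mode (see the init and bind
 * hunks that follow), so the finish path above loops over
 * ARRAY_SIZE(state->dcc_retile.pipeline) and destroys each lazily-compiled
 * variant; vkDestroyPipeline accepts VK_NULL_HANDLE as a no-op, so entries
 * that were never compiled are harmless to destroy. The array declaration
 * lives in radv_private.h, which is not part of this excerpt. */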
radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), state->dcc_retile.ds_layout, @@ -131,9 +105,7 @@ radv_device_finish_meta_dcc_retile_state(struct radv_device *device) * - DCC equations * - DCC block size * - * BPE is always 4 at the moment and the rest is derived from the tilemode, - * and ac_surface limits displayable DCC to at most 1 tiling mode. So in effect - * this shader is indepedent of the surface. + * BPE is always 4 at the moment and the rest is derived from the tilemode. */ static VkResult radv_device_init_meta_dcc_retile_state(struct radv_device *device, struct radeon_surf *surf) @@ -197,7 +169,7 @@ radv_device_init_meta_dcc_retile_state(struct radv_device *device, struct radeon result = radv_CreateComputePipelines( radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, - &vk_pipeline_info, NULL, &device->meta_state.dcc_retile.pipeline); + &vk_pipeline_info, NULL, &device->meta_state.dcc_retile.pipeline[surf->u.gfx9.swizzle_mode]); if (result != VK_SUCCESS) goto cleanup; @@ -213,6 +185,7 @@ radv_retile_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image) { struct radv_meta_saved_state saved_state; struct radv_device *device = cmd_buffer->device; + struct radv_buffer buffer; assert(image->type == VK_IMAGE_TYPE_2D); assert(image->info.array_size == 1 && image->info.levels == 1); @@ -222,8 +195,10 @@ radv_retile_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image) state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_READ_BIT, image) | radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, image); + unsigned swizzle_mode = image->planes[0].surface.u.gfx9.swizzle_mode; + /* Compile pipelines if not already done so. */ - if (!cmd_buffer->device->meta_state.dcc_retile.pipeline) { + if (!cmd_buffer->device->meta_state.dcc_retile.pipeline[swizzle_mode]) { VkResult ret = radv_device_init_meta_dcc_retile_state(cmd_buffer->device, &image->planes[0].surface); if (ret != VK_SUCCESS) { @@ -237,9 +212,9 @@ radv_retile_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image) RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, - device->meta_state.dcc_retile.pipeline); + device->meta_state.dcc_retile.pipeline[swizzle_mode]); - struct radv_buffer buffer = {.size = image->size, .bo = image->bo, .offset = image->offset}; + radv_buffer_init(&buffer, device, image->bo, image->size, image->offset); struct radv_buffer_view views[2]; VkBufferView view_handles[2]; @@ -303,6 +278,10 @@ radv_retile_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image) radv_unaligned_dispatch(cmd_buffer, dcc_width, dcc_height, 1); + radv_buffer_view_finish(views); + radv_buffer_view_finish(views + 1); + radv_buffer_finish(&buffer); + radv_meta_restore(&saved_state, cmd_buffer); state->flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_decompress.c b/mesa 3D driver/src/amd/vulkan/radv_meta_decompress.c index 6f0b644531..43ffa184b8 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_decompress.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_decompress.c @@ -33,6 +33,123 @@ enum radv_depth_op { DEPTH_RESUMMARIZE, }; +static nir_shader * +build_expand_depth_stencil_compute_shader(struct radv_device *dev) +{ + const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_2D, false, GLSL_TYPE_FLOAT); + + nir_builder b = + 
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "expand_depth_stencil_compute"); + + /* We need at least 8/8/1 to cover an entire HTILE block in a single workgroup. */ + b.shader->info.workgroup_size[0] = 8; + b.shader->info.workgroup_size[1] = 8; + b.shader->info.workgroup_size[2] = 1; + nir_variable *input_img = nir_variable_create(b.shader, nir_var_image, img_type, "in_img"); + input_img->data.descriptor_set = 0; + input_img->data.binding = 0; + + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); + output_img->data.descriptor_set = 0; + output_img->data.binding = 1; + + nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); + nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); + nir_ssa_def *block_size = + nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], + b.shader->info.workgroup_size[2], 0); + + nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + + nir_ssa_def *data = nir_image_deref_load( + &b, 4, 32, &nir_build_deref_var(&b, input_img)->dest.ssa, global_id, nir_ssa_undef(&b, 1, 32), + nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D); + + /* We need a NIR_SCOPE_DEVICE memory_scope because ACO will avoid + * creating a vmcnt(0) because it expects the L1 cache to keep memory + * operations in-order for the same workgroup. The vmcnt(0) seems + * necessary however. */ + nir_scoped_barrier(&b, .execution_scope = NIR_SCOPE_WORKGROUP, .memory_scope = NIR_SCOPE_DEVICE, + .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_ssbo); + + nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, global_id, + nir_ssa_undef(&b, 1, 32), data, nir_imm_int(&b, 0), + .image_dim = GLSL_SAMPLER_DIM_2D); + return b.shader; +} + +static VkResult +create_expand_depth_stencil_compute(struct radv_device *device) +{ + VkResult result = VK_SUCCESS; + nir_shader *cs = build_expand_depth_stencil_compute_shader(device); + + VkDescriptorSetLayoutCreateInfo ds_create_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, + .bindingCount = 2, + .pBindings = (VkDescriptorSetLayoutBinding[]){ + {.binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL}, + {.binding = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL}, + }}; + + result = radv_CreateDescriptorSetLayout( + radv_device_to_handle(device), &ds_create_info, &device->meta_state.alloc, + &device->meta_state.expand_depth_stencil_compute_ds_layout); + if (result != VK_SUCCESS) + goto cleanup; + + VkPipelineLayoutCreateInfo pl_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &device->meta_state.expand_depth_stencil_compute_ds_layout, + .pushConstantRangeCount = 0, + .pPushConstantRanges = NULL, + }; + + result = radv_CreatePipelineLayout( + radv_device_to_handle(device), &pl_create_info, &device->meta_state.alloc, + &device->meta_state.expand_depth_stencil_compute_p_layout); + if (result != VK_SUCCESS) + goto cleanup; + + /* compute shader */ + + VkPipelineShaderStageCreateInfo pipeline_shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = 
vk_shader_module_handle_from_nir(cs), + .pName = "main", + .pSpecializationInfo = NULL, + }; + + VkComputePipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = pipeline_shader_stage, + .flags = 0, + .layout = device->meta_state.expand_depth_stencil_compute_p_layout, + }; + + result = radv_CreateComputePipelines( + radv_device_to_handle(device), radv_pipeline_cache_to_handle(&device->meta_state.cache), 1, + &vk_pipeline_info, NULL, + &device->meta_state.expand_depth_stencil_compute_pipeline); + if (result != VK_SUCCESS) + goto cleanup; + +cleanup: + ralloc_free(cs); + return result; +} + static VkResult create_pass(struct radv_device *device, uint32_t samples, VkRenderPass *pass) { @@ -263,6 +380,13 @@ radv_device_finish_meta_depth_decomp_state(struct radv_device *device) radv_DestroyPipeline(radv_device_to_handle(device), state->depth_decomp[i].resummarize_pipeline, &state->alloc); } + + radv_DestroyPipeline(radv_device_to_handle(device), + state->expand_depth_stencil_compute_pipeline, &state->alloc); + radv_DestroyPipelineLayout(radv_device_to_handle(device), + state->expand_depth_stencil_compute_p_layout, &state->alloc); + radv_DestroyDescriptorSetLayout(radv_device_to_handle(device), + state->expand_depth_stencil_compute_ds_layout, &state->alloc); } VkResult @@ -298,6 +422,10 @@ radv_device_init_meta_depth_decomp_state(struct radv_device *device, bool on_dem goto fail; } + res = create_expand_depth_stencil_compute(device); + if (res != VK_SUCCESS) + goto fail; + return VK_SUCCESS; fail: @@ -413,6 +541,7 @@ radv_process_depth_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_i radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0); radv_cmd_buffer_end_render_pass(cmd_buffer); + radv_image_view_finish(&iview); radv_DestroyFramebuffer(radv_device_to_handle(device), fb_h, &cmd_buffer->pool->alloc); } @@ -481,18 +610,130 @@ radv_process_depth_stencil(struct radv_cmd_buffer *cmd_buffer, struct radv_image radv_meta_restore(&saved_state, cmd_buffer); } +static void +radv_expand_depth_stencil_compute(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, + const VkImageSubresourceRange *subresourceRange) +{ + struct radv_meta_saved_state saved_state; + struct radv_image_view load_iview = {0}; + struct radv_image_view store_iview = {0}; + struct radv_device *device = cmd_buffer->device; + + assert(radv_image_is_tc_compat_htile(image)); + + cmd_buffer->state.flush_bits |= + radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, image); + + radv_meta_save(&saved_state, cmd_buffer, + RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_COMPUTE_PIPELINE); + + radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.expand_depth_stencil_compute_pipeline); + + for (uint32_t l = 0; l < radv_get_levelCount(image, subresourceRange); l++) { + uint32_t width, height; + + /* Do not decompress levels without HTILE. 
*/ + if (!radv_htile_enabled(image, subresourceRange->baseMipLevel + l)) + continue; + + width = radv_minify(image->info.width, subresourceRange->baseMipLevel + l); + height = radv_minify(image->info.height, subresourceRange->baseMipLevel + l); + + for (uint32_t s = 0; s < radv_get_layerCount(image, subresourceRange); s++) { + radv_image_view_init( + &load_iview, cmd_buffer->device, + &(VkImageViewCreateInfo){ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(image), + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = image->vk_format, + .subresourceRange = {.aspectMask = subresourceRange->aspectMask, + .baseMipLevel = subresourceRange->baseMipLevel + l, + .levelCount = 1, + .baseArrayLayer = subresourceRange->baseArrayLayer + s, + .layerCount = 1}, + }, + &(struct radv_image_view_extra_create_info){.enable_compression = true}); + radv_image_view_init( + &store_iview, cmd_buffer->device, + &(VkImageViewCreateInfo){ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = radv_image_to_handle(image), + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = image->vk_format, + .subresourceRange = {.aspectMask = subresourceRange->aspectMask, + .baseMipLevel = subresourceRange->baseMipLevel + l, + .levelCount = 1, + .baseArrayLayer = subresourceRange->baseArrayLayer + s, + .layerCount = 1}, + }, + &(struct radv_image_view_extra_create_info){.disable_compression = true}); + + radv_meta_push_descriptor_set( + cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, + device->meta_state.expand_depth_stencil_compute_p_layout, 0, /* set */ + 2, /* descriptorWriteCount */ + (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .pImageInfo = + (VkDescriptorImageInfo[]){ + { + .sampler = VK_NULL_HANDLE, + .imageView = radv_image_view_to_handle(&load_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + }}, + {.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .pImageInfo = (VkDescriptorImageInfo[]){ + { + .sampler = VK_NULL_HANDLE, + .imageView = radv_image_view_to_handle(&store_iview), + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }, + }}}); + + radv_unaligned_dispatch(cmd_buffer, width, height, 1); + + radv_image_view_finish(&load_iview); + radv_image_view_finish(&store_iview); + } + } + + radv_meta_restore(&saved_state, cmd_buffer); + + cmd_buffer->state.flush_bits |= + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | + radv_src_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, image); + + /* Initialize the HTILE metadata as "fully expanded". 
*/ + uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image); + + cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, image, subresourceRange, htile_value); +} + void -radv_decompress_depth_stencil(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, - const VkImageSubresourceRange *subresourceRange, - struct radv_sample_locations_state *sample_locs) +radv_expand_depth_stencil(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, + const VkImageSubresourceRange *subresourceRange, + struct radv_sample_locations_state *sample_locs) { struct radv_barrier_data barrier = {0}; barrier.layout_transitions.depth_stencil_expand = 1; radv_describe_layout_transition(cmd_buffer, &barrier); - assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL); - radv_process_depth_stencil(cmd_buffer, image, subresourceRange, sample_locs, DEPTH_DECOMPRESS); + if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) { + radv_process_depth_stencil(cmd_buffer, image, subresourceRange, sample_locs, DEPTH_DECOMPRESS); + } else { + radv_expand_depth_stencil_compute(cmd_buffer, image, subresourceRange); + } } void diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_fast_clear.c b/mesa 3D driver/src/amd/vulkan/radv_meta_fast_clear.c index 0c257e8085..6beb898f69 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_fast_clear.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_fast_clear.c @@ -46,24 +46,22 @@ build_dcc_decompress_compute_shader(struct radv_device *dev) b.shader->info.workgroup_size[0] = 16; b.shader->info.workgroup_size[1] = 16; b.shader->info.workgroup_size[2] = 1; - nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "in_img"); + nir_variable *input_img = nir_variable_create(b.shader, nir_var_image, img_type, "in_img"); input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 2); + nir_ssa_def *img_coord = nir_vec4(&b, nir_channel(&b, global_id, 0), + nir_channel(&b, global_id, 1), + nir_ssa_undef(&b, 1, 32), + nir_ssa_undef(&b, 1, 32)); nir_ssa_def *data = nir_image_deref_load( - &b, 4, 32, &nir_build_deref_var(&b, input_img)->dest.ssa, global_id, nir_ssa_undef(&b, 1, 32), + &b, 4, 32, &nir_build_deref_var(&b, input_img)->dest.ssa, img_coord, nir_ssa_undef(&b, 1, 32), nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D); /* We need a NIR_SCOPE_DEVICE memory_scope because ACO will avoid @@ -73,7 +71,7 @@ build_dcc_decompress_compute_shader(struct radv_device *dev) nir_scoped_barrier(&b, .execution_scope = NIR_SCOPE_WORKGROUP, .memory_scope = NIR_SCOPE_DEVICE, .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = nir_var_mem_ssbo); - nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, global_id, + nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, img_coord, nir_ssa_undef(&b, 1, 32), data, nir_imm_int(&b, 0), .image_dim = 
GLSL_SAMPLER_DIM_2D); return b.shader; @@ -622,6 +620,7 @@ radv_process_color_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_i radv_cmd_buffer_end_render_pass(cmd_buffer); + radv_image_view_finish(&iview); radv_DestroyFramebuffer(radv_device_to_handle(device), fb_h, &cmd_buffer->pool->alloc); } @@ -895,6 +894,9 @@ radv_decompress_dcc_compute(struct radv_cmd_buffer *cmd_buffer, struct radv_imag }}}); radv_unaligned_dispatch(cmd_buffer, width, height, 1); + + radv_image_view_finish(&load_iview); + radv_image_view_finish(&store_iview); } } diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_fmask_expand.c b/mesa 3D driver/src/amd/vulkan/radv_meta_fmask_expand.c index a6acfdd8d2..0b75520c9a 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_fmask_expand.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_fmask_expand.c @@ -43,25 +43,15 @@ build_fmask_expand_compute_shader(struct radv_device *device, int samples) input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; output_img->data.access = ACCESS_NON_READABLE; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - nir_ssa_def *layer_id = nir_channel(&b, wg_id, 2); - nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa; nir_ssa_def *output_img_deref = &nir_build_deref_var(&b, output_img)->dest.ssa; - nir_ssa_def *tex_coord = - nir_vec3(&b, nir_channel(&b, global_id, 0), nir_channel(&b, global_id, 1), layer_id); + nir_ssa_def *tex_coord = get_global_ids(&b, 3); nir_tex_instr *tex_instr[8]; for (uint32_t i = 0; i < samples; i++) { @@ -86,7 +76,7 @@ build_fmask_expand_compute_shader(struct radv_device *device, int samples) nir_ssa_def *img_coord = nir_vec4(&b, nir_channel(&b, tex_coord, 0), nir_channel(&b, tex_coord, 1), - nir_channel(&b, tex_coord, 2), nir_imm_int(&b, 0)); + nir_channel(&b, tex_coord, 2), nir_ssa_undef(&b, 1, 32)); for (uint32_t i = 0; i < samples; i++) { nir_ssa_def *outval = &tex_instr[i]->dest.ssa; @@ -165,6 +155,8 @@ radv_expand_fmask_image_inplace(struct radv_cmd_buffer *cmd_buffer, struct radv_ radv_unaligned_dispatch(cmd_buffer, image->info.width, image->info.height, layer_count); + radv_image_view_finish(&iview); + radv_meta_restore(&saved_state, cmd_buffer); cmd_buffer->state.flush_bits |= diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_resolve.c b/mesa 3D driver/src/amd/vulkan/radv_meta_resolve.c index e955ddced9..b012a757e5 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_resolve.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_resolve.c @@ -451,16 +451,8 @@ radv_meta_resolve_hardware_image(struct radv_cmd_buffer *cmd_buffer, struct radv radv_meta_save(&saved_state, cmd_buffer, RADV_META_SAVE_GRAPHICS_PIPELINE); assert(src_image->info.samples > 1); - if (src_image->info.samples <= 1) { - /* this causes GPU hangs if we get past here */ - fprintf(stderr, "radv: Illegal resolve operation (src not multisampled), will hang GPU."); - return; - } assert(dst_image->info.samples == 1); - if (src_image->info.array_size > 
1) - radv_finishme("vkCmdResolveImage: multisample array images"); - unsigned fs_key = radv_format_meta_fs_key(device, dst_image->vk_format); /* From the Vulkan 1.0 spec: @@ -608,6 +600,8 @@ radv_meta_resolve_hardware_image(struct radv_cmd_buffer *cmd_buffer, struct radv radv_cmd_buffer_end_render_pass(cmd_buffer); + radv_image_view_finish(&src_iview); + radv_image_view_finish(&dst_iview); radv_DestroyFramebuffer(radv_device_to_handle(device), fb_h, &cmd_buffer->pool->alloc); } @@ -736,7 +730,7 @@ radv_cmd_buffer_resolve_subpass_hw(struct radv_cmd_buffer *cmd_buffer) &(VkExtent2D){fb->width, fb->height}); } - radv_cmd_buffer_set_subpass(cmd_buffer, subpass); + radv_cmd_buffer_restore_subpass(cmd_buffer, subpass); radv_meta_restore(&saved_state, cmd_buffer); } diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_cs.c b/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_cs.c index b659d4aae1..0e8bad1dde 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_cs.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_cs.c @@ -75,31 +75,32 @@ build_resolve_compute_shader(struct radv_device *dev, bool is_integer, bool is_s input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); + nir_ssa_def *global_id = get_global_ids(&b, 2); nir_ssa_def *src_offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 16); nir_ssa_def *dst_offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 8), .range = 16); - nir_ssa_def *img_coord = nir_channels(&b, nir_iadd(&b, global_id, src_offset), 0x3); + nir_ssa_def *src_coord = nir_iadd(&b, global_id, src_offset); + nir_ssa_def *dst_coord = nir_iadd(&b, global_id, dst_offset); + nir_variable *color = nir_local_variable_create(b.impl, glsl_vec4_type(), "color"); - radv_meta_build_resolve_shader_core(&b, is_integer, samples, input_img, color, img_coord); + radv_meta_build_resolve_shader_core(&b, is_integer, samples, input_img, color, src_coord); nir_ssa_def *outval = nir_load_var(&b, color); if (is_srgb) outval = radv_meta_build_resolve_srgb_conversion(&b, outval); - nir_ssa_def *coord = nir_iadd(&b, global_id, dst_offset); - nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, coord, + nir_ssa_def *img_coord = nir_vec4(&b, nir_channel(&b, dst_coord, 0), + nir_channel(&b, dst_coord, 1), + nir_ssa_undef(&b, 1, 32), + nir_ssa_undef(&b, 1, 32)); + + nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, img_coord, nir_ssa_undef(&b, 1, 32), outval, nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D); return b.shader; @@ -146,20 +147,11 @@ build_depth_stencil_resolve_compute_shader(struct radv_device *dev, int samples, input_img->data.descriptor_set = 0; input_img->data.binding = 0; - nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, img_type, "out_img"); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); 
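/* Editorial note (annotation, not patch content): this is the same one-line
 * migration applied to every meta shader in this patch. Variables accessed as
 * storage images (nir_image_deref_load/nir_image_deref_store) move from
 * nir_var_uniform to the dedicated nir_var_image mode, while purely sampled
 * variables (txf sources such as input_img here) keep nir_var_uniform.
 * Sketch of the pattern, as used just above:
 *
 *    nir_variable *output_img =
 *       nir_variable_create(b.shader, nir_var_image, img_type, "out_img");
 */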
output_img->data.descriptor_set = 0; output_img->data.binding = 1; - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - nir_ssa_def *layer_id = nir_channel(&b, wg_id, 2); - - nir_ssa_def *img_coord = - nir_vec3(&b, nir_channel(&b, global_id, 0), nir_channel(&b, global_id, 1), layer_id); + nir_ssa_def *img_coord = get_global_ids(&b, 3); nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa; @@ -228,7 +220,7 @@ build_depth_stencil_resolve_compute_shader(struct radv_device *dev, int samples, } nir_ssa_def *coord = nir_vec4(&b, nir_channel(&b, img_coord, 0), nir_channel(&b, img_coord, 1), - nir_channel(&b, img_coord, 2), nir_imm_int(&b, 0)); + nir_channel(&b, img_coord, 2), nir_ssa_undef(&b, 1, 32)); nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, coord, nir_ssa_undef(&b, 1, 32), outval, nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D, .image_array = true); @@ -758,6 +750,9 @@ radv_meta_resolve_compute_image(struct radv_cmd_buffer *cmd_buffer, struct radv_ emit_resolve(cmd_buffer, &src_iview, &dest_iview, &(VkOffset2D){srcOffset.x, srcOffset.y}, &(VkOffset2D){dstOffset.x, dstOffset.y}, &(VkExtent2D){extent.width, extent.height}); + + radv_image_view_finish(&src_iview); + radv_image_view_finish(&dest_iview); } radv_meta_restore(&saved_state, cmd_buffer); @@ -800,7 +795,7 @@ radv_cmd_buffer_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer) barrier.src_stage_mask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; barrier.src_access_mask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; barrier.dst_access_mask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - radv_subpass_barrier(cmd_buffer, &barrier); + radv_emit_subpass_barrier(cmd_buffer, &barrier); for (uint32_t i = 0; i < subpass->color_count; ++i) { struct radv_subpass_attachment src_att = subpass->color_attachments[i]; @@ -945,5 +940,8 @@ radv_depth_stencil_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer, cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, dst_image, &range, htile_value); } + radv_image_view_finish(&tsrc_iview); + radv_image_view_finish(&tdst_iview); + radv_meta_restore(&saved_state, cmd_buffer); } diff --git a/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_fs.c b/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_fs.c index d926bf6277..1733dce702 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_fs.c +++ b/mesa 3D driver/src/amd/vulkan/radv_meta_resolve_fs.c @@ -1053,6 +1053,8 @@ radv_meta_resolve_fragment_image(struct radv_cmd_buffer *cmd_buffer, struct radv radv_cmd_buffer_end_render_pass(cmd_buffer); + radv_image_view_finish(&src_iview); + radv_image_view_finish(&dest_iview); radv_DestroyFramebuffer(radv_device_to_handle(cmd_buffer->device), fb, &cmd_buffer->pool->alloc); } @@ -1076,7 +1078,7 @@ radv_cmd_buffer_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer) barrier.src_stage_mask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; barrier.src_access_mask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; barrier.dst_access_mask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; - radv_subpass_barrier(cmd_buffer, &barrier); + radv_emit_subpass_barrier(cmd_buffer, &barrier); radv_decompress_resolve_subpass_src(cmd_buffer); @@ -1106,7 +1108,7 @@ 
radv_cmd_buffer_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer) &(VkExtent2D){fb->width, fb->height}); } - radv_cmd_buffer_set_subpass(cmd_buffer, subpass); + radv_cmd_buffer_restore_subpass(cmd_buffer, subpass); radv_meta_restore(&saved_state, cmd_buffer); } @@ -1129,7 +1131,7 @@ radv_depth_stencil_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer, barrier.src_stage_mask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; barrier.src_access_mask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; barrier.dst_access_mask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; - radv_subpass_barrier(cmd_buffer, &barrier); + radv_emit_subpass_barrier(cmd_buffer, &barrier); struct radv_subpass_attachment src_att = *subpass->depth_stencil_attachment; struct radv_image_view *src_iview = cmd_buffer->state.attachments[src_att.attachment].iview; @@ -1179,7 +1181,9 @@ radv_depth_stencil_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer, emit_depth_stencil_resolve(cmd_buffer, &tsrc_iview, dst_iview, &(VkExtent2D){fb->width, fb->height}, aspects, resolve_mode); - radv_cmd_buffer_set_subpass(cmd_buffer, subpass); + radv_cmd_buffer_restore_subpass(cmd_buffer, subpass); + + radv_image_view_finish(&tsrc_iview); radv_meta_restore(&saved_state, cmd_buffer); } diff --git a/mesa 3D driver/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c b/mesa 3D driver/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c index 044aff1a91..8695f3bd04 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c +++ b/mesa 3D driver/src/amd/vulkan/radv_nir_lower_ycbcr_textures.c @@ -132,7 +132,7 @@ create_plane_tex_instr_implicit(struct ycbcr_state *state, uint32_t plane) } FALLTHROUGH; default: - nir_src_copy(&tex->src[i].src, &old_tex->src[i].src, tex); + nir_src_copy(&tex->src[i].src, &old_tex->src[i].src); break; } } diff --git a/mesa 3D driver/src/amd/vulkan/radv_nir_to_llvm.c b/mesa 3D driver/src/amd/vulkan/radv_nir_to_llvm.c index dff9f63575..46d361352a 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_nir_to_llvm.c +++ b/mesa 3D driver/src/amd/vulkan/radv_nir_to_llvm.c @@ -27,10 +27,10 @@ #include "nir/nir.h" #include "radv_debug.h" +#include "radv_llvm_helper.h" #include "radv_private.h" #include "radv_shader.h" #include "radv_shader_args.h" -#include "radv_shader_helper.h" #include "ac_binary.h" #include "ac_exp_param.h" @@ -40,12 +40,12 @@ #include "ac_shader_util.h" #include "sid.h" -#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1) - struct radv_shader_context { struct ac_llvm_context ac; const struct nir_shader *shader; struct ac_shader_abi abi; + const struct radv_nir_compiler_options *options; + struct radv_shader_info *shader_info; const struct radv_shader_args *args; gl_shader_stage stage; @@ -68,8 +68,6 @@ struct radv_shader_context { LLVMValueRef hs_ring_tess_offchip; LLVMValueRef hs_ring_tess_factor; - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; - uint64_t output_mask; LLVMValueRef gs_next_vertex[4]; @@ -115,8 +113,10 @@ create_llvm_function(struct ac_llvm_context *ctx, LLVMModuleRef module, LLVMBuil static void load_descriptor_sets(struct radv_shader_context *ctx) { - uint32_t mask = ctx->args->shader_info->desc_set_used_mask; - if (ctx->args->shader_info->need_indirect_descriptor_sets) { + struct radv_userdata_locations *user_sgprs_locs = &ctx->shader_info->user_sgprs_locs; + uint32_t mask = ctx->shader_info->desc_set_used_mask; + + if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) { LLVMValueRef desc_sets = ac_get_arg(&ctx->ac, ctx->args->descriptor_sets[0]); while 
(mask) { int i = u_bit_scan(&mask); @@ -170,7 +170,7 @@ static void create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage) { if (ctx->ac.chip_class >= GFX10) { - if (is_pre_gs_stage(stage) && ctx->args->options->key.vs_common_out.as_ngg) { + if (is_pre_gs_stage(stage) && ctx->shader_info->is_ngg) { /* On GFX10, VS is merged into GS for NGG. */ stage = MESA_SHADER_GEOMETRY; has_previous_stage = true; @@ -180,7 +180,7 @@ create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has ctx->main_function = create_llvm_function(&ctx->ac, ctx->ac.module, ctx->ac.builder, &ctx->args->ac, get_llvm_calling_convention(ctx->main_function, stage), - ctx->max_workgroup_size, ctx->args->options); + ctx->max_workgroup_size, ctx->options); ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr", LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_CONST), NULL, 0, @@ -191,7 +191,7 @@ create_function(struct radv_shader_context *ctx, gl_shader_stage stage, bool has load_descriptor_sets(ctx); if (stage == MESA_SHADER_TESS_CTRL || - (stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.as_ls) || + (stage == MESA_SHADER_VERTEX && ctx->shader_info->vs.as_ls) || /* GFX9 has the ESGS ring buffer in LDS. */ (stage == MESA_SHADER_GEOMETRY && has_previous_stage)) { ac_declare_lds_as_pointer(&ctx->ac); @@ -204,7 +204,7 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, unsigned desc_ { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set]; - struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout; + struct radv_pipeline_layout *pipeline_layout = ctx->options->layout; struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; unsigned base_offset = layout->binding[binding].offset; LLVMValueRef offset, stride; @@ -263,7 +263,7 @@ load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id) ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ac_array_in_const_addr_space(ctx->ac.v2f32), ""); - uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->args->options->key.fs.num_samples); + uint32_t sample_pos_offset = radv_get_sample_pos_offset(ctx->options->key.ps.num_samples); sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), ""); @@ -278,10 +278,10 @@ load_sample_mask_in(struct ac_shader_abi *abi) struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); uint8_t log2_ps_iter_samples; - if (ctx->args->shader_info->ps.uses_sample_shading) { - log2_ps_iter_samples = util_logbase2(ctx->args->options->key.fs.num_samples); + if (ctx->shader_info->ps.uses_sample_shading) { + log2_ps_iter_samples = util_logbase2(ctx->options->key.ps.num_samples); } else { - log2_ps_iter_samples = ctx->args->options->key.fs.log2_ps_iter_samples; + log2_ps_iter_samples = ctx->options->key.ps.log2_ps_iter_samples; } LLVMValueRef result, sample_id; @@ -308,14 +308,14 @@ visit_emit_vertex_with_counter(struct ac_shader_abi *abi, unsigned stream, LLVMV unsigned offset = 0; struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->args->options->key.vs_common_out.as_ngg) { + if (ctx->shader_info->is_ngg) { gfx10_ngg_gs_emit_vertex(ctx, stream, vertexidx, addrs); return; } for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; - uint8_t output_stream = 
ctx->args->shader_info->gs.output_streams[i]; + unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i]; + uint8_t output_stream = ctx->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -353,7 +353,7 @@ visit_end_primitive(struct ac_shader_abi *abi, unsigned stream) { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - if (ctx->args->options->key.vs_common_out.as_ngg) { + if (ctx->shader_info->is_ngg) { LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); return; } @@ -362,25 +362,6 @@ visit_end_primitive(struct ac_shader_abi *abi, unsigned stream) ctx->gs_wave_id); } -static LLVMValueRef -load_tess_coord(struct ac_shader_abi *abi) -{ - struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); - - LLVMValueRef coord[4] = { - ac_get_arg(&ctx->ac, ctx->args->ac.tes_u), - ac_get_arg(&ctx->ac, ctx->args->ac.tes_v), - ctx->ac.f32_0, - ctx->ac.f32_0, - }; - - if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) - coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, - LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), ""); - - return ac_build_gather_values(&ctx->ac, coord, 3); -} - static LLVMValueRef load_ring_tess_factors(struct ac_shader_abi *abi) { @@ -427,7 +408,7 @@ get_desc_ptr(struct radv_shader_context *ctx, LLVMValueRef ptr, bool non_uniform if (non_uniform) { /* 32-bit seems to always use SMEM. addrspacecast from 32-bit -> 64-bit is broken. */ LLVMValueRef dwords[] = {ptr, - LLVMConstInt(ctx->ac.i32, ctx->args->options->address32_hi, false)}; + LLVMConstInt(ctx->ac.i32, ctx->options->address32_hi, false)}; ptr = ac_build_gather_values(&ctx->ac, dwords, 2); ptr = LLVMBuildBitCast(ctx->ac.builder, ptr, ctx->ac.i64, ""); addr_space = AC_ADDR_SPACE_CONST; @@ -460,7 +441,7 @@ radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bo LLVMValueRef result; if (valid_binding) { - struct radv_pipeline_layout *pipeline_layout = ctx->args->options->layout; + struct radv_pipeline_layout *pipeline_layout = ctx->options->layout; struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout; if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { @@ -482,7 +463,7 @@ radv_load_ubo(struct ac_shader_abi *abi, unsigned desc_set, unsigned binding, bo LLVMValueRef desc_components[4] = { LLVMBuildPtrToInt(ctx->ac.builder, buffer_ptr, ctx->ac.intptr, ""), - LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->args->options->address32_hi), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false), LLVMConstInt(ctx->ac.i32, 0xffffffff, false), LLVMConstInt(ctx->ac.i32, desc_type, false), @@ -510,7 +491,7 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsign struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef list = ctx->descriptor_sets[descriptor_set]; struct radv_descriptor_set_layout *layout = - ctx->args->options->layout->set[descriptor_set].layout; + ctx->options->layout->set[descriptor_set].layout; struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index; unsigned offset = binding->offset; unsigned stride = binding->size; @@ -520,6 +501,9 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsign assert(base_index < layout->binding_count); + if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE && desc_type == AC_DESC_FMASK) + return 
NULL; + switch (desc_type) { case AC_DESC_IMAGE: type = ctx->ac.v8i32; @@ -602,7 +586,7 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsign components[i] = ac_llvm_extract_elem(&ctx->ac, descriptor2, i); descriptor = ac_build_gather_values(&ctx->ac, components, 8); } else if (desc_type == AC_DESC_IMAGE && - ctx->args->options->has_image_load_dcc_bug && + ctx->options->has_image_load_dcc_bug && image && !write) { LLVMValueRef components[8]; @@ -624,14 +608,14 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, unsign static LLVMValueRef adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, LLVMValueRef alpha) { - if (adjustment == AC_FETCH_FORMAT_NONE) + if (adjustment == ALPHA_ADJUST_NONE) return alpha; LLVMValueRef c30 = LLVMConstInt(ctx->ac.i32, 30, 0); alpha = LLVMBuildBitCast(ctx->ac.builder, alpha, ctx->ac.f32, ""); - if (adjustment == AC_FETCH_FORMAT_SSCALED) + if (adjustment == ALPHA_ADJUST_SSCALED) alpha = LLVMBuildFPToUI(ctx->ac.builder, alpha, ctx->ac.i32, ""); else alpha = ac_to_integer(&ctx->ac, alpha); @@ -644,17 +628,17 @@ adjust_vertex_fetch_alpha(struct radv_shader_context *ctx, unsigned adjustment, */ alpha = LLVMBuildShl(ctx->ac.builder, alpha, - adjustment == AC_FETCH_FORMAT_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); + adjustment == ALPHA_ADJUST_SNORM ? LLVMConstInt(ctx->ac.i32, 7, 0) : c30, ""); alpha = LLVMBuildAShr(ctx->ac.builder, alpha, c30, ""); /* Convert back to the right type. */ - if (adjustment == AC_FETCH_FORMAT_SNORM) { + if (adjustment == ALPHA_ADJUST_SNORM) { LLVMValueRef clamp; LLVMValueRef neg_one = LLVMConstReal(ctx->ac.f32, -1.0); alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, ""); clamp = LLVMBuildFCmp(ctx->ac.builder, LLVMRealULT, alpha, neg_one, ""); alpha = LLVMBuildSelect(ctx->ac.builder, clamp, neg_one, alpha, ""); - } else if (adjustment == AC_FETCH_FORMAT_SSCALED) { + } else if (adjustment == ALPHA_ADJUST_SSCALED) { alpha = LLVMBuildSIToFP(ctx->ac.builder, alpha, ctx->ac.f32, ""); } @@ -693,162 +677,154 @@ radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx, LLVMValueRef va } static void -handle_vs_input_decl(struct radv_shader_context *ctx, struct nir_variable *variable) +load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTypeRef dest_type, + LLVMValueRef out[4]) { LLVMValueRef t_list_ptr = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_buffers); LLVMValueRef t_offset; LLVMValueRef t_list; LLVMValueRef input; LLVMValueRef buffer_index; - unsigned attrib_count = glsl_count_attribute_slots(variable->type, true); + unsigned attrib_index = driver_location - VERT_ATTRIB_GENERIC0; + unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[attrib_index]; + unsigned data_format = attrib_format & 0x0f; + unsigned num_format = (attrib_format >> 4) & 0x07; + bool is_float = + num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT; + uint8_t input_usage_mask = + ctx->shader_info->vs.input_usage_mask[driver_location]; + unsigned num_input_channels = util_last_bit(input_usage_mask); - enum glsl_base_type type = glsl_get_base_type(variable->type); - for (unsigned i = 0; i < attrib_count; ++i) { - LLVMValueRef output[4]; - unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0; - unsigned attrib_format = ctx->args->options->key.vs.vertex_attribute_formats[attrib_index]; - unsigned data_format = attrib_format & 0x0f; - unsigned num_format = (attrib_format 
>> 4) & 0x07; - bool is_float = - num_format != V_008F0C_BUF_NUM_FORMAT_UINT && num_format != V_008F0C_BUF_NUM_FORMAT_SINT; - uint8_t input_usage_mask = - ctx->args->shader_info->vs.input_usage_mask[variable->data.location + i]; - unsigned num_input_channels = util_last_bit(input_usage_mask); + if (ctx->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { + uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index]; - if (num_input_channels == 0) - continue; + if (divisor) { + buffer_index = ctx->abi.instance_id; - if (ctx->args->options->key.vs.instance_rate_inputs & (1u << attrib_index)) { - uint32_t divisor = ctx->args->options->key.vs.instance_rate_divisors[attrib_index]; - - if (divisor) { - buffer_index = ctx->abi.instance_id; - - if (divisor != 1) { - buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index, - LLVMConstInt(ctx->ac.i32, divisor, 0), ""); - } - } else { - buffer_index = ctx->ac.i32_0; + if (divisor != 1) { + buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index, + LLVMConstInt(ctx->ac.i32, divisor, 0), ""); } - - buffer_index = LLVMBuildAdd( - ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, ""); } else { - buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, - ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), ""); + buffer_index = ctx->ac.i32_0; } - const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format); + buffer_index = LLVMBuildAdd( + ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->ac.start_instance), buffer_index, ""); + } else { + buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, + ac_get_arg(&ctx->ac, ctx->args->ac.base_vertex), ""); + } - /* Adjust the number of channels to load based on the vertex - * attribute format. - */ - unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels); - unsigned attrib_binding = ctx->args->options->key.vs.vertex_attribute_bindings[attrib_index]; - unsigned attrib_offset = ctx->args->options->key.vs.vertex_attribute_offsets[attrib_index]; - unsigned attrib_stride = ctx->args->options->key.vs.vertex_attribute_strides[attrib_index]; - unsigned alpha_adjust = ctx->args->options->key.vs.alpha_adjust[attrib_index]; + const struct ac_data_format_info *vtx_info = ac_get_data_format_info(data_format); - if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { - /* Always load, at least, 3 channels for formats that - * need to be shuffled because X<->Z. - */ - num_channels = MAX2(num_channels, 3); - } + /* Adjust the number of channels to load based on the vertex attribute format. */ + unsigned num_channels = MIN2(num_input_channels, vtx_info->num_channels); + unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[attrib_index]; + unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index]; + unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index]; + unsigned alpha_adjust = ctx->options->key.vs.vertex_alpha_adjust[attrib_index]; - unsigned desc_index = - ctx->args->shader_info->vs.use_per_attribute_vb_descs ? 
attrib_index : attrib_binding; - desc_index = util_bitcount(ctx->args->shader_info->vs.vb_desc_usage_mask & - u_bit_consecutive(0, desc_index)); - t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false); - t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); + if (ctx->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) { + /* Always load, at least, 3 channels for formats that need to be shuffled because X<->Z. */ + num_channels = MAX2(num_channels, 3); + } - /* Always split typed vertex buffer loads on GFX6 and GFX10+ - * to avoid any alignment issues that triggers memory - * violations and eventually a GPU hang. This can happen if - * the stride (static or dynamic) is unaligned and also if the - * VBO offset is aligned to a scalar (eg. stride is 8 and VBO - * offset is 2 for R16G16B16A16_SNORM). - */ - if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) { - unsigned chan_format = vtx_info->chan_format; - LLVMValueRef values[4]; + unsigned desc_index = + ctx->shader_info->vs.use_per_attribute_vb_descs ? attrib_index : attrib_binding; + desc_index = util_bitcount(ctx->shader_info->vs.vb_desc_usage_mask & + u_bit_consecutive(0, desc_index)); + t_offset = LLVMConstInt(ctx->ac.i32, desc_index, false); + t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset); - assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10); + /* Always split typed vertex buffer loads on GFX6 and GFX10+ to avoid any alignment issues that + * trigger memory violations and eventually a GPU hang. This can happen if the stride (static or + * dynamic) is unaligned and also if the VBO offset is aligned to a scalar (e.g. stride is 8 and + * VBO offset is 2 for R16G16B16A16_SNORM). + */ + if (ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10) { + unsigned chan_format = vtx_info->chan_format; + LLVMValueRef values[4]; - for (unsigned chan = 0; chan < num_channels; chan++) { - unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size; - LLVMValueRef chan_index = buffer_index; + assert(ctx->ac.chip_class == GFX6 || ctx->ac.chip_class >= GFX10); - if (attrib_stride != 0 && chan_offset > attrib_stride) { - LLVMValueRef buffer_offset = - LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false); + for (unsigned chan = 0; chan < num_channels; chan++) { + unsigned chan_offset = attrib_offset + chan * vtx_info->chan_byte_size; + LLVMValueRef chan_index = buffer_index; - chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, ""); - - chan_offset = chan_offset % attrib_stride; - } - - values[chan] = ac_build_struct_tbuffer_load( - &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false), - ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true); - } - - input = ac_build_gather_values(&ctx->ac, values, num_channels); - } else { - if (attrib_stride != 0 && attrib_offset > attrib_stride) { + if (attrib_stride != 0 && chan_offset > attrib_stride) { LLVMValueRef buffer_offset = - LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false); + LLVMConstInt(ctx->ac.i32, chan_offset / attrib_stride, false); - buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, ""); + chan_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, ""); - attrib_offset = attrib_offset % attrib_stride; + chan_offset = chan_offset % attrib_stride; } - input = ac_build_struct_tbuffer_load( - &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false), - ctx->ac.i32_0, ctx->ac.i32_0, num_channels, 
data_format, num_format, 0, true); + values[chan] = ac_build_struct_tbuffer_load( + &ctx->ac, t_list, chan_index, LLVMConstInt(ctx->ac.i32, chan_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, 1, chan_format, num_format, 0, true); } - if (ctx->args->options->key.vs.post_shuffle & (1 << attrib_index)) { - LLVMValueRef c[4]; - c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); - c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); - c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0); - c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3); + input = ac_build_gather_values(&ctx->ac, values, num_channels); + } else { + if (attrib_stride != 0 && attrib_offset > attrib_stride) { + LLVMValueRef buffer_offset = + LLVMConstInt(ctx->ac.i32, attrib_offset / attrib_stride, false); - input = ac_build_gather_values(&ctx->ac, c, 4); + buffer_index = LLVMBuildAdd(ctx->ac.builder, buffer_index, buffer_offset, ""); + + attrib_offset = attrib_offset % attrib_stride; } - input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float); + input = ac_build_struct_tbuffer_load( + &ctx->ac, t_list, buffer_index, LLVMConstInt(ctx->ac.i32, attrib_offset, false), + ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true); + } - for (unsigned chan = 0; chan < 4; chan++) { - LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, ""); - if (type == GLSL_TYPE_FLOAT16) { - output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, ""); - output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, ""); - } + if (ctx->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) { + LLVMValueRef c[4]; + c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); + c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); + c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0); + c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3); + + input = ac_build_gather_values(&ctx->ac, c, 4); + } + + input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float); + + for (unsigned chan = 0; chan < 4; chan++) { + LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); + out[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, ""); + if (dest_type == ctx->ac.i16 && is_float) { + out[chan] = LLVMBuildBitCast(ctx->ac.builder, out[chan], ctx->ac.f32, ""); + out[chan] = LLVMBuildFPTrunc(ctx->ac.builder, out[chan], ctx->ac.f16, ""); } + } - output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]); + out[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, out[3]); - for (unsigned chan = 0; chan < 4; chan++) { - output[chan] = ac_to_integer(&ctx->ac, output[chan]); - if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16) - output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, ""); - - ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan]; - } + for (unsigned chan = 0; chan < 4; chan++) { + out[chan] = ac_to_integer(&ctx->ac, out[chan]); + if (dest_type == ctx->ac.i16 && !is_float) + out[chan] = LLVMBuildTrunc(ctx->ac.builder, out[chan], ctx->ac.i16, ""); } } -static void -handle_vs_inputs(struct radv_shader_context *ctx, struct nir_shader *nir) +static LLVMValueRef +radv_load_vs_inputs(struct ac_shader_abi *abi, unsigned driver_location, unsigned component, + unsigned num_components, unsigned vertex_index, LLVMTypeRef type) { - nir_foreach_shader_in_variable (variable, nir) - handle_vs_input_decl(ctx, variable); + struct radv_shader_context *ctx 
= radv_shader_context_from_abi(abi); + LLVMValueRef values[4]; + + load_vs_input(ctx, driver_location, type, values); + + for (unsigned i = 0; i < 4; i++) + values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, ""); + + return ac_build_varying_gather_values(&ctx->ac, values, num_components, component); } static void @@ -930,9 +906,9 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values, bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2; if (ctx->stage == MESA_SHADER_FRAGMENT) { unsigned index = target - V_008DFC_SQ_EXP_MRT; - unsigned col_format = (ctx->args->options->key.fs.col_format >> (4 * index)) & 0xf; - bool is_int8 = (ctx->args->options->key.fs.is_int8 >> index) & 1; - bool is_int10 = (ctx->args->options->key.fs.is_int10 >> index) & 1; + unsigned col_format = (ctx->options->key.ps.col_format >> (4 * index)) & 0xf; + bool is_int8 = (ctx->options->key.ps.is_int8 >> index) & 1; + bool is_int10 = (ctx->options->key.ps.is_int10 >> index) & 1; LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL; LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits, @@ -1015,7 +991,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, LLVMValueRef *values, /* Replace NaN by zero (only 32-bit) to fix game bugs if * requested. */ - if (ctx->args->options->enable_mrt_output_nan_fixup && !is_16bit && + if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit && (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR || col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR || col_format == V_028714_SPI_SHADER_FP16_ABGR)) { @@ -1171,7 +1147,7 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); for (i = 0; i < 4; i++) { - uint16_t stride = ctx->args->shader_info->so.strides[i]; + uint16_t stride = ctx->shader_info->so.strides[i]; if (!stride) continue; @@ -1190,9 +1166,9 @@ radv_emit_streamout(struct radv_shader_context *ctx, unsigned stream) } /* Write streamout data. 
*/ - for (i = 0; i < ctx->args->shader_info->so.num_outputs; i++) { + for (i = 0; i < ctx->shader_info->so.num_outputs; i++) { struct radv_shader_output_values shader_out = {0}; - struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i]; + struct radv_stream_output *output = &ctx->shader_info->so.outputs[i]; if (stream != output->stream) continue; @@ -1212,8 +1188,6 @@ radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_out unsigned noutput, struct radv_vs_output_info *outinfo, bool export_clip_dists) { - unsigned param_count = 0; - for (unsigned i = 0; i < noutput; i++) { unsigned slot_name = outputs[i].slot_name; unsigned usage_mask = outputs[i].usage_mask; @@ -1227,13 +1201,9 @@ radv_build_param_exports(struct radv_shader_context *ctx, struct radv_shader_out !export_clip_dists) continue; - radv_export_param(ctx, param_count, outputs[i].values, usage_mask); - - assert(i < ARRAY_SIZE(outinfo->vs_output_param_offset)); - outinfo->vs_output_param_offset[slot_name] = param_count++; + radv_export_param(ctx, outinfo->vs_output_param_offset[slot_name], outputs[i].values, + usage_mask); } - - outinfo->param_exports = param_count; } /* Generate export instructions for hardware VS shader stage or NGG GS stage @@ -1292,7 +1262,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_v } bool writes_primitive_shading_rate = outinfo->writes_primitive_shading_rate || - ctx->args->options->force_vrs_rates; + ctx->options->force_vrs_rates; if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index || writes_primitive_shading_rate) { @@ -1313,7 +1283,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_v if (outinfo->writes_layer == true) pos_args[1].out[2] = layer_value; if (outinfo->writes_viewport_index == true) { - if (ctx->args->options->chip_class >= GFX9) { + if (ctx->options->chip_class >= GFX9) { /* GFX9 has the layer in out.z[10:0] and the viewport * index in out.z[19:16]. */ @@ -1332,7 +1302,7 @@ if (outinfo->writes_primitive_shading_rate) { pos_args[1].out[1] = primitive_shading_rate; - } else if (ctx->args->options->force_vrs_rates) { + } else if (ctx->options->force_vrs_rates) { /* Bits [2:3] = VRS rate X * Bits [4:5] = VRS rate Y * @@ -1344,7 +1314,7 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_v * * Sample shading can't go above 8 samples, so both numbers can't be -2 at the same time. */ - LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->args->options->force_vrs_rates, false); + LLVMValueRef rates = LLVMConstInt(ctx->ac.i32, ctx->options->force_vrs_rates, false); LLVMValueRef cond; LLVMValueRef v; @@ -1356,11 +1326,6 @@ radv_llvm_export_vs(struct radv_shader_context *ctx, struct radv_shader_output_v } } - for (i = 0; i < 4; i++) { - if (pos_args[i].out[0]) - outinfo->pos_exports++; - } - /* GFX10 skip POS0 exports if EXEC=0 and DONE=0, causing a hang. * Setting valid_mask=1 prevents it and has no other effect. 
*/ @@ -1393,7 +1358,7 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, boo struct radv_shader_output_values *outputs; unsigned noutput = 0; - if (ctx->args->options->key.has_multiview_view_index) { + if (ctx->options->key.has_multiview_view_index) { LLVMValueRef *tmp_out = &ctx->abi.outputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]; if (!*tmp_out) { for (unsigned i = 0; i < 4; ++i) @@ -1406,12 +1371,7 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, boo ctx->output_mask |= 1ull << VARYING_SLOT_LAYER; } - memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(outinfo->vs_output_param_offset)); - outinfo->pos_exports = 0; - - if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs && - !ctx->args->is_gs_copy_shader) { + if (ctx->shader_info->so.num_outputs && !ctx->args->is_gs_copy_shader) { /* The GS copy shader emission already emits streamout. */ radv_emit_streamout(ctx, 0); } @@ -1428,12 +1388,12 @@ handle_vs_outputs_post(struct radv_shader_context *ctx, bool export_prim_id, boo outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; if (ctx->stage == MESA_SHADER_VERTEX && !ctx->args->is_gs_copy_shader) { - outputs[noutput].usage_mask = ctx->args->shader_info->vs.output_usage_mask[i]; + outputs[noutput].usage_mask = ctx->shader_info->vs.output_usage_mask[i]; } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { - outputs[noutput].usage_mask = ctx->args->shader_info->tes.output_usage_mask[i]; + outputs[noutput].usage_mask = ctx->shader_info->tes.output_usage_mask[i]; } else { assert(ctx->args->is_gs_copy_shader); - outputs[noutput].usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; + outputs[noutput].usage_mask = ctx->shader_info->gs.output_usage_mask[i]; } for (unsigned j = 0; j < 4; j++) { @@ -1500,19 +1460,12 @@ ngg_get_prim_cnt(struct radv_shader_context *ctx) false); } -static LLVMValueRef -ngg_get_ordered_id(struct radv_shader_context *ctx) -{ - return ac_build_bfe(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_tg_info), ctx->ac.i32_0, - LLVMConstInt(ctx->ac.i32, 12, false), false); -} - static LLVMValueRef ngg_gs_get_vertex_storage(struct radv_shader_context *ctx) { unsigned num_outputs = util_bitcount64(ctx->output_mask); - if (ctx->args->options->key.has_multiview_view_index) + if (ctx->options->key.has_multiview_view_index) num_outputs++; LLVMTypeRef elements[2] = { @@ -1608,475 +1561,6 @@ ngg_gs_get_emit_primflag_ptr(struct radv_shader_context *ctx, LLVMValueRef verte return LLVMBuildGEP(ctx->ac.builder, vertexptr, gep_idx, 3, ""); } -static struct radv_stream_output * -radv_get_stream_output_by_loc(struct radv_streamout_info *so, unsigned location) -{ - for (unsigned i = 0; i < so->num_outputs; ++i) { - if (so->outputs[i].location == location) - return &so->outputs[i]; - } - - return NULL; -} - -static void -build_streamout_vertex(struct radv_shader_context *ctx, LLVMValueRef *so_buffer, - LLVMValueRef *wg_offset_dw, unsigned stream, LLVMValueRef offset_vtx, - LLVMValueRef vertexptr) -{ - struct radv_streamout_info *so = &ctx->args->shader_info->so; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef offset[4] = {0}; - LLVMValueRef tmp; - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (!wg_offset_dw[buffer]) - continue; - - tmp = LLVMBuildMul(builder, offset_vtx, LLVMConstInt(ctx->ac.i32, so->strides[buffer], false), - ""); - tmp = LLVMBuildAdd(builder, wg_offset_dw[buffer], tmp, ""); - offset[buffer] = LLVMBuildShl(builder, tmp, 
LLVMConstInt(ctx->ac.i32, 2, false), ""); - } - - if (ctx->stage == MESA_SHADER_GEOMETRY) { - struct radv_shader_output_values outputs[AC_LLVM_MAX_OUTPUTS]; - unsigned noutput = 0; - unsigned out_idx = 0; - - for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; - uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i]; - - if (!(ctx->output_mask & (1ull << i)) || output_stream != stream) - continue; - - outputs[noutput].slot_name = i; - outputs[noutput].slot_index = i == VARYING_SLOT_CLIP_DIST1; - outputs[noutput].usage_mask = output_usage_mask; - - int length = util_last_bit(output_usage_mask); - - for (unsigned j = 0; j < length; j++, out_idx++) { - if (!(output_usage_mask & (1 << j))) - continue; - - tmp = ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, out_idx, false)); - outputs[noutput].values[j] = LLVMBuildLoad(builder, tmp, ""); - } - - for (unsigned j = length; j < 4; j++) - outputs[noutput].values[j] = LLVMGetUndef(ctx->ac.f32); - - noutput++; - } - - for (unsigned i = 0; i < noutput; i++) { - struct radv_stream_output *output = - radv_get_stream_output_by_loc(so, outputs[i].slot_name); - - if (!output || output->stream != stream) - continue; - - struct radv_shader_output_values out = {0}; - - for (unsigned j = 0; j < 4; j++) { - out.values[j] = outputs[i].values[j]; - } - - radv_emit_stream_output(ctx, so_buffer, offset, output, &out); - } - } else { - for (unsigned i = 0; i < so->num_outputs; ++i) { - struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i]; - - if (stream != output->stream) - continue; - - struct radv_shader_output_values out = {0}; - - for (unsigned comp = 0; comp < 4; comp++) { - if (!(output->component_mask & (1 << comp))) - continue; - - tmp = - ac_build_gep0(&ctx->ac, vertexptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); - out.values[comp] = LLVMBuildLoad(builder, tmp, ""); - } - - radv_emit_stream_output(ctx, so_buffer, offset, output, &out); - } - } -} - -struct ngg_streamout { - LLVMValueRef num_vertices; - - /* per-thread data */ - LLVMValueRef prim_enable[4]; /* i1 per stream */ - LLVMValueRef vertices[3]; /* [N x i32] addrspace(LDS)* */ - - /* Output */ - LLVMValueRef emit[4]; /* per-stream emitted primitives (only valid for used streams) */ -}; - -/** - * Build streamout logic. - * - * Implies a barrier. - * - * Writes number of emitted primitives to gs_ngg_scratch[4:7]. - * - * Clobbers gs_ngg_scratch[8:]. - */ -static void -build_streamout(struct radv_shader_context *ctx, struct ngg_streamout *nggso) -{ - struct radv_streamout_info *so = &ctx->args->shader_info->so; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef buf_ptr = ac_get_arg(&ctx->ac, ctx->args->streamout_buffers); - LLVMValueRef tid = get_thread_id_in_tg(ctx); - LLVMValueRef cond, tmp, tmp2; - LLVMValueRef i32_2 = LLVMConstInt(ctx->ac.i32, 2, false); - LLVMValueRef i32_4 = LLVMConstInt(ctx->ac.i32, 4, false); - LLVMValueRef i32_8 = LLVMConstInt(ctx->ac.i32, 8, false); - LLVMValueRef so_buffer[4] = {0}; - unsigned max_num_vertices = 1 + (nggso->vertices[1] ? 1 : 0) + (nggso->vertices[2] ? 1 : 0); - LLVMValueRef prim_stride_dw[4] = {0}; - LLVMValueRef prim_stride_dw_vgpr = LLVMGetUndef(ctx->ac.i32); - int stream_for_buffer[4] = {-1, -1, -1, -1}; - unsigned bufmask_for_stream[4] = {0}; - bool isgs = ctx->stage == MESA_SHADER_GEOMETRY; - unsigned scratch_emit_base = isgs ? 4 : 0; - LLVMValueRef scratch_emit_basev = isgs ? 
i32_4 : ctx->ac.i32_0; - unsigned scratch_offset_base = isgs ? 8 : 4; - LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4; - - ac_llvm_add_target_dep_function_attr(ctx->main_function, "amdgpu-gds-size", 256); - - /* Determine the mapping of streamout buffers to vertex streams. */ - for (unsigned i = 0; i < so->num_outputs; ++i) { - unsigned buf = so->outputs[i].buffer; - unsigned stream = so->outputs[i].stream; - assert(stream_for_buffer[buf] < 0 || stream_for_buffer[buf] == stream); - stream_for_buffer[buf] = stream; - bufmask_for_stream[stream] |= 1 << buf; - } - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == -1) - continue; - - assert(so->strides[buffer]); - - LLVMValueRef stride_for_buffer = LLVMConstInt(ctx->ac.i32, so->strides[buffer], false); - prim_stride_dw[buffer] = LLVMBuildMul(builder, stride_for_buffer, nggso->num_vertices, ""); - prim_stride_dw_vgpr = - ac_build_writelane(&ctx->ac, prim_stride_dw_vgpr, prim_stride_dw[buffer], - LLVMConstInt(ctx->ac.i32, buffer, false)); - - LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, buffer, false); - so_buffer[buffer] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); - } - - cond = LLVMBuildICmp(builder, LLVMIntEQ, get_wave_id_in_tg(ctx), ctx->ac.i32_0, ""); - ac_build_ifcc(&ctx->ac, cond, 5200); - { - LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS); - LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, ""); - - /* Advance the streamout offsets in GDS. */ - LLVMValueRef offsets_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - LLVMValueRef generated_by_stream_vgpr = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, ""); - - cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); - ac_build_ifcc(&ctx->ac, cond, 5210); - { - /* Fetch the number of generated primitives and store - * it in GDS for later use. - */ - if (isgs) { - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid); - tmp = LLVMBuildLoad(builder, tmp, ""); - } else { - tmp = ac_build_writelane(&ctx->ac, ctx->ac.i32_0, ngg_get_prim_cnt(ctx), ctx->ac.i32_0); - } - LLVMBuildStore(builder, tmp, generated_by_stream_vgpr); - - unsigned swizzle[4]; - int unused_stream = -1; - for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) { - unused_stream = stream; - break; - } - } - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] >= 0) { - swizzle[buffer] = stream_for_buffer[buffer]; - } else { - assert(unused_stream >= 0); - swizzle[buffer] = unused_stream; - } - } - - tmp = ac_build_quad_swizzle(&ctx->ac, tmp, swizzle[0], swizzle[1], swizzle[2], swizzle[3]); - tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - - LLVMValueRef args[] = { - LLVMBuildIntToPtr(builder, ngg_get_ordered_id(ctx), gdsptr, ""), - tmp, - ctx->ac.i32_0, // ordering - ctx->ac.i32_0, // scope - ctx->ac.i1false, // isVolatile - LLVMConstInt(ctx->ac.i32, 4 << 24, false), // OA index - ctx->ac.i1true, // wave release - ctx->ac.i1true, // wave done - }; - - tmp = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.ds.ordered.add", ctx->ac.i32, args, - ARRAY_SIZE(args), 0); - - /* Keep offsets in a VGPR for quick retrieval via readlane by - * the first wave for bounds checking, and also store in LDS - * for retrieval by all waves later. 
*/ - LLVMBuildStore(builder, tmp, offsets_vgpr); - - tmp2 = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_offset_basev, ""); - tmp2 = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp2); - LLVMBuildStore(builder, tmp, tmp2); - } - ac_build_endif(&ctx->ac, 5210); - - /* Determine the max emit per buffer. This is done via the SALU, in part - * because LLVM can't generate divide-by-multiply if we try to do this - * via VALU with one lane per buffer. - */ - LLVMValueRef max_emit[4] = {0}; - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == -1) - continue; - - /* Compute the streamout buffer size in DWORD. */ - LLVMValueRef bufsize_dw = LLVMBuildLShr( - builder, LLVMBuildExtractElement(builder, so_buffer[buffer], i32_2, ""), i32_2, ""); - - /* Load the streamout buffer offset from GDS. */ - tmp = LLVMBuildLoad(builder, offsets_vgpr, ""); - LLVMValueRef offset_dw = - ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, buffer, false)); - - /* Compute the remaining size to emit. */ - LLVMValueRef remaining_dw = LLVMBuildSub(builder, bufsize_dw, offset_dw, ""); - tmp = LLVMBuildUDiv(builder, remaining_dw, prim_stride_dw[buffer], ""); - - cond = LLVMBuildICmp(builder, LLVMIntULT, bufsize_dw, offset_dw, ""); - max_emit[buffer] = LLVMBuildSelect(builder, cond, ctx->ac.i32_0, tmp, ""); - } - - /* Determine the number of emitted primitives per stream and fixup the - * GDS counter if necessary. - * - * This is complicated by the fact that a single stream can emit to - * multiple buffers (but luckily not vice versa). - */ - LLVMValueRef emit_vgpr = ctx->ac.i32_0; - - for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) - continue; - - /* Load the number of generated primitives from GDS and - * determine that number for the given stream. - */ - tmp = LLVMBuildLoad(builder, generated_by_stream_vgpr, ""); - LLVMValueRef generated = - ac_build_readlane(&ctx->ac, tmp, LLVMConstInt(ctx->ac.i32, stream, false)); - - /* Compute the number of emitted primitives. */ - LLVMValueRef emit = generated; - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] == stream) - emit = ac_build_umin(&ctx->ac, emit, max_emit[buffer]); - } - - /* Store the number of emitted primitives for that - * stream. - */ - emit_vgpr = - ac_build_writelane(&ctx->ac, emit_vgpr, emit, LLVMConstInt(ctx->ac.i32, stream, false)); - - /* Fixup the offset using a plain GDS atomic if we overflowed. */ - cond = LLVMBuildICmp(builder, LLVMIntULT, emit, generated, ""); - ac_build_ifcc(&ctx->ac, cond, 5221); /* scalar branch */ - tmp = LLVMBuildLShr(builder, LLVMConstInt(ctx->ac.i32, bufmask_for_stream[stream], false), - ac_get_thread_id(&ctx->ac), ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - ac_build_ifcc(&ctx->ac, tmp, 5222); - { - tmp = LLVMBuildSub(builder, generated, emit, ""); - tmp = LLVMBuildMul(builder, tmp, prim_stride_dw_vgpr, ""); - tmp2 = LLVMBuildGEP(builder, gdsbase, &tid, 1, ""); - LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpSub, tmp2, tmp, - LLVMAtomicOrderingMonotonic, false); - } - ac_build_endif(&ctx->ac, 5222); - ac_build_endif(&ctx->ac, 5221); - } - - /* Store the number of emitted primitives to LDS for later use. 
*/ - cond = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), i32_4, ""); - ac_build_ifcc(&ctx->ac, cond, 5225); - { - tmp = LLVMBuildAdd(builder, ac_get_thread_id(&ctx->ac), scratch_emit_basev, ""); - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tmp); - LLVMBuildStore(builder, emit_vgpr, tmp); - } - ac_build_endif(&ctx->ac, 5225); - } - ac_build_endif(&ctx->ac, 5200); - - /* Determine the workgroup-relative per-thread / primitive offset into - * the streamout buffers */ - struct ac_wg_scan primemit_scan[4] = {0}; - - if (isgs) { - for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) - continue; - - primemit_scan[stream].enable_exclusive = true; - primemit_scan[stream].op = nir_op_iadd; - primemit_scan[stream].src = nggso->prim_enable[stream]; - primemit_scan[stream].scratch = ac_build_gep0( - &ctx->ac, ctx->gs_ngg_scratch, LLVMConstInt(ctx->ac.i32, 12 + 8 * stream, false)); - primemit_scan[stream].waveidx = get_wave_id_in_tg(ctx); - primemit_scan[stream].numwaves = get_tgsize(ctx); - primemit_scan[stream].maxwaves = 8; - ac_build_wg_scan_top(&ctx->ac, &primemit_scan[stream]); - } - } - - ac_build_s_barrier(&ctx->ac); - - /* Fetch the per-buffer offsets and per-stream emit counts in all waves. */ - LLVMValueRef wgoffset_dw[4] = {0}; - - { - LLVMValueRef scratch_vgpr; - - tmp = ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, ac_get_thread_id(&ctx->ac)); - scratch_vgpr = LLVMBuildLoad(builder, tmp, ""); - - for (unsigned buffer = 0; buffer < 4; ++buffer) { - if (stream_for_buffer[buffer] >= 0) { - wgoffset_dw[buffer] = - ac_build_readlane(&ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->ac.i32, scratch_offset_base + buffer, false)); - } - } - - for (unsigned stream = 0; stream < 4; ++stream) { - if (ctx->args->shader_info->gs.num_stream_output_components[stream]) { - nggso->emit[stream] = - ac_build_readlane(&ctx->ac, scratch_vgpr, - LLVMConstInt(ctx->ac.i32, scratch_emit_base + stream, false)); - } - } - } - - /* Write out primitive data */ - for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) - continue; - - if (isgs) { - ac_build_wg_scan_bottom(&ctx->ac, &primemit_scan[stream]); - } else { - primemit_scan[stream].result_exclusive = tid; - } - - cond = LLVMBuildICmp(builder, LLVMIntULT, primemit_scan[stream].result_exclusive, - nggso->emit[stream], ""); - cond = LLVMBuildAnd(builder, cond, nggso->prim_enable[stream], ""); - ac_build_ifcc(&ctx->ac, cond, 5240); - { - LLVMValueRef offset_vtx = - LLVMBuildMul(builder, primemit_scan[stream].result_exclusive, nggso->num_vertices, ""); - - for (unsigned i = 0; i < max_num_vertices; ++i) { - cond = LLVMBuildICmp(builder, LLVMIntULT, LLVMConstInt(ctx->ac.i32, i, false), - nggso->num_vertices, ""); - ac_build_ifcc(&ctx->ac, cond, 5241); - build_streamout_vertex(ctx, so_buffer, wgoffset_dw, stream, offset_vtx, - nggso->vertices[i]); - ac_build_endif(&ctx->ac, 5241); - offset_vtx = LLVMBuildAdd(builder, offset_vtx, ctx->ac.i32_1, ""); - } - } - ac_build_endif(&ctx->ac, 5240); - } -} - -static unsigned -ngg_nogs_vertex_size(struct radv_shader_context *ctx) -{ - unsigned lds_vertex_size = 0; - - if (ctx->args->shader_info->so.num_outputs) - lds_vertex_size = 4 * ctx->args->shader_info->so.num_outputs + 1; - - return lds_vertex_size; -} - -/** - * Returns an `[N x i32] addrspace(LDS)*` pointing at contiguous LDS storage - * for the vertex outputs. 
- */ -static LLVMValueRef -ngg_nogs_vertex_ptr(struct radv_shader_context *ctx, LLVMValueRef vtxid) -{ - /* The extra dword is used to avoid LDS bank conflicts. */ - unsigned vertex_size = ngg_nogs_vertex_size(ctx); - LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, vertex_size); - LLVMTypeRef pai32 = LLVMPointerType(ai32, AC_ADDR_SPACE_LDS); - LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, ctx->esgs_ring, pai32, ""); - return LLVMBuildGEP(ctx->ac.builder, tmp, &vtxid, 1, ""); -} - -static void -handle_ngg_outputs_post_1(struct radv_shader_context *ctx) -{ - struct radv_streamout_info *so = &ctx->args->shader_info->so; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vertex_ptr = NULL; - LLVMValueRef tmp, tmp2; - - assert((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) && - !ctx->args->is_gs_copy_shader); - - if (!ctx->args->shader_info->so.num_outputs) - return; - - vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); - - for (unsigned i = 0; i < so->num_outputs; ++i) { - struct radv_stream_output *output = &ctx->args->shader_info->so.outputs[i]; - - unsigned loc = output->location; - - for (unsigned comp = 0; comp < 4; comp++) { - if (!(output->component_mask & (1 << comp))) - continue; - - tmp = ac_build_gep0(&ctx->ac, vertex_ptr, LLVMConstInt(ctx->ac.i32, 4 * i + comp, false)); - tmp2 = LLVMBuildLoad(builder, ctx->abi.outputs[4 * loc + comp], ""); - tmp2 = ac_to_integer(&ctx->ac, tmp2); - LLVMBuildStore(builder, tmp2, tmp); - } - } -} - static void handle_ngg_outputs_post_2(struct radv_shader_context *ctx) { @@ -2097,17 +1581,13 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) LLVMValueRef vtxindex[] = { ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 0, 16), ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]), 16, 16), - ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[2]), 0, 16), + ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[1]), 0, 16), }; /* Determine the number of vertices per primitive. */ unsigned num_vertices; - LLVMValueRef num_vertices_val; if (ctx->stage == MESA_SHADER_VERTEX) { - LLVMValueRef outprim_val = - LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false); - num_vertices_val = LLVMBuildAdd(builder, outprim_val, ctx->ac.i32_1, ""); num_vertices = 3; /* TODO: optimize for points & lines */ } else { assert(ctx->stage == MESA_SHADER_TESS_EVAL); @@ -2118,37 +1598,21 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) num_vertices = 2; else num_vertices = 3; - - num_vertices_val = LLVMConstInt(ctx->ac.i32, num_vertices, false); - } - - /* Streamout */ - if (ctx->args->shader_info->so.num_outputs) { - struct ngg_streamout nggso = {0}; - - nggso.num_vertices = num_vertices_val; - nggso.prim_enable[0] = is_gs_thread; - - for (unsigned i = 0; i < num_vertices; ++i) - nggso.vertices[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); - - build_streamout(ctx, &nggso); } /* Copy Primitive IDs from GS threads to the LDS address corresponding * to the ES thread of the provoking vertex. 
*/ - if (ctx->stage == MESA_SHADER_VERTEX && ctx->args->options->key.vs_common_out.export_prim_id) { - if (ctx->args->shader_info->so.num_outputs) - ac_build_s_barrier(&ctx->ac); - + if (ctx->stage == MESA_SHADER_VERTEX && ctx->shader_info->vs.outinfo.export_prim_id) { ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); LLVMValueRef provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, 0, false); /* For provoking vertex last mode, use num_vtx_in_prim - 1. */ - if (ctx->args->options->key.vs.provoking_vtx_last) - provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, ctx->args->options->key.vs.outprim, false); + if (ctx->options->key.vs.provoking_vtx_last) { + uint8_t outprim = si_conv_prim_to_gs_out(ctx->options->key.vs.topology); + provoking_vtx_in_prim = LLVMConstInt(ctx->ac.i32, outprim, false); + } /* provoking_vtx_index = vtxindex[provoking_vtx_in_prim]; */ LLVMValueRef indices = ac_build_gather_values(&ctx->ac, vtxindex, 3); @@ -2179,18 +1643,13 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) { struct ac_ngg_prim prim = {0}; - if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) { + if (ctx->shader_info->is_ngg_passthrough) { prim.passthrough = ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[0]); } else { prim.num_vertices = num_vertices; prim.isnull = ctx->ac.i1false; + prim.edgeflags = ctx->ac.i32_0; memcpy(prim.index, vtxindex, sizeof(vtxindex[0]) * 3); - - for (unsigned i = 0; i < num_vertices; ++i) { - tmp = LLVMBuildLShr(builder, ac_get_arg(&ctx->ac, ctx->args->ac.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + i, false), ""); - prim.edgeflag[i] = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - } } ac_build_export_prim(&ctx->ac, &prim); @@ -2201,16 +1660,14 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) ac_build_ifcc(&ctx->ac, is_es_thread, 6002); { struct radv_vs_output_info *outinfo = ctx->stage == MESA_SHADER_TESS_EVAL - ? &ctx->args->shader_info->tes.outinfo - : &ctx->args->shader_info->vs.outinfo; + ? &ctx->shader_info->tes.outinfo + : &ctx->shader_info->vs.outinfo; /* Exporting the primitive ID is handled below. 
*/ /* TODO: use the new VS export path */ - handle_vs_outputs_post(ctx, false, ctx->args->options->key.vs_common_out.export_clip_dists, - outinfo); + handle_vs_outputs_post(ctx, false, outinfo->export_clip_dists, outinfo); - if (ctx->args->options->key.vs_common_out.export_prim_id) { - unsigned param_count = outinfo->param_exports; + if (outinfo->export_prim_id) { LLVMValueRef values[4]; if (ctx->stage == MESA_SHADER_VERTEX) { @@ -2228,10 +1685,8 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx) for (unsigned j = 1; j < 4; j++) values[j] = ctx->ac.f32_0; - radv_export_param(ctx, param_count, values, 0x1); - - outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count++; - outinfo->param_exports = param_count; + radv_export_param(ctx, outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID], values, + 0x1); } } ac_build_endif(&ctx->ac, 6002); @@ -2283,7 +1738,7 @@ gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) for (unsigned stream = 0; stream < 4; ++stream) { unsigned num_components; - num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; + num_components = ctx->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -2311,7 +1766,7 @@ gfx10_ngg_gs_emit_epilogue_1(struct radv_shader_context *ctx) for (unsigned stream = 0; stream < 4; ++stream) { unsigned num_components; - num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; + num_components = ctx->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -2343,33 +1798,6 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) const LLVMValueRef tid = get_thread_id_in_tg(ctx); LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx); - /* Streamout */ - if (ctx->args->shader_info->so.num_outputs) { - struct ngg_streamout nggso = {0}; - - nggso.num_vertices = LLVMConstInt(ctx->ac.i32, verts_per_prim, false); - - LLVMValueRef vertexptr = ngg_gs_vertex_ptr(ctx, tid); - for (unsigned stream = 0; stream < 4; ++stream) { - if (!ctx->args->shader_info->gs.num_stream_output_components[stream]) - continue; - - tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, vertexptr, stream), ""); - tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); - tmp2 = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, ""); - nggso.prim_enable[stream] = LLVMBuildAnd(builder, tmp, tmp2, ""); - } - - for (unsigned i = 0; i < verts_per_prim; ++i) { - tmp = LLVMBuildSub(builder, tid, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), - ""); - tmp = ngg_gs_vertex_ptr(ctx, tmp); - nggso.vertices[i] = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0); - } - - build_streamout(ctx, &nggso); - } - /* Write shader query data. */ tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state); tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, ""); @@ -2483,11 +1911,11 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = ngg_gs_vertex_ptr(ctx, tid); flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + prim.edgeflags = ctx->ac.i32_0; for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; } /* Geometry shaders output triangle strips, but NGG expects triangles. 
*/ @@ -2496,7 +1924,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) is_odd = LLVMBuildTrunc(builder, is_odd, ctx->ac.i1, ""); LLVMValueRef flatshade_first = - LLVMConstInt(ctx->ac.i32, !ctx->args->options->key.vs.provoking_vtx_last, false); + LLVMConstInt(ctx->ac.i1, !ctx->options->key.vs.provoking_vtx_last, false); ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, prim.index); } @@ -2509,8 +1937,8 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, ""); ac_build_ifcc(&ctx->ac, tmp, 5145); { - struct radv_vs_output_info *outinfo = &ctx->args->shader_info->vs.outinfo; - bool export_view_index = ctx->args->options->key.has_multiview_view_index; + struct radv_vs_output_info *outinfo = &ctx->shader_info->vs.outinfo; + bool export_view_index = ctx->options->key.has_multiview_view_index; struct radv_shader_output_values *outputs; unsigned noutput = 0; @@ -2518,10 +1946,6 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) unsigned num_outputs = util_bitcount64(ctx->output_mask) + export_view_index; outputs = calloc(num_outputs, sizeof(outputs[0])); - memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(outinfo->vs_output_param_offset)); - outinfo->pos_exports = 0; - tmp = ngg_gs_vertex_ptr(ctx, tid); tmp = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 1), ""); tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, ""); @@ -2529,7 +1953,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) unsigned out_idx = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; + unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i))) @@ -2573,8 +1997,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx) noutput++; } - radv_llvm_export_vs(ctx, outputs, noutput, outinfo, - ctx->args->options->key.vs_common_out.export_clip_dists); + radv_llvm_export_vs(ctx, outputs, noutput, outinfo, outinfo->export_clip_dists); FREE(outputs); } ac_build_endif(&ctx->ac, 5145); @@ -2590,8 +2013,8 @@ gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMV const LLVMValueRef vertexptr = ngg_gs_emit_vertex_ptr(ctx, get_thread_id_in_tg(ctx), vertexidx); unsigned out_idx = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; - uint8_t output_stream = ctx->args->shader_info->gs.output_streams[i]; + unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i]; + uint8_t output_stream = ctx->shader_info->gs.output_streams[i]; LLVMValueRef *out_ptr = &addrs[i * 4]; int length = util_last_bit(output_usage_mask); @@ -2609,7 +2032,7 @@ gfx10_ngg_gs_emit_vertex(struct radv_shader_context *ctx, unsigned stream, LLVMV LLVMBuildStore(builder, out_val, ngg_gs_get_emit_output_ptr(ctx, vertexptr, out_idx)); } } - assert(out_idx * 4 <= ctx->args->shader_info->gs.gsvs_vertex_size); + assert(out_idx * 4 <= ctx->shader_info->gs.gsvs_vertex_size); /* Store the current number of emitted vertices to zero out remaining * primitive flags in case the geometry shader doesn't emit the maximum @@ -2701,22 +2124,22 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) } /* Process depth, stencil, samplemask. 
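 *
 * The exports emitted below follow one hardware rule worth spelling out: the
 * DONE bit must land on the very last export of the shader, so it goes on
 * the final color export only when no depth/stencil/samplemask export will
 * follow. A condensed restatement with an assumed struct, for illustration
 * only:
 */

struct color_export_state {
   bool valid_mask; /* whether the EXEC mask is valid */
   bool done;       /* DONE bit, ends the export sequence */
};

/* Hypothetical restatement of the rule applied below. */
static void
mark_last_color_export_done(struct color_export_state *exports, unsigned count,
                            bool writes_z, bool writes_stencil,
                            bool writes_sample_mask)
{
   if (count > 0 && !writes_z && !writes_stencil && !writes_sample_mask) {
      exports[count - 1].valid_mask = true;
      exports[count - 1].done = true;
   }
}

/*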
*/ - if (ctx->args->shader_info->ps.writes_z) { + if (ctx->shader_info->ps.writes_z) { depth = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_DEPTH, 0)); } - if (ctx->args->shader_info->ps.writes_stencil) { + if (ctx->shader_info->ps.writes_stencil) { stencil = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_STENCIL, 0)); } - if (ctx->args->shader_info->ps.writes_sample_mask) { + if (ctx->shader_info->ps.writes_sample_mask) { samplemask = ac_to_float(&ctx->ac, radv_load_output(ctx, FRAG_RESULT_SAMPLE_MASK, 0)); } /* Set the DONE bit on last non-null color export only if Z isn't * exported. */ - if (index > 0 && !ctx->args->shader_info->ps.writes_z && - !ctx->args->shader_info->ps.writes_stencil && - !ctx->args->shader_info->ps.writes_sample_mask) { + if (index > 0 && !ctx->shader_info->ps.writes_z && + !ctx->shader_info->ps.writes_stencil && + !ctx->shader_info->ps.writes_sample_mask) { unsigned last = index - 1; color_args[last].valid_mask = 1; /* whether the EXEC mask is valid */ @@ -2736,7 +2159,7 @@ handle_fs_outputs_post(struct radv_shader_context *ctx) static void emit_gs_epilogue(struct radv_shader_context *ctx) { - if (ctx->args->options->key.vs_common_out.as_ngg) { + if (ctx->shader_info->is_ngg) { gfx10_ngg_gs_emit_epilogue_1(ctx); return; } @@ -2748,22 +2171,22 @@ emit_gs_epilogue(struct radv_shader_context *ctx) } static void -handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) +handle_shader_outputs_post(struct ac_shader_abi *abi) { struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); switch (ctx->stage) { case MESA_SHADER_VERTEX: - if (ctx->args->options->key.vs_common_out.as_ls) + if (ctx->shader_info->vs.as_ls) break; /* Lowered in NIR */ - else if (ctx->args->options->key.vs_common_out.as_es) + else if (ctx->shader_info->vs.as_es) break; /* Lowered in NIR */ - else if (ctx->args->options->key.vs_common_out.as_ngg) - handle_ngg_outputs_post_1(ctx); + else if (ctx->shader_info->is_ngg) + break; else - handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, - ctx->args->options->key.vs_common_out.export_clip_dists, - &ctx->args->shader_info->vs.outinfo); + handle_vs_outputs_post(ctx, ctx->shader_info->vs.outinfo.export_prim_id, + ctx->shader_info->vs.outinfo.export_clip_dists, + &ctx->shader_info->vs.outinfo); break; case MESA_SHADER_FRAGMENT: handle_fs_outputs_post(ctx); @@ -2774,14 +2197,14 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVM case MESA_SHADER_TESS_CTRL: break; /* Lowered in NIR */ case MESA_SHADER_TESS_EVAL: - if (ctx->args->options->key.vs_common_out.as_es) + if (ctx->shader_info->tes.as_es) break; /* Lowered in NIR */ - else if (ctx->args->options->key.vs_common_out.as_ngg) - handle_ngg_outputs_post_1(ctx); + else if (ctx->shader_info->is_ngg) + break; else - handle_vs_outputs_post(ctx, ctx->args->options->key.vs_common_out.export_prim_id, - ctx->args->options->key.vs_common_out.export_clip_dists, - &ctx->args->shader_info->tes.outinfo); + handle_vs_outputs_post(ctx, ctx->shader_info->tes.outinfo.export_prim_id, + ctx->shader_info->tes.outinfo.export_clip_dists, + &ctx->shader_info->tes.outinfo); break; default: break; @@ -2789,8 +2212,7 @@ handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs, LLVM } static void -ac_llvm_finalize_module(struct radv_shader_context *ctx, LLVMPassManagerRef passmgr, - const struct radv_nir_compiler_options *options) +ac_llvm_finalize_module(struct radv_shader_context 
*ctx, LLVMPassManagerRef passmgr) { LLVMRunPassManager(passmgr, ctx->ac.module); LLVMDisposeBuilder(ctx->ac.builder); @@ -2810,15 +2232,15 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) case MESA_SHADER_GEOMETRY: return; case MESA_SHADER_VERTEX: - if (ctx->args->options->key.vs_common_out.as_ls || - ctx->args->options->key.vs_common_out.as_es) + if (ctx->shader_info->vs.as_ls || + ctx->shader_info->vs.as_es) return; - outinfo = &ctx->args->shader_info->vs.outinfo; + outinfo = &ctx->shader_info->vs.outinfo; break; case MESA_SHADER_TESS_EVAL: - if (ctx->args->options->key.vs_common_out.as_es) + if (ctx->shader_info->tes.as_es) return; - outinfo = &ctx->args->shader_info->tes.outinfo; + outinfo = &ctx->shader_info->tes.outinfo; break; default: unreachable("Unhandled shader type"); @@ -2831,8 +2253,10 @@ ac_nir_eliminate_const_vs_outputs(struct radv_shader_context *ctx) static void ac_setup_rings(struct radv_shader_context *ctx) { - if (ctx->args->options->chip_class <= GFX8 && - (ctx->stage == MESA_SHADER_GEOMETRY || ctx->args->options->key.vs_common_out.as_es)) { + if (ctx->options->chip_class <= GFX8 && + (ctx->stage == MESA_SHADER_GEOMETRY || + (ctx->stage == MESA_SHADER_VERTEX && ctx->shader_info->vs.as_es) || + (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->shader_info->tes.as_es))) { unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS : RING_ESGS_VS; LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false); @@ -2865,7 +2289,7 @@ ac_setup_rings(struct radv_shader_context *ctx) unsigned num_components, stride; LLVMValueRef ring, tmp; - num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; + num_components = ctx->shader_info->gs.num_stream_output_components[stream]; if (!num_components) continue; @@ -2905,17 +2329,6 @@ ac_setup_rings(struct radv_shader_context *ctx) } } -unsigned -radv_nir_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, - const struct nir_shader *nir) -{ - const unsigned backup_sizes[] = {chip_class >= GFX9 ? 128 : 64, 1, 1}; - unsigned sizes[3]; - for (unsigned i = 0; i < 3; i++) - sizes[i] = nir ? nir->info.workgroup_size[i] : backup_sizes[i]; - return radv_get_max_workgroup_size(chip_class, stage, sizes); -} - /* Fixup the HW not emitting the TCS regs if there are no HS threads. 
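 *
 * The workaround's core is a per-input select: if the merged LS/HS wave
 * contains zero HS threads, the hardware has not initialized the TCS input
 * registers and the LS values live elsewhere, so each affected input is
 * chosen based on the HS thread count. A conceptual scalar sketch, not the
 * driver's actual LLVM IR building code:
 */

/* Hypothetical helper: pick the usable input value depending on whether any
 * HS threads exist in the merged LS/HS wave. */
static inline uint32_t
fixup_ls_hs_input(uint32_t num_hs_threads, uint32_t normal_value,
                  uint32_t fallback_value)
{
   return num_hs_threads == 0 ? fallback_value : normal_value;
}

/*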
*/ static void ac_nir_fixup_ls_hs_input_vgprs(struct radv_shader_context *ctx) @@ -2940,7 +2353,7 @@ prepare_gs_input_vgprs(struct radv_shader_context *ctx, bool merged) if (merged) { for (int i = 5; i >= 0; --i) { ctx->gs_vtx_offset[i] = ac_unpack_param( - &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i & ~1]), (i & 1) * 16, 16); + &ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.gs_vtx_offset[i / 2]), (i & 1) * 16, 16); } ctx->gs_wave_id = @@ -2972,39 +2385,37 @@ declare_esgs_ring(struct radv_shader_context *ctx) } static LLVMModuleRef -ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *const *shaders, - int shader_count, const struct radv_shader_args *args) +ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, + const struct radv_nir_compiler_options *options, + struct radv_shader_info *info, + struct nir_shader *const *shaders, int shader_count, + const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; ctx.args = args; + ctx.options = options; + ctx.shader_info = info; enum ac_float_mode float_mode = AC_FLOAT_MODE_DEFAULT; - if (args->shader_info->float_controls_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) { + if (shaders[0]->info.float_controls_execution_mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) { float_mode = AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO; } - ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family, - args->options->info, float_mode, args->shader_info->wave_size, - args->shader_info->ballot_bit_size); + ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, options->family, + options->info, float_mode, info->wave_size, info->ballot_bit_size); ctx.context = ctx.ac.context; - ctx.max_workgroup_size = 0; - for (int i = 0; i < shader_count; ++i) { - ctx.max_workgroup_size = MAX2( - ctx.max_workgroup_size, radv_nir_get_max_workgroup_size( - args->options->chip_class, shaders[i]->info.stage, shaders[i])); - } + ctx.max_workgroup_size = info->workgroup_size; if (ctx.ac.chip_class >= GFX10) { - if (is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg) { + if (is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg) { ctx.max_workgroup_size = 128; } } create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2); - ctx.abi.inputs = &ctx.inputs[0]; ctx.abi.emit_outputs = handle_shader_outputs_post; ctx.abi.emit_vertex_with_counter = visit_emit_vertex_with_counter; ctx.abi.load_ubo = radv_load_ubo; @@ -3015,10 +2426,10 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co ctx.abi.load_ring_tess_offchip = load_ring_tess_offchip; ctx.abi.load_ring_esgs = load_ring_esgs; ctx.abi.clamp_shadow_reference = false; - ctx.abi.adjust_frag_coord_z = args->options->adjust_frag_coord_z; - ctx.abi.robust_buffer_access = args->options->robust_buffer_access; + ctx.abi.adjust_frag_coord_z = options->adjust_frag_coord_z; + ctx.abi.robust_buffer_access = options->robust_buffer_access; - bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && args->options->key.vs_common_out.as_ngg; + bool is_ngg = is_pre_gs_stage(shaders[0]->info.stage) && info->is_ngg; if (shader_count >= 2 || is_ngg) ac_init_exec_full_mask(&ctx.ac); @@ -3029,7 +2440,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co if (args->ac.instance_id.used) ctx.abi.instance_id = ac_get_arg(&ctx.ac, args->ac.instance_id); - if (args->options->has_ls_vgpr_init_bug && + if (options->has_ls_vgpr_init_bug && shaders[shader_count - 
1]->info.stage == MESA_SHADER_TESS_CTRL) ac_nir_fixup_ls_hs_input_vgprs(&ctx); @@ -3041,20 +2452,9 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co * Add an extra dword per vertex to ensure an odd stride, which * avoids bank conflicts for SoA accesses. */ - if (!args->options->key.vs_common_out.as_ngg_passthrough) + if (!info->is_ngg_passthrough) declare_esgs_ring(&ctx); - /* This is really only needed when streamout and / or vertex - * compaction is enabled. - */ - if (args->shader_info->so.num_outputs) { - LLVMTypeRef asi32 = LLVMArrayType(ctx.ac.i32, 8); - ctx.gs_ngg_scratch = - LLVMAddGlobalInAddressSpace(ctx.ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx.gs_ngg_scratch, 4); - } - /* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */ if (ctx.ac.chip_class == GFX10 && shader_count == 1) ac_build_s_barrier(&ctx.ac); @@ -3069,17 +2469,13 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co for (int i = 0; i < 4; i++) { ctx.gs_next_vertex[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - if (args->options->key.vs_common_out.as_ngg) { + if (info->is_ngg) { for (unsigned i = 0; i < 4; ++i) { ctx.gs_curprim_verts[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); ctx.gs_generated_prims[i] = ac_build_alloca(&ctx.ac, ctx.ac.i32, ""); } - unsigned scratch_size = 8; - if (args->shader_info->so.num_outputs) - scratch_size = 44; - - LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, scratch_size); + LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); ctx.gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); LLVMSetInitializer(ctx.gs_ngg_scratch, LLVMGetUndef(ai32)); @@ -3093,25 +2489,23 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co ctx.abi.emit_primitive = visit_end_primitive; } else if (shaders[shader_idx]->info.stage == MESA_SHADER_TESS_EVAL) { - ctx.abi.load_tess_coord = load_tess_coord; } else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) { ctx.abi.load_base_vertex = radv_load_base_vertex; + ctx.abi.load_inputs = radv_load_vs_inputs; } else if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) { ctx.abi.load_sample_position = load_sample_position; ctx.abi.load_sample_mask_in = load_sample_mask_in; } - if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX && - args->options->key.vs_common_out.as_ngg && - args->options->key.vs_common_out.export_prim_id) { + if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX && info->is_ngg && + info->vs.outinfo.export_prim_id) { declare_esgs_ring(&ctx); } bool nested_barrier = false; if (shader_idx) { - if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && - args->options->key.vs_common_out.as_ngg) { + if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && info->is_ngg) { gfx10_ngg_gs_emit_prologue(&ctx); nested_barrier = false; } else { @@ -3162,8 +2556,6 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, struct nir_shader *co if (shaders[shader_idx]->info.stage == MESA_SHADER_FRAGMENT) prepare_interp_optimize(&ctx, shaders[shader_idx]); - else if (shaders[shader_idx]->info.stage == MESA_SHADER_VERTEX) - handle_vs_inputs(&ctx, shaders[shader_idx]); else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY) prepare_gs_input_vgprs(&ctx, shader_count >= 2); @@ -3176,33 +2568,28 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, 
struct nir_shader *co /* This needs to be outside the if wrapping the shader body, as sometimes * the HW generates waves with 0 es/vs threads. */ - if (is_pre_gs_stage(shaders[shader_idx]->info.stage) && - args->options->key.vs_common_out.as_ngg && shader_idx == shader_count - 1) { + if (is_pre_gs_stage(shaders[shader_idx]->info.stage) && info->is_ngg && + shader_idx == shader_count - 1) { handle_ngg_outputs_post_2(&ctx); - } else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && - args->options->key.vs_common_out.as_ngg) { + } else if (shaders[shader_idx]->info.stage == MESA_SHADER_GEOMETRY && info->is_ngg) { gfx10_ngg_gs_emit_epilogue_2(&ctx); } } LLVMBuildRetVoid(ctx.ac.builder); - if (args->options->dump_preoptir) { + if (options->dump_preoptir) { fprintf(stderr, "%s LLVM IR:\n\n", - radv_get_shader_name(args->shader_info, shaders[shader_count - 1]->info.stage)); + radv_get_shader_name(info, shaders[shader_count - 1]->info.stage)); ac_dump_module(ctx.ac.module); fprintf(stderr, "\n"); } - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr); if (shader_count == 1) ac_nir_eliminate_const_vs_outputs(&ctx); - if (args->options->dump_shader) { - args->shader_info->private_mem_vgprs = ac_count_scratch_private_memory(ctx.main_function); - } - return ctx.ac.module; } @@ -3288,25 +2675,21 @@ ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm, LLVMModuleRef llvm_modu } static void -radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, struct radv_shader_binary **rbinary, +radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm, + const struct radv_nir_compiler_options *options, + struct radv_shader_info *info, + struct radv_shader_binary **rbinary, const struct radv_shader_args *args, struct nir_shader *const *nir, int nir_count) { LLVMModuleRef llvm_module; - llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, args); + llvm_module = ac_translate_nir_to_llvm(ac_llvm, options, info, nir, nir_count, args); ac_compile_llvm_module(ac_llvm, llvm_module, rbinary, nir[nir_count - 1]->info.stage, - radv_get_shader_name(args->shader_info, nir[nir_count - 1]->info.stage), - args->options); - - /* Determine the ES type (VS or TES) for the GS on GFX9. */ - if (args->options->chip_class >= GFX9) { - if (nir_count == 2 && nir[1]->info.stage == MESA_SHADER_GEOMETRY) { - args->shader_info->gs.es_type = nir[0]->info.stage; - } - } + radv_get_shader_name(info, nir[nir_count - 1]->info.stage), + options); } static void @@ -3318,7 +2701,7 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) LLVMValueRef stream_id; /* Fetch the vertex stream ID. 
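 *
 * The stream ID is a two-bit field starting at bit 24 of the streamout
 * config word; that is all the ac_unpack_param(..., 24, 2) call below does.
 * The scalar equivalent, for reference:
 */

static inline unsigned
get_stream_id(uint32_t streamout_config)
{
   /* bits [25:24] select one of the four vertex streams */
   return (streamout_config >> 24) & 0x3;
}

/*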
*/ - if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs) { + if (ctx->shader_info->so.num_outputs) { stream_id = ac_unpack_param(&ctx->ac, ac_get_arg(&ctx->ac, ctx->args->ac.streamout_config), 24, 2); } else { @@ -3332,14 +2715,14 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) switch_inst = LLVMBuildSwitch(ctx->ac.builder, stream_id, end_bb, 4); for (unsigned stream = 0; stream < 4; stream++) { - unsigned num_components = ctx->args->shader_info->gs.num_stream_output_components[stream]; + unsigned num_components = ctx->shader_info->gs.num_stream_output_components[stream]; LLVMBasicBlockRef bb; unsigned offset; if (stream > 0 && !num_components) continue; - if (stream > 0 && !ctx->args->shader_info->so.num_outputs) + if (stream > 0 && !ctx->shader_info->so.num_outputs) continue; bb = LLVMInsertBasicBlockInContext(ctx->ac.context, end_bb, "out"); @@ -3348,8 +2731,8 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) offset = 0; for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) { - unsigned output_usage_mask = ctx->args->shader_info->gs.output_usage_mask[i]; - unsigned output_stream = ctx->args->shader_info->gs.output_streams[i]; + unsigned output_usage_mask = ctx->shader_info->gs.output_usage_mask[i]; + unsigned output_stream = ctx->shader_info->gs.output_streams[i]; int length = util_last_bit(output_usage_mask); if (!(ctx->output_mask & (1ull << i)) || output_stream != stream) @@ -3380,11 +2763,12 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) } } - if (!ctx->args->options->use_ngg_streamout && ctx->args->shader_info->so.num_outputs) + if (ctx->shader_info->so.num_outputs) radv_emit_streamout(ctx, stream); if (stream == 0) { - handle_vs_outputs_post(ctx, false, true, &ctx->args->shader_info->vs.outinfo); + handle_vs_outputs_post(ctx, false, ctx->shader_info->vs.outinfo.export_clip_dists, + &ctx->shader_info->vs.outinfo); } LLVMBuildBr(ctx->ac.builder, end_bb); @@ -3394,17 +2778,22 @@ ac_gs_copy_shader_emit(struct radv_shader_context *ctx) } static void -radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader *geom_shader, +radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, + const struct radv_nir_compiler_options *options, + struct radv_shader_info *info, + struct nir_shader *geom_shader, struct radv_shader_binary **rbinary, const struct radv_shader_args *args) { struct radv_shader_context ctx = {0}; ctx.args = args; + ctx.options = options; + ctx.shader_info = info; assert(args->is_gs_copy_shader); - ac_llvm_context_init(&ctx.ac, ac_llvm, args->options->chip_class, args->options->family, - args->options->info, AC_FLOAT_MODE_DEFAULT, 64, 64); + ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, options->family, + options->info, AC_FLOAT_MODE_DEFAULT, 64, 64); ctx.context = ctx.ac.context; ctx.stage = MESA_SHADER_VERTEX; @@ -3424,31 +2813,31 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm, struct nir_shader LLVMBuildRetVoid(ctx.ac.builder); - ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, args->options); + ac_llvm_finalize_module(&ctx, ac_llvm->passmgr); ac_compile_llvm_module(ac_llvm, ctx.ac.module, rbinary, MESA_SHADER_VERTEX, "GS Copy Shader", - args->options); + options); (*rbinary)->is_gs_copy_shader = true; } void -llvm_compile_shader(struct radv_device *device, unsigned shader_count, +llvm_compile_shader(const struct radv_nir_compiler_options *options, + struct radv_shader_info *info, unsigned shader_count, struct nir_shader *const *shaders, struct 
radv_shader_binary **binary, - struct radv_shader_args *args) + const struct radv_shader_args *args) { enum ac_target_machine_options tm_options = 0; struct ac_llvm_compiler ac_llvm; tm_options |= AC_TM_SUPPORTS_SPILL; - if (args->options->check_ir) + if (options->check_ir) tm_options |= AC_TM_CHECK_IR; - radv_init_llvm_compiler(&ac_llvm, args->options->family, tm_options, - args->shader_info->wave_size); + radv_init_llvm_compiler(&ac_llvm, options->family, tm_options, info->wave_size); if (args->is_gs_copy_shader) { - radv_compile_gs_copy_shader(&ac_llvm, *shaders, binary, args); + radv_compile_gs_copy_shader(&ac_llvm, options, info, *shaders, binary, args); } else { - radv_compile_nir_shader(&ac_llvm, binary, args, shaders, shader_count); + radv_compile_nir_shader(&ac_llvm, options, info, binary, args, shaders, shader_count); } } diff --git a/mesa 3D driver/src/amd/vulkan/radv_pass.c b/mesa 3D driver/src/amd/vulkan/radv_pass.c index d5121d6295..26063f57dd 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_pass.c +++ b/mesa 3D driver/src/amd/vulkan/radv_pass.c @@ -392,7 +392,7 @@ radv_CreateRenderPass2(VkDevice _device, const VkRenderPassCreateInfo2 *pCreateI pass = vk_alloc2(&device->vk.alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); memset(pass, 0, size); @@ -429,7 +429,7 @@ radv_CreateRenderPass2(VkDevice _device, const VkRenderPassCreateInfo2 *pCreateI VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass->subpass_attachments == NULL) { radv_destroy_render_pass(device, pAllocator, pass); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } else pass->subpass_attachments = NULL; diff --git a/mesa 3D driver/src/amd/vulkan/radv_pipeline.c b/mesa 3D driver/src/amd/vulkan/radv_pipeline.c index bf78a8cceb..e397891b68 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_pipeline.c +++ b/mesa 3D driver/src/amd/vulkan/radv_pipeline.c @@ -173,10 +173,18 @@ radv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline) return !!pipeline->gs_copy_shader; } -static void +void radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline, const VkAllocationCallbacks *allocator) { + if (pipeline->type == RADV_PIPELINE_COMPUTE) { + free(pipeline->compute.rt_group_handles); + free(pipeline->compute.rt_stack_sizes); + } else if (pipeline->type == RADV_PIPELINE_LIBRARY) { + free(pipeline->library.groups); + free(pipeline->library.stages); + } + for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) if (pipeline->shaders[i]) radv_shader_variant_destroy(device, pipeline->shaders[i]); @@ -204,15 +212,15 @@ radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, radv_pipeline_destroy(device, pipeline, pAllocator); } -static uint32_t -get_hash_flags(const struct radv_device *device, bool stats) +uint32_t +radv_get_hash_flags(const struct radv_device *device, bool stats) { uint32_t hash_flags = 0; - if (device->instance->debug_flags & RADV_DEBUG_NO_NGG) - hash_flags |= RADV_HASH_SHADER_NO_NGG; - if (device->instance->perftest_flags & RADV_PERFTEST_NGGC) - hash_flags |= RADV_HASH_SHADER_FORCE_NGG_CULLING; + if (device->physical_device->use_ngg_culling) + hash_flags |= RADV_HASH_SHADER_USE_NGG_CULLING; + if (device->instance->perftest_flags & RADV_PERFTEST_FORCE_EMULATE_RT) + hash_flags |= RADV_HASH_SHADER_FORCE_EMULATE_RT; if (device->physical_device->cs_wave_size == 32) 
hash_flags |= RADV_HASH_SHADER_CS_WAVE32; if (device->physical_device->ps_wave_size == 32) @@ -221,20 +229,12 @@ get_hash_flags(const struct radv_device *device, bool stats) hash_flags |= RADV_HASH_SHADER_GE_WAVE32; if (device->physical_device->use_llvm) hash_flags |= RADV_HASH_SHADER_LLVM; - if (device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE) - hash_flags |= RADV_HASH_SHADER_DISCARD_TO_DEMOTE; - if (device->instance->enable_mrt_output_nan_fixup) - hash_flags |= RADV_HASH_SHADER_MRT_NAN_FIXUP; - if (device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM) - hash_flags |= RADV_HASH_SHADER_INVARIANT_GEOM; if (stats) hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS; - if (device->force_vrs != RADV_FORCE_VRS_2x2) - hash_flags |= RADV_HASH_SHADER_FORCE_VRS_2x2; - if (device->force_vrs != RADV_FORCE_VRS_2x1) - hash_flags |= RADV_HASH_SHADER_FORCE_VRS_2x1; - if (device->force_vrs != RADV_FORCE_VRS_1x2) - hash_flags |= RADV_HASH_SHADER_FORCE_VRS_1x2; + if (device->robust_buffer_access) /* forces per-attribute vertex descriptors */ + hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS; + if (device->robust_buffer_access2) /* affects load/store vectorizer */ + hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2; return hash_flags; } @@ -243,7 +243,6 @@ radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipelin { unsigned scratch_bytes_per_wave = 0; unsigned max_waves = 0; - unsigned min_waves = 1; for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if (pipeline->shaders[i] && pipeline->shaders[i]->config.scratch_bytes_per_wave) { @@ -254,18 +253,11 @@ radv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipelin max_stage_waves = MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_good_compute_units * - (256 / pipeline->shaders[i]->config.num_vgprs)); + radv_get_max_waves(device, pipeline->shaders[i], i)); max_waves = MAX2(max_waves, max_stage_waves); } } - if (pipeline->shaders[MESA_SHADER_COMPUTE]) { - unsigned group_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[0] * - pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[1] * - pipeline->shaders[MESA_SHADER_COMPUTE]->info.cs.block_size[2]; - min_waves = MAX2(min_waves, round_up_u32(group_size, 64)); - } - pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave; pipeline->max_waves = max_waves; } @@ -635,9 +627,18 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline, cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY); } - blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) | - S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | - S_028B70_OFFSET_ROUND(1); + if (pipeline->device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING) + { + blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) | + S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(0); + } + else + { + blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) | + S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) | + S_028B70_OFFSET_ROUND(1); + } if (vkms && vkms->alphaToCoverageEnable) { blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1); @@ -1233,30 +1234,6 @@ si_conv_gl_prim_to_gs_out(unsigned gl_prim) } } -static uint32_t -si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology) -{ - switch (topology) { - case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: - case 
VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: - return V_028A6C_POINTLIST; - case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: - case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: - case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: - case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: - return V_028A6C_LINESTRIP; - case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: - case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: - case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: - case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: - case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: - return V_028A6C_TRISTRIP; - default: - assert(0); - return 0; - } -} - static uint64_t radv_dynamic_state_mask(VkDynamicState state) { @@ -1321,6 +1298,8 @@ radv_dynamic_state_mask(VkDynamicState state) return RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE; case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT: return RADV_DYNAMIC_COLOR_WRITE_ENABLE; + case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT: + return RADV_DYNAMIC_VERTEX_INPUT; default: unreachable("Unhandled dynamic state"); } @@ -1357,7 +1336,8 @@ radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo *pCreateIn if (pCreateInfo->pRasterizationState->rasterizerDiscardEnable && !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT)) { return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE | - RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE; + RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE | + RADV_DYNAMIC_VERTEX_INPUT; } if (!pCreateInfo->pRasterizationState->depthBiasEnable && @@ -1383,10 +1363,14 @@ radv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo *pCreateIn PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT)) states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS; - if (!pCreateInfo->pRasterizationState || - !vk_find_struct_const(pCreateInfo->pRasterizationState->pNext, - PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT)) + if (!pCreateInfo->pRasterizationState) states &= ~RADV_DYNAMIC_LINE_STIPPLE; + else { + const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const(pCreateInfo->pRasterizationState->pNext, + PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT); + if (!rast_line_info || !rast_line_info->stippledLineEnable) + states &= ~RADV_DYNAMIC_LINE_STIPPLE; + } if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) && @@ -1700,7 +1684,8 @@ radv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline, dynamic->line_stipple.pattern = rast_line_info->lineStipplePattern; } - if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE)) + if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE) || + !(states & RADV_DYNAMIC_VERTEX_INPUT)) pipeline->graphics.uses_dynamic_stride = true; const VkPipelineFragmentShadingRateStateCreateInfoKHR *shading_rate = vk_find_struct_const( @@ -1842,15 +1827,16 @@ gfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline { struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY]; struct radv_es_output_info *es_info; + bool has_tess = !!nir[MESA_SHADER_TESS_CTRL]; if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) - es_info = nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info; + es_info = has_tess ? &gs_info->tes.es_info : &gs_info->vs.es_info; else - es_info = nir[MESA_SHADER_TESS_CTRL] ? &infos[MESA_SHADER_TESS_EVAL].tes.es_info - : &infos[MESA_SHADER_VERTEX].vs.es_info; + es_info = has_tess ? 
&infos[MESA_SHADER_TESS_EVAL].tes.es_info + : &infos[MESA_SHADER_VERTEX].vs.es_info; unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1); bool uses_adjacency; - switch (key->topology) { + switch (key->vs.topology) { case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: @@ -1949,6 +1935,14 @@ gfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup); out->vgt_esgs_ring_itemsize = esgs_itemsize; assert(max_prims_per_subgroup <= max_out_prims); + + gl_shader_stage es_stage = has_tess ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; + unsigned workgroup_size = + ac_compute_esgs_workgroup_size( + pipeline->device->physical_device->rad_info.chip_class, infos[es_stage].wave_size, + es_verts_per_subgroup, gs_inst_prims_in_subgroup); + infos[es_stage].workgroup_size = workgroup_size; + infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size; } static void @@ -1986,12 +1980,6 @@ radv_get_num_input_vertices(nir_shader **nir) static void gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum chip_class chip_class, uint32_t oversub_pc_lines) { - if (chip_class == GFX10) { - /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); - } - radeon_set_uconfig_reg( cs, R_030980_GE_PC_ALLOC, S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1)); @@ -2009,7 +1997,7 @@ gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pi unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1; unsigned gs_num_invocations = nir[MESA_SHADER_GEOMETRY] ? MAX2(gs_info->gs.invocations, 1) : 1; bool uses_adjacency; - switch (key->topology) { + switch (key->vs.topology) { case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: @@ -2218,6 +2206,13 @@ gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pi } assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */ + + gl_shader_stage es_stage = nir[MESA_SHADER_TESS_CTRL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; + unsigned workgroup_size = + ac_compute_ngg_workgroup_size( + max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor); + infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size; + infos[es_stage].workgroup_size = workgroup_size; } static void @@ -2290,8 +2285,53 @@ get_vs_output_info(const struct radv_pipeline *pipeline) return &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.outinfo; } +static bool +radv_nir_stage_uses_xfb(const nir_shader *nir) +{ + nir_xfb_info *xfb = nir_gather_xfb_info(nir, NULL); + bool uses_xfb = !!xfb; + + ralloc_free(xfb); + return uses_xfb; +} + +static bool +radv_lower_viewport_to_zero(nir_shader *nir) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(nir); + + nir_builder b; + nir_builder_init(&b, impl); + + /* There should be only one deref load for VIEWPORT after lower_io_to_temporaries. 
*/ + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_deref) + continue; + + nir_variable *var = nir_intrinsic_get_var(intr, 0); + if (var->data.mode != nir_var_shader_in || + var->data.location != VARYING_SLOT_VIEWPORT) + continue; + + b.cursor = nir_before_instr(instr); + + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_zero(&b, 1, 32)); + return true; + } + } + + return false; +} + static void -radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders, +radv_link_shaders(struct radv_pipeline *pipeline, + const struct radv_pipeline_key *pipeline_key, + nir_shader **shaders, bool optimize_conservatively) { nir_shader *ordered_shaders[MESA_SHADER_STAGES]; @@ -2375,6 +2415,49 @@ radv_link_shaders(struct radv_pipeline *pipeline, nir_shader **shaders, } } + bool uses_xfb = pipeline->graphics.last_vgt_api_stage != -1 && + radv_nir_stage_uses_xfb(shaders[pipeline->graphics.last_vgt_api_stage]); + if (!uses_xfb && !optimize_conservatively) { + /* Remove PSIZ from shaders when it's not needed. + * This is typically produced by translation layers like Zink or D9VK. + */ + for (unsigned i = 0; i < shader_count; ++i) { + shader_info *info = &ordered_shaders[i]->info; + if (!(info->outputs_written & VARYING_BIT_PSIZ)) + continue; + + bool next_stage_needs_psiz = + i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */ + ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ; + bool topology_uses_psiz = + info->stage == pipeline->graphics.last_vgt_api_stage && + ((info->stage == MESA_SHADER_VERTEX && pipeline_key->vs.topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) || + (info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) || + (info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == GL_POINTS)); + + nir_variable *psiz_var = + nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ); + + if (!next_stage_needs_psiz && !topology_uses_psiz && psiz_var) { + /* Change PSIZ to a global variable which allows it to be DCE'd. */ + psiz_var->data.location = 0; + psiz_var->data.mode = nir_var_shader_temp; + + info->outputs_written &= ~VARYING_BIT_PSIZ; + nir_fixup_deref_modes(ordered_shaders[i]); + nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_temp, NULL); + nir_opt_dce(ordered_shaders[i]); + } + } + } + + /* Lower the viewport index to zero when the last vertex stage doesn't export it. 
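 *
 * When no vertex processing stage writes ViewportIndex, the first viewport
 * is used, so a fragment shader reading that input may simply see zero; that
 * is what the lowering pass above folds in. The trigger condition, restated
 * as a predicate (hypothetical helper mirroring the check below, assuming
 * the nir.h types already in scope):
 */

static bool
needs_viewport_lowering(const nir_shader *fs, const nir_shader *last_vgt_shader)
{
   return fs != NULL &&
          (fs->info.inputs_read & VARYING_BIT_VIEWPORT) &&
          !(last_vgt_shader->info.outputs_written & VARYING_BIT_VIEWPORT);
}

/*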
*/ + if (shaders[MESA_SHADER_FRAGMENT] && + (shaders[MESA_SHADER_FRAGMENT]->info.inputs_read & VARYING_BIT_VIEWPORT) && + !(shaders[pipeline->graphics.last_vgt_api_stage]->info.outputs_written & VARYING_BIT_VIEWPORT)) { + radv_lower_viewport_to_zero(shaders[MESA_SHADER_FRAGMENT]); + } + for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) { if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) { nir_opt_constant_folding(ordered_shaders[i - 1]); @@ -2519,9 +2602,6 @@ radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline, { RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass); struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass; - const VkPipelineVertexInputStateCreateInfo *input_state = pCreateInfo->pVertexInputState; - const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state = - vk_find_struct_const(input_state->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); bool uses_dynamic_stride = false; struct radv_pipeline_key key; @@ -2532,153 +2612,122 @@ radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline, key.has_multiview_view_index = !!subpass->view_mask; - uint32_t binding_input_rate = 0; - uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; - for (unsigned i = 0; i < input_state->vertexBindingDescriptionCount; ++i) { - if (input_state->pVertexBindingDescriptions[i].inputRate) { - unsigned binding = input_state->pVertexBindingDescriptions[i].binding; - binding_input_rate |= 1u << binding; - instance_rate_divisors[binding] = 1; - } - } - if (divisor_state) { - for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) { - instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] = - divisor_state->pVertexBindingDivisors[i].divisor; - } - } - if (pCreateInfo->pDynamicState) { uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount; for (uint32_t i = 0; i < count; i++) { - if (pCreateInfo->pDynamicState->pDynamicStates[i] == - VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT) { - uses_dynamic_stride = true; + if (pCreateInfo->pDynamicState->pDynamicStates[i] == VK_DYNAMIC_STATE_VERTEX_INPUT_EXT) { + key.vs.dynamic_input_state = true; + /* we don't care about use_dynamic_stride in this case */ break; + } else if (pCreateInfo->pDynamicState->pDynamicStates[i] == + VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT) { + uses_dynamic_stride = true; } } } - for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) { - const VkVertexInputAttributeDescription *desc = &input_state->pVertexAttributeDescriptions[i]; - const struct util_format_description *format_desc; - unsigned location = desc->location; - unsigned binding = desc->binding; - unsigned num_format, data_format; - int first_non_void; + if (!key.vs.dynamic_input_state) { + const VkPipelineVertexInputStateCreateInfo *input_state = pCreateInfo->pVertexInputState; + const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state = vk_find_struct_const( + input_state->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); - if (binding_input_rate & (1u << binding)) { - key.instance_rate_inputs |= 1u << location; - key.instance_rate_divisors[location] = instance_rate_divisors[binding]; - } - - format_desc = vk_format_description(desc->format); - first_non_void = vk_format_get_first_non_void_channel(desc->format); - - num_format = radv_translate_buffer_numformat(format_desc, first_non_void); - data_format = radv_translate_buffer_dataformat(format_desc, 
first_non_void); - - key.vertex_attribute_formats[location] = data_format | (num_format << 4); - key.vertex_attribute_bindings[location] = desc->binding; - key.vertex_attribute_offsets[location] = desc->offset; - - const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format); - unsigned attrib_align = - dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size; - - /* If desc->offset is misaligned, then the buffer offset must be too. Just - * skip updating vertex_binding_align in this case. - */ - if (desc->offset % attrib_align == 0) - key.vertex_binding_align[desc->binding] = - MAX2(key.vertex_binding_align[desc->binding], attrib_align); - - if (!uses_dynamic_stride) { - /* From the Vulkan spec 1.2.157: - * - * "If the bound pipeline state object was created - * with the - * VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT - * dynamic state enabled then pStrides[i] specifies - * the distance in bytes between two consecutive - * elements within the corresponding buffer. In this - * case the VkVertexInputBindingDescription::stride - * state from the pipeline state object is ignored." - * - * Make sure the vertex attribute stride is zero to - * avoid computing a wrong offset if it's initialized - * to something else than zero. - */ - key.vertex_attribute_strides[location] = - radv_get_attrib_stride(input_state, desc->binding); - } - - enum ac_fetch_format adjust = AC_FETCH_FORMAT_NONE; - if (pipeline->device->physical_device->rad_info.chip_class <= GFX8 && - pipeline->device->physical_device->rad_info.family != CHIP_STONEY) { - VkFormat format = input_state->pVertexAttributeDescriptions[i].format; - switch (format) { - case VK_FORMAT_A2R10G10B10_SNORM_PACK32: - case VK_FORMAT_A2B10G10R10_SNORM_PACK32: - adjust = AC_FETCH_FORMAT_SNORM; - break; - case VK_FORMAT_A2R10G10B10_SSCALED_PACK32: - case VK_FORMAT_A2B10G10R10_SSCALED_PACK32: - adjust = AC_FETCH_FORMAT_SSCALED; - break; - case VK_FORMAT_A2R10G10B10_SINT_PACK32: - case VK_FORMAT_A2B10G10R10_SINT_PACK32: - adjust = AC_FETCH_FORMAT_SINT; - break; - default: - break; + uint32_t binding_input_rate = 0; + uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; + for (unsigned i = 0; i < input_state->vertexBindingDescriptionCount; ++i) { + if (input_state->pVertexBindingDescriptions[i].inputRate) { + unsigned binding = input_state->pVertexBindingDescriptions[i].binding; + binding_input_rate |= 1u << binding; + instance_rate_divisors[binding] = 1; + } + } + if (divisor_state) { + for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) { + instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] = + divisor_state->pVertexBindingDivisors[i].divisor; } } - key.vertex_alpha_adjust[location] = adjust; - switch (desc->format) { - case VK_FORMAT_B8G8R8A8_UNORM: - case VK_FORMAT_B8G8R8A8_SNORM: - case VK_FORMAT_B8G8R8A8_USCALED: - case VK_FORMAT_B8G8R8A8_SSCALED: - case VK_FORMAT_B8G8R8A8_UINT: - case VK_FORMAT_B8G8R8A8_SINT: - case VK_FORMAT_B8G8R8A8_SRGB: - case VK_FORMAT_A2R10G10B10_UNORM_PACK32: - case VK_FORMAT_A2R10G10B10_SNORM_PACK32: - case VK_FORMAT_A2R10G10B10_USCALED_PACK32: - case VK_FORMAT_A2R10G10B10_SSCALED_PACK32: - case VK_FORMAT_A2R10G10B10_UINT_PACK32: - case VK_FORMAT_A2R10G10B10_SINT_PACK32: - key.vertex_post_shuffle |= 1 << location; - break; - default: - break; + for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) { + const VkVertexInputAttributeDescription *desc = + &input_state->pVertexAttributeDescriptions[i]; + const struct 
util_format_description *format_desc; + unsigned location = desc->location; + unsigned binding = desc->binding; + unsigned num_format, data_format; + bool post_shuffle; + + if (binding_input_rate & (1u << binding)) { + key.vs.instance_rate_inputs |= 1u << location; + key.vs.instance_rate_divisors[location] = instance_rate_divisors[binding]; + } + + format_desc = vk_format_description(desc->format); + radv_translate_vertex_format(pipeline->device->physical_device, desc->format, format_desc, + &data_format, &num_format, &post_shuffle, + &key.vs.vertex_alpha_adjust[location]); + + key.vs.vertex_attribute_formats[location] = data_format | (num_format << 4); + key.vs.vertex_attribute_bindings[location] = desc->binding; + key.vs.vertex_attribute_offsets[location] = desc->offset; + + const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format); + unsigned attrib_align = + dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size; + + /* If desc->offset is misaligned, then the buffer offset must be too. Just + * skip updating vertex_binding_align in this case. + */ + if (desc->offset % attrib_align == 0) + key.vs.vertex_binding_align[desc->binding] = + MAX2(key.vs.vertex_binding_align[desc->binding], attrib_align); + + if (!uses_dynamic_stride) { + /* From the Vulkan spec 1.2.157: + * + * "If the bound pipeline state object was created + * with the + * VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT + * dynamic state enabled then pStrides[i] specifies + * the distance in bytes between two consecutive + * elements within the corresponding buffer. In this + * case the VkVertexInputBindingDescription::stride + * state from the pipeline state object is ignored." + * + * Make sure the vertex attribute stride is zero to + * avoid computing a wrong offset if it's initialized + * to something else than zero. 
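 *
 * For the non-dynamic case, a plausible shape for the radv_get_attrib_stride
 * lookup used below is a plain walk of the binding descriptions. This
 * reimplementation is illustrative only and is not the driver's actual code:
 */

static uint32_t
lookup_attrib_stride(const VkPipelineVertexInputStateCreateInfo *input_state,
                     uint32_t binding)
{
   for (uint32_t i = 0; i < input_state->vertexBindingDescriptionCount; i++) {
      if (input_state->pVertexBindingDescriptions[i].binding == binding)
         return input_state->pVertexBindingDescriptions[i].stride;
   }
   return 0;
}

/*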
+ */ + key.vs.vertex_attribute_strides[location] = + radv_get_attrib_stride(input_state, desc->binding); + } + + if (post_shuffle) + key.vs.vertex_post_shuffle |= 1 << location; } } const VkPipelineTessellationStateCreateInfo *tess = radv_pipeline_get_tessellation_state(pCreateInfo); if (tess) - key.tess_input_vertices = tess->patchControlPoints; + key.tcs.tess_input_vertices = tess->patchControlPoints; const VkPipelineMultisampleStateCreateInfo *vkms = radv_pipeline_get_multisample_state(pCreateInfo); if (vkms && vkms->rasterizationSamples > 1) { uint32_t num_samples = vkms->rasterizationSamples; uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo); - key.num_samples = num_samples; - key.log2_ps_iter_samples = util_logbase2(ps_iter_samples); + key.ps.num_samples = num_samples; + key.ps.log2_ps_iter_samples = util_logbase2(ps_iter_samples); } - key.col_format = blend->spi_shader_col_format; + key.ps.col_format = blend->spi_shader_col_format; if (pipeline->device->physical_device->rad_info.chip_class < GFX8) { - key.is_int8 = blend->col_format_is_int8; - key.is_int10 = blend->col_format_is_int10; + key.ps.is_int8 = blend->col_format_is_int8; + key.ps.is_int10 = blend->col_format_is_int10; } if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { - key.topology = pCreateInfo->pInputAssemblyState->topology; + key.vs.topology = pCreateInfo->pInputAssemblyState->topology; const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState; const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info = @@ -2686,60 +2735,127 @@ radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT); if (provoking_vtx_info && provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) { - key.provoking_vtx_last = true; + key.vs.provoking_vtx_last = true; } } + + if (pipeline->device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE) + key.ps.lower_discard_to_demote = true; + + if (pipeline->device->instance->enable_mrt_output_nan_fixup) + key.ps.enable_mrt_output_nan_fixup = true; + + key.ps.force_vrs = pipeline->device->force_vrs; + + if (pipeline->device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM) + key.invariant_geom = true; + + key.use_ngg = pipeline->device->physical_device->use_ngg; + return key; } -static bool -radv_nir_stage_uses_xfb(const nir_shader *nir) +static uint8_t +radv_get_wave_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage, + gl_shader_stage stage, const struct radv_shader_info *info) { - nir_xfb_info *xfb = nir_gather_xfb_info(nir, NULL); - bool uses_xfb = !!xfb; + if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg) + return 64; + else if (stage == MESA_SHADER_COMPUTE) { + return info->cs.subgroup_size; + } else if (stage == MESA_SHADER_FRAGMENT) + return device->physical_device->ps_wave_size; + else + return device->physical_device->ge_wave_size; +} - ralloc_free(xfb); - return uses_xfb; +static uint8_t +radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage, + gl_shader_stage stage, const struct radv_shader_info *info) +{ + if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size) + return info->cs.subgroup_size; + return 64; } static void -radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key *keys, - const struct radv_pipeline_key *key, nir_shader **nir) +radv_determine_ngg_settings(struct 
radv_pipeline *pipeline, + const struct radv_pipeline_key *pipeline_key, + struct radv_shader_info *infos, nir_shader **nir) { - keys[MESA_SHADER_VERTEX].vs.instance_rate_inputs = key->instance_rate_inputs; - keys[MESA_SHADER_VERTEX].vs.post_shuffle = key->vertex_post_shuffle; - for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; ++i) { - keys[MESA_SHADER_VERTEX].vs.instance_rate_divisors[i] = key->instance_rate_divisors[i]; - keys[MESA_SHADER_VERTEX].vs.vertex_attribute_formats[i] = key->vertex_attribute_formats[i]; - keys[MESA_SHADER_VERTEX].vs.vertex_attribute_bindings[i] = key->vertex_attribute_bindings[i]; - keys[MESA_SHADER_VERTEX].vs.vertex_attribute_offsets[i] = key->vertex_attribute_offsets[i]; - keys[MESA_SHADER_VERTEX].vs.vertex_attribute_strides[i] = key->vertex_attribute_strides[i]; - keys[MESA_SHADER_VERTEX].vs.alpha_adjust[i] = key->vertex_alpha_adjust[i]; + struct radv_device *device = pipeline->device; + + if (!nir[MESA_SHADER_GEOMETRY] && pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE) { + uint64_t ps_inputs_read = + nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0; + gl_shader_stage es_stage = pipeline->graphics.last_vgt_api_stage; + + unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1; + if (es_stage == MESA_SHADER_TESS_EVAL) + num_vertices_per_prim = nir[es_stage]->info.tess.point_mode ? 1 + : nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2 + : 3; + + infos[es_stage].has_ngg_culling = radv_consider_culling( + device, nir[es_stage], ps_inputs_read, num_vertices_per_prim, &infos[es_stage]); + + nir_function_impl *impl = nir_shader_get_entrypoint(nir[es_stage]); + infos[es_stage].has_ngg_early_prim_export = exec_list_is_singular(&impl->body); + + /* Invocations that process an input vertex */ + const struct gfx10_ngg_info *ngg_info = &infos[es_stage].ngg_info; + unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ? ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims); + + unsigned lds_bytes_if_culling_off = 0; + /* We need LDS space when VS needs to export the primitive ID. */ + if (es_stage == MESA_SHADER_VERTEX && infos[es_stage].vs.outinfo.export_prim_id) + lds_bytes_if_culling_off = max_vtx_in * 4u; + infos[es_stage].num_lds_blocks_when_not_culling = + DIV_ROUND_UP(lds_bytes_if_culling_off, + device->physical_device->rad_info.lds_encode_granularity); + + /* NGG passthrough mode should be disabled when culling and when the vertex shader exports the + * primitive ID. 
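 *
 * Restated as a predicate (hypothetical helper; the assignment below encodes
 * the same rule inline): passthrough survives only if the stage was already
 * passthrough-eligible, NGG culling is not in use, and a vertex shader is
 * not exporting the primitive ID.
 */

static bool
ngg_passthrough_still_allowed(bool was_passthrough, bool has_ngg_culling,
                              bool vs_exports_prim_id)
{
   return was_passthrough && !has_ngg_culling && !vs_exports_prim_id;
}

/*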
+ */ + infos[es_stage].is_ngg_passthrough = infos[es_stage].is_ngg_passthrough && + !infos[es_stage].has_ngg_culling && + !(es_stage == MESA_SHADER_VERTEX && + infos[es_stage].vs.outinfo.export_prim_id); + } +} + +static void +radv_fill_shader_info(struct radv_pipeline *pipeline, + struct radv_pipeline_layout *pipeline_layout, + const VkPipelineShaderStageCreateInfo **pStages, + const struct radv_pipeline_key *pipeline_key, + struct radv_shader_info *infos, nir_shader **nir) +{ + struct radv_device *device = pipeline->device; + unsigned active_stages = 0; + unsigned filled_stages = 0; + + for (int i = 0; i < MESA_SHADER_STAGES; i++) { + if (nir[i]) + active_stages |= (1 << i); } - for (unsigned i = 0; i < MAX_VBS; ++i) - keys[MESA_SHADER_VERTEX].vs.vertex_binding_align[i] = key->vertex_binding_align[i]; - keys[MESA_SHADER_VERTEX].vs.outprim = si_conv_prim_to_gs_out(key->topology); - keys[MESA_SHADER_VERTEX].vs.provoking_vtx_last = key->provoking_vtx_last; if (nir[MESA_SHADER_TESS_CTRL]) { - keys[MESA_SHADER_VERTEX].vs_common_out.as_ls = true; - keys[MESA_SHADER_TESS_CTRL].tcs.input_vertices = key->tess_input_vertices; - keys[MESA_SHADER_TESS_CTRL].tcs.primitive_mode = - nir[MESA_SHADER_TESS_EVAL]->info.tess.primitive_mode; + infos[MESA_SHADER_VERTEX].vs.as_ls = true; } if (nir[MESA_SHADER_GEOMETRY]) { if (nir[MESA_SHADER_TESS_CTRL]) - keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_es = true; + infos[MESA_SHADER_TESS_EVAL].tes.as_es = true; else - keys[MESA_SHADER_VERTEX].vs_common_out.as_es = true; + infos[MESA_SHADER_VERTEX].vs.as_es = true; } - if (device->physical_device->use_ngg) { + if (pipeline_key->use_ngg) { if (nir[MESA_SHADER_TESS_CTRL]) { - keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = true; + infos[MESA_SHADER_TESS_EVAL].is_ngg = true; } else { - keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = true; + infos[MESA_SHADER_VERTEX].is_ngg = true; } if (nir[MESA_SHADER_TESS_CTRL] && nir[MESA_SHADER_GEOMETRY] && @@ -2751,7 +2867,7 @@ radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key * EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it * might hang. */ - keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false; + infos[MESA_SHADER_TESS_EVAL].is_ngg = false; } gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX; @@ -2765,9 +2881,9 @@ radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key if (!device->physical_device->use_ngg_streamout && uses_xfb) { if (nir[MESA_SHADER_TESS_CTRL]) - keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg = false; + infos[MESA_SHADER_TESS_EVAL].is_ngg = false; else - keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg = false; + infos[MESA_SHADER_VERTEX].is_ngg = false; } /* Determine if the pipeline is eligible for the NGG passthrough @@ -2776,27 +2892,99 @@ radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key * (this is checked later because we don't have the info here.) 
*/ if (!nir[MESA_SHADER_GEOMETRY] && !uses_xfb) { - if (nir[MESA_SHADER_TESS_CTRL] && keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg) { - keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg_passthrough = true; - } else if (nir[MESA_SHADER_VERTEX] && keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) { - keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = true; + if (nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_TESS_EVAL].is_ngg) { + infos[MESA_SHADER_TESS_EVAL].is_ngg_passthrough = true; + } else if (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) { + infos[MESA_SHADER_VERTEX].is_ngg_passthrough = true; } } } - for (int i = 0; i < MESA_SHADER_STAGES; ++i) - keys[i].has_multiview_view_index = key->has_multiview_view_index; + if (nir[MESA_SHADER_FRAGMENT]) { + radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]); + radv_nir_shader_info_pass(pipeline->device, nir[MESA_SHADER_FRAGMENT], pipeline_layout, + pipeline_key, &infos[MESA_SHADER_FRAGMENT]); - keys[MESA_SHADER_FRAGMENT].fs.col_format = key->col_format; - keys[MESA_SHADER_FRAGMENT].fs.is_int8 = key->is_int8; - keys[MESA_SHADER_FRAGMENT].fs.is_int10 = key->is_int10; - keys[MESA_SHADER_FRAGMENT].fs.log2_ps_iter_samples = key->log2_ps_iter_samples; - keys[MESA_SHADER_FRAGMENT].fs.num_samples = key->num_samples; + assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE); + if (infos[MESA_SHADER_FRAGMENT].ps.prim_id_input) { + if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) { + infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id = true; + } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) { + infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_prim_id = true; + } else { + assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY); + } + } + + if (!!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls) { + if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) { + infos[MESA_SHADER_VERTEX].vs.outinfo.export_clip_dists = true; + } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) { + infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_clip_dists = true; + } else { + assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY); + infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists = true; + } + } + + filled_stages |= (1 << MESA_SHADER_FRAGMENT); + } + + if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && + nir[MESA_SHADER_TESS_CTRL]) { + struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]}; + + radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]); + + /* Copy data to merged stage. */ + infos[MESA_SHADER_TESS_CTRL].vs.as_ls = true; + + for (int i = 0; i < 2; i++) { + radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key, + &infos[MESA_SHADER_TESS_CTRL]); + } + + filled_stages |= (1 << MESA_SHADER_VERTEX); + filled_stages |= (1 << MESA_SHADER_TESS_CTRL); + } + + if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && + nir[MESA_SHADER_GEOMETRY]) { + gl_shader_stage pre_stage = + nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; + struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]}; + + radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]); + + /* Copy data to merged stage. 
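+ *
+ * On GFX9+ the pre-GS stage (VS or TES) and the GS execute as a single
+ * merged HW stage, so one radv_shader_info is accumulated for both NIR
+ * shaders. Flags already decided per API stage (as_es, is_ngg) must be
+ * seeded into the merged info first, because the info pass does not
+ * derive them from NIR; schematically:
+ *
+ *    infos[GS].is_ngg = infos[pre_stage].is_ngg;
+ *    for (int i = 0; i < 2; i++)
+ *       radv_nir_shader_info_pass(device, combined_nir[i], layout, key,
+ *                                 &infos[GS]);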
*/ + if (pre_stage == MESA_SHADER_VERTEX) { + infos[MESA_SHADER_GEOMETRY].vs.as_es = infos[MESA_SHADER_VERTEX].vs.as_es; + } else { + infos[MESA_SHADER_GEOMETRY].tes.as_es = infos[MESA_SHADER_TESS_EVAL].tes.as_es; + } + infos[MESA_SHADER_GEOMETRY].is_ngg = infos[pre_stage].is_ngg; + infos[MESA_SHADER_GEOMETRY].gs.es_type = pre_stage; + + for (int i = 0; i < 2; i++) { + radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key, + &infos[MESA_SHADER_GEOMETRY]); + } + + filled_stages |= (1 << pre_stage); + filled_stages |= (1 << MESA_SHADER_GEOMETRY); + } + + active_stages ^= filled_stages; + while (active_stages) { + int i = u_bit_scan(&active_stages); + radv_nir_shader_info_init(&infos[i]); + radv_nir_shader_info_pass(pipeline->device, nir[i], pipeline_layout, pipeline_key, &infos[i]); + } if (nir[MESA_SHADER_COMPUTE]) { - unsigned subgroup_size = key->compute_subgroup_size; + unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size; unsigned req_subgroup_size = subgroup_size; - bool require_full_subgroups = key->require_full_subgroups; + bool require_full_subgroups = pipeline_key->cs.require_full_subgroups; if (!subgroup_size) subgroup_size = device->physical_device->cs_wave_size; @@ -2818,131 +3006,29 @@ radv_fill_shader_keys(struct radv_device *device, struct radv_shader_variant_key subgroup_size = RADV_SUBGROUP_SIZE; } - keys[MESA_SHADER_COMPUTE].cs.subgroup_size = subgroup_size; - } -} - -static uint8_t -radv_get_wave_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage, - gl_shader_stage stage, const struct radv_shader_variant_key *key, - const struct radv_shader_info *info) -{ - if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg) - return 64; - else if (stage == MESA_SHADER_COMPUTE) { - return key->cs.subgroup_size; - } else if (stage == MESA_SHADER_FRAGMENT) - return device->physical_device->ps_wave_size; - else - return device->physical_device->ge_wave_size; -} - -static uint8_t -radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage, - gl_shader_stage stage, const struct radv_shader_variant_key *key) -{ - if (stage == MESA_SHADER_COMPUTE && key->cs.subgroup_size) - return key->cs.subgroup_size; - return 64; -} - -static void -radv_fill_shader_info(struct radv_pipeline *pipeline, - const VkPipelineShaderStageCreateInfo **pStages, - struct radv_shader_variant_key *keys, struct radv_shader_info *infos, - nir_shader **nir) -{ - unsigned active_stages = 0; - unsigned filled_stages = 0; - - for (int i = 0; i < MESA_SHADER_STAGES; i++) { - if (nir[i]) - active_stages |= (1 << i); - } - - if (nir[MESA_SHADER_FRAGMENT]) { - radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]); - radv_nir_shader_info_pass(pipeline->device, nir[MESA_SHADER_FRAGMENT], pipeline->layout, - &keys[MESA_SHADER_FRAGMENT], &infos[MESA_SHADER_FRAGMENT]); - - /* TODO: These are no longer used as keys we should refactor this */ - keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id = - infos[MESA_SHADER_FRAGMENT].ps.prim_id_input; - keys[MESA_SHADER_VERTEX].vs_common_out.export_layer_id = - infos[MESA_SHADER_FRAGMENT].ps.layer_input; - keys[MESA_SHADER_VERTEX].vs_common_out.export_clip_dists = - !!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls; - keys[MESA_SHADER_VERTEX].vs_common_out.export_viewport_index = - infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input; - keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_prim_id = - infos[MESA_SHADER_FRAGMENT].ps.prim_id_input; - 
keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_layer_id = - infos[MESA_SHADER_FRAGMENT].ps.layer_input; - keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_clip_dists = - !!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls; - keys[MESA_SHADER_TESS_EVAL].vs_common_out.export_viewport_index = - infos[MESA_SHADER_FRAGMENT].ps.viewport_index_input; - - /* NGG passthrough mode can't be enabled for vertex shaders - * that export the primitive ID. - * - * TODO: I should really refactor the keys logic. - */ - if (nir[MESA_SHADER_VERTEX] && keys[MESA_SHADER_VERTEX].vs_common_out.export_prim_id) { - keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg_passthrough = false; - } - - filled_stages |= (1 << MESA_SHADER_FRAGMENT); - } - - if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && - nir[MESA_SHADER_TESS_CTRL]) { - struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]}; - struct radv_shader_variant_key key = keys[MESA_SHADER_TESS_CTRL]; - key.tcs.vs_key = keys[MESA_SHADER_VERTEX].vs; - - radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]); - - for (int i = 0; i < 2; i++) { - radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline->layout, &key, - &infos[MESA_SHADER_TESS_CTRL]); - } - - filled_stages |= (1 << MESA_SHADER_VERTEX); - filled_stages |= (1 << MESA_SHADER_TESS_CTRL); - } - - if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 && - nir[MESA_SHADER_GEOMETRY]) { - gl_shader_stage pre_stage = - nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; - struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]}; - - radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]); - - for (int i = 0; i < 2; i++) { - radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline->layout, - &keys[pre_stage], &infos[MESA_SHADER_GEOMETRY]); - } - - filled_stages |= (1 << pre_stage); - filled_stages |= (1 << MESA_SHADER_GEOMETRY); - } - - active_stages ^= filled_stages; - while (active_stages) { - int i = u_bit_scan(&active_stages); - radv_nir_shader_info_init(&infos[i]); - radv_nir_shader_info_pass(pipeline->device, nir[i], pipeline->layout, &keys[i], &infos[i]); + infos[MESA_SHADER_COMPUTE].cs.subgroup_size = subgroup_size; } for (int i = 0; i < MESA_SHADER_STAGES; i++) { if (nir[i]) { - infos[i].wave_size = radv_get_wave_size(pipeline->device, pStages[i], i, &keys[i], &infos[i]); + infos[i].wave_size = radv_get_wave_size(pipeline->device, pStages[i], i, &infos[i]); infos[i].ballot_bit_size = - radv_get_ballot_bit_size(pipeline->device, pStages[i], i, &keys[i]); + radv_get_ballot_bit_size(pipeline->device, pStages[i], i, &infos[i]); } } + + /* PS always operates without workgroups. */ + if (nir[MESA_SHADER_FRAGMENT]) + infos[MESA_SHADER_FRAGMENT].workgroup_size = infos[MESA_SHADER_FRAGMENT].wave_size; + + if (nir[MESA_SHADER_COMPUTE]) { + /* Variable workgroup size is not supported by Vulkan. 
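+ *
+ * With a fixed local size, the workgroup size is simply the product of
+ * the three dimensions; a sketch of what ac_compute_cs_workgroup_size()
+ * computes here (variable sizes disabled, no cap applied):
+ *
+ *    const uint16_t *ws = nir[MESA_SHADER_COMPUTE]->info.workgroup_size;
+ *    uint32_t workgroup_size = ws[0] * ws[1] * ws[2];
+ *
+ * e.g. a local size of 8x8x1 yields 64 invocations per workgroup.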
*/ + assert(!nir[MESA_SHADER_COMPUTE]->info.workgroup_size_variable); + + infos[MESA_SHADER_COMPUTE].workgroup_size = + ac_compute_cs_workgroup_size( + nir[MESA_SHADER_COMPUTE]->info.workgroup_size, false, UINT32_MAX); + } } static void @@ -2994,9 +3080,12 @@ gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shade { merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info); + unsigned tess_in_patch_size = pipeline_key->tcs.tess_input_vertices; + unsigned tess_out_patch_size = nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out; + /* Number of tessellation patches per workgroup processed by the current pipeline. */ unsigned num_patches = get_tcs_num_patches( - pipeline_key->tess_input_vertices, nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, + tess_in_patch_size, tess_out_patch_size, infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs, infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs, device->tess_offchip_block_dw_size, @@ -3004,8 +3093,7 @@ gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shade /* LDS size used by VS+TCS for storing TCS inputs and outputs. */ unsigned tcs_lds_size = calculate_tess_lds_size( - device->physical_device->rad_info.chip_class, pipeline_key->tess_input_vertices, - nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, + device->physical_device->rad_info.chip_class, tess_in_patch_size, tess_out_patch_size, infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, num_patches, infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs, infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs); @@ -3021,6 +3109,9 @@ gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shade infos[MESA_SHADER_TESS_EVAL].num_tess_patches = num_patches; infos[MESA_SHADER_GEOMETRY].num_tess_patches = num_patches; + infos[MESA_SHADER_VERTEX].num_tess_patches = num_patches; + infos[MESA_SHADER_TESS_CTRL].tcs.tcs_vertices_out = tess_out_patch_size; + infos[MESA_SHADER_VERTEX].tcs.tcs_vertices_out = tess_out_patch_size; if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) { /* When the number of TCS input and output vertices are the same (typically 3): @@ -3034,8 +3125,7 @@ gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shade */ infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq = device->physical_device->rad_info.chip_class >= GFX9 && - pipeline_key->tess_input_vertices == - nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out && + tess_in_patch_size == tess_out_patch_size && nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode == nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode; @@ -3052,6 +3142,12 @@ gather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shade infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask = infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask; } + + for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s) + infos[s].workgroup_size = + ac_compute_lshs_workgroup_size( + device->physical_device->rad_info.chip_class, s, + num_patches, tess_in_patch_size, tess_out_patch_size); } static void @@ -3187,6 +3283,9 @@ lower_bit_size_callback(const nir_instr *instr, void *_) case nir_op_uadd_sat: return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32 : 0; + case nir_op_iadd_sat: + return bit_size == 8 || !nir_dest_is_divergent(alu->dest.dest) ? 
32 : 0; + default: return 0; } @@ -3258,10 +3357,11 @@ non_uniform_access_callback(const nir_src *src, void *_) } VkResult -radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, - struct radv_pipeline_cache *cache, const struct radv_pipeline_key *pipeline_key, +radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout, + struct radv_device *device, struct radv_pipeline_cache *cache, + const struct radv_pipeline_key *pipeline_key, const VkPipelineShaderStageCreateInfo **pStages, - const VkPipelineCreateFlags flags, + const VkPipelineCreateFlags flags, const uint8_t *custom_hash, VkPipelineCreationFeedbackEXT *pipeline_feedback, VkPipelineCreationFeedbackEXT **stage_feedbacks) { @@ -3271,7 +3371,6 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, }; nir_shader *nir[MESA_SHADER_STAGES] = {0}; struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL}; - struct radv_shader_variant_key keys[MESA_SHADER_STAGES] = {{{{{0}}}}}; struct radv_shader_info infos[MESA_SHADER_STAGES] = {0}; unsigned char hash[20], gs_copy_hash[20]; bool keep_executable_info = @@ -3280,7 +3379,9 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) || (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) || device->keep_shader_info; - bool disable_optimizations = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT; + struct radv_pipeline_shader_stack_size **stack_sizes = + pipeline->type == RADV_PIPELINE_COMPUTE ? &pipeline->compute.rt_stack_sizes : NULL; + uint32_t *num_stack_sizes = stack_sizes ? &pipeline->compute.group_count : NULL; radv_start_feedback(pipeline_feedback); @@ -3297,8 +3398,12 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, } } - radv_hash_shaders(hash, pStages, pipeline->layout, pipeline_key, - get_hash_flags(device, keep_statistic_info)); + if (custom_hash) + memcpy(hash, custom_hash, 20); + else { + radv_hash_shaders(hash, pStages, pipeline_layout, pipeline_key, + radv_get_hash_flags(device, keep_statistic_info)); + } memcpy(gs_copy_hash, hash, 20); gs_copy_hash[0] ^= 1; @@ -3307,15 +3412,17 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, bool found_in_application_cache = true; if (modules[MESA_SHADER_GEOMETRY] && !keep_executable_info) { struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0}; - radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants, - &found_in_application_cache); + radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants, NULL, + NULL, &found_in_application_cache); pipeline->gs_copy_shader = variants[MESA_SHADER_GEOMETRY]; } if (!keep_executable_info && radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders, + stack_sizes, num_stack_sizes, &found_in_application_cache) && - (!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader)) { + (!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader || + pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg)) { radv_stop_feedback(pipeline_feedback, found_in_application_cache); return VK_SUCCESS; } @@ -3340,8 +3447,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, radv_start_feedback(stage_feedbacks[i]); nir[i] = radv_shader_compile_to_nir(device, modules[i], stage ? stage->pName : "main", i, - stage ? 
stage->pSpecializationInfo : NULL, flags, - pipeline->layout, pipeline_key); + stage ? stage->pSpecializationInfo : NULL, + pipeline_layout, pipeline_key); /* We don't want to alter meta shaders IR directly so clone it * first. @@ -3353,9 +3460,9 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, radv_stop_feedback(stage_feedbacks[i], false); } - bool optimize_conservatively = flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT; + bool optimize_conservatively = pipeline_key->optimisations_disabled; - radv_link_shaders(pipeline, nir, optimize_conservatively); + radv_link_shaders(pipeline, pipeline_key, nir, optimize_conservatively); radv_set_driver_locations(pipeline, nir, infos); for (int i = 0; i < MESA_SHADER_STAGES; ++i) { @@ -3371,22 +3478,16 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, } } - infos[MESA_SHADER_VERTEX].vs.as_ls = !!nir[MESA_SHADER_TESS_CTRL]; - infos[MESA_SHADER_VERTEX].vs.as_es = !!nir[MESA_SHADER_GEOMETRY] && !nir[MESA_SHADER_TESS_CTRL]; - infos[MESA_SHADER_TESS_EVAL].tes.as_es = - !!nir[MESA_SHADER_GEOMETRY] && !!nir[MESA_SHADER_TESS_CTRL]; - if (nir[MESA_SHADER_TESS_CTRL]) { nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL); gather_tess_info(device, nir, infos, pipeline_key); } - radv_fill_shader_keys(device, keys, pipeline_key, nir); - radv_fill_shader_info(pipeline, pStages, keys, infos, nir); + radv_fill_shader_info(pipeline, pipeline_layout, pStages, pipeline_key, infos, nir); - bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && keys[MESA_SHADER_VERTEX].vs_common_out.as_ngg) || - (nir[MESA_SHADER_TESS_EVAL] && keys[MESA_SHADER_TESS_EVAL].vs_common_out.as_ngg); + bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) || + (nir[MESA_SHADER_TESS_EVAL] && infos[MESA_SHADER_TESS_EVAL].is_ngg); if (pipeline_has_ngg) { struct gfx10_ngg_info *ngg_info; @@ -3403,12 +3504,21 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, struct gfx9_gs_info *gs_info = &infos[MESA_SHADER_GEOMETRY].gs_ring_info; gfx9_get_gs_info(pipeline_key, pipeline, nir, infos, gs_info); + } else { + gl_shader_stage hw_vs_api_stage = + nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; + infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size; } + radv_determine_ngg_settings(pipeline, pipeline_key, infos, nir); + for (int i = 0; i < MESA_SHADER_STAGES; ++i) { if (nir[i]) { radv_start_feedback(stage_feedbacks[i]); + /* Wave and workgroup size should already be filled. */ + assert(infos[i].wave_size && infos[i].workgroup_size); + if (!radv_use_llvm_for_stage(device, i)) { nir_lower_non_uniform_access_options options = { .types = nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access | @@ -3449,13 +3559,9 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, nir_lower_alu_to_scalar(nir[i], NULL, NULL); /* lower ALU operations */ - /* TODO: Some 64-bit tests crash inside LLVM. */ - if (!radv_use_llvm_for_stage(device, i)) - nir_lower_int64(nir[i]); + nir_lower_int64(nir[i]); - /* TODO: Implement nir_op_uadd_sat with LLVM. 
*/ - if (!radv_use_llvm_for_stage(device, i)) - nir_opt_idiv_const(nir[i], 8); + nir_opt_idiv_const(nir[i], 8); nir_lower_idiv(nir[i], &(nir_lower_idiv_options){ @@ -3470,11 +3576,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key); bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage && !radv_use_llvm_for_stage(device, i); - if (lowered_ngg) { - uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0; - bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read); - radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i], consider_culling); - } + if (lowered_ngg) + radv_lower_ngg(device, nir[i], &infos[i], pipeline_key); radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE); @@ -3516,19 +3619,20 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, struct radv_shader_binary *gs_copy_binary = NULL; if (!pipeline_has_ngg) { struct radv_shader_info info = {0}; - struct radv_shader_variant_key key = {0}; - key.has_multiview_view_index = keys[MESA_SHADER_GEOMETRY].has_multiview_view_index; + if (infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists) + info.vs.outinfo.export_clip_dists = true; - radv_nir_shader_info_pass(device, nir[MESA_SHADER_GEOMETRY], pipeline->layout, &key, + radv_nir_shader_info_pass(device, nir[MESA_SHADER_GEOMETRY], pipeline_layout, pipeline_key, &info); info.wave_size = 64; /* Wave32 not supported. */ + info.workgroup_size = 64; /* HW VS: separate waves, no workgroups */ info.ballot_bit_size = 64; pipeline->gs_copy_shader = radv_create_gs_copy_shader( device, nir[MESA_SHADER_GEOMETRY], &info, &gs_copy_binary, keep_executable_info, - keep_statistic_info, keys[MESA_SHADER_GEOMETRY].has_multiview_view_index, - disable_optimizations); + keep_statistic_info, pipeline_key->has_multiview_view_index, + pipeline_key->optimisations_disabled); } if (!keep_executable_info && pipeline->gs_copy_shader) { @@ -3538,7 +3642,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, gs_binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary; gs_variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader; - radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries); + radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries, + NULL, 0); pipeline->gs_copy_shader = gs_variants[MESA_SHADER_GEOMETRY]; } @@ -3550,9 +3655,9 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, radv_start_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT]); pipeline->shaders[MESA_SHADER_FRAGMENT] = radv_shader_variant_compile( - device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1, pipeline->layout, - keys + MESA_SHADER_FRAGMENT, infos + MESA_SHADER_FRAGMENT, keep_executable_info, - keep_statistic_info, disable_optimizations, &binaries[MESA_SHADER_FRAGMENT]); + device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1, pipeline_layout, + pipeline_key, infos + MESA_SHADER_FRAGMENT, keep_executable_info, + keep_statistic_info, &binaries[MESA_SHADER_FRAGMENT]); radv_stop_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT], false); } @@ -3561,15 +3666,13 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, if (device->physical_device->rad_info.chip_class >= GFX9 && 
modules[MESA_SHADER_TESS_CTRL]) { if (!pipeline->shaders[MESA_SHADER_TESS_CTRL]) { struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]}; - struct radv_shader_variant_key key = keys[MESA_SHADER_TESS_CTRL]; - key.tcs.vs_key = keys[MESA_SHADER_VERTEX].vs; radv_start_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL]); pipeline->shaders[MESA_SHADER_TESS_CTRL] = radv_shader_variant_compile( - device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2, pipeline->layout, &key, + device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2, pipeline_layout, pipeline_key, &infos[MESA_SHADER_TESS_CTRL], keep_executable_info, keep_statistic_info, - disable_optimizations, &binaries[MESA_SHADER_TESS_CTRL]); + &binaries[MESA_SHADER_TESS_CTRL]); radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false); } @@ -3585,9 +3688,9 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, radv_start_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY]); pipeline->shaders[MESA_SHADER_GEOMETRY] = radv_shader_variant_compile( - device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2, pipeline->layout, - &keys[pre_stage], &infos[MESA_SHADER_GEOMETRY], keep_executable_info, - keep_statistic_info, disable_optimizations, &binaries[MESA_SHADER_GEOMETRY]); + device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2, pipeline_layout, pipeline_key, + &infos[MESA_SHADER_GEOMETRY], keep_executable_info, + keep_statistic_info, &binaries[MESA_SHADER_GEOMETRY]); radv_stop_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY], false); } @@ -3599,15 +3702,17 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, radv_start_feedback(stage_feedbacks[i]); pipeline->shaders[i] = radv_shader_variant_compile( - device, modules[i], &nir[i], 1, pipeline->layout, keys + i, infos + i, - keep_executable_info, keep_statistic_info, disable_optimizations, &binaries[i]); + device, modules[i], &nir[i], 1, pipeline_layout, pipeline_key, infos + i, + keep_executable_info, keep_statistic_info, &binaries[i]); radv_stop_feedback(stage_feedbacks[i], false); } } if (!keep_executable_info) { - radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries); + radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries, + stack_sizes ? *stack_sizes : NULL, + num_stack_sizes ? *num_stack_sizes : 0); } for (int i = 0; i < MESA_SHADER_STAGES; ++i) { @@ -4194,10 +4299,6 @@ radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs, db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable); db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable); db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable); - db_render_override2 |= - S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(extra->db_depth_disable_expclear); - db_render_override2 |= - S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(extra->db_stencil_disable_expclear); } db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | @@ -4253,8 +4354,6 @@ radv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs, const VkConservativeRasterizationModeEXT mode = radv_get_conservative_raster_mode(vkraster); uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1); - radeon_set_context_reg(ctx_cs, R_028BDC_PA_SC_LINE_CNTL, S_028BDC_DX10_DIAMOND_TEST_ENA(1)); - if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { /* Conservative rasterization. 
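 *
 * (Per VK_EXT_conservative_rasterization: overestimation rasterizes
 * every pixel a primitive even partially covers, while underestimation
 * keeps only fully covered pixels; GFX9+ supports both in hardware.)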
*/ if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) { @@ -4340,7 +4439,7 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf const struct radv_pipeline *pipeline, const struct radv_shader_variant *shader) { - uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + uint64_t va = radv_shader_variant_get_va(shader); radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4); radeon_emit(cs, va >> 8); @@ -4413,7 +4512,7 @@ static void radv_pipeline_generate_hw_es(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline, const struct radv_shader_variant *shader) { - uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + uint64_t va = radv_shader_variant_get_va(shader); radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4); radeon_emit(cs, va >> 8); @@ -4427,12 +4526,10 @@ radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs, const struct radv_pipelin const struct radv_shader_variant *shader) { unsigned num_lds_blocks = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks; - uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + uint64_t va = radv_shader_variant_get_va(shader); uint32_t rsrc2 = shader->config.rsrc2; - radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B524_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks); if (pipeline->device->physical_device->rad_info.chip_class == GFX7 && @@ -4449,7 +4546,7 @@ radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf const struct radv_pipeline *pipeline, const struct radv_shader_variant *shader) { - uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + uint64_t va = radv_shader_variant_get_va(shader); gl_shader_stage es_type = radv_pipeline_has_tess(pipeline) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX; struct radv_shader_variant *es = es_type == MESA_SHADER_TESS_EVAL @@ -4457,9 +4554,8 @@ radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf : pipeline->shaders[MESA_SHADER_VERTEX]; const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info; - radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B324_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2); radeon_emit(cs, shader->config.rsrc1); radeon_emit(cs, shader->config.rsrc2); @@ -4540,22 +4636,6 @@ radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) | S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance)); - /* User edge flags are set by the pos exports. If user edge flags are - * not used, we must use hw-generated edge flags and pass them via - * the prim export to prevent drawing lines on internal edges of - * decomposed primitives (such as quads) with polygon mode = lines. - * - * TODO: We should combine hw-generated edge flags with user edge - * flags in the shader. - */ - radeon_set_context_reg( - ctx_cs, R_028838_PA_CL_NGG_CNTL, - S_028838_INDEX_BUF_EDGE_FLAG_ENA(!radv_pipeline_has_tess(pipeline) && - !radv_pipeline_has_gs(pipeline)) | - /* Reuse for NGG. */ - S_028838_VERTEX_REUSE_DEPTH( - pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 
30 : 0)); - ge_cntl = S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) | S_03096C_VERT_GRP_SIZE(ngg_state->enable_vertex_grouping ? ngg_state->hw_max_esverts : 256) | /* 256 = disable vertex grouping */ S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); @@ -4588,8 +4668,16 @@ radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64)); uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0; - if (shader->info.has_ngg_culling) - oversub_pc_lines *= 3; + if (shader->info.has_ngg_culling) { + unsigned oversub_factor = 2; + + if (outinfo->param_exports > 4) + oversub_factor = 4; + else if (outinfo->param_exports > 2) + oversub_factor = 3; + + oversub_pc_lines *= oversub_factor; + } gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines); } @@ -4598,17 +4686,13 @@ static void radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline, const struct radv_shader_variant *shader) { - uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + uint64_t va = radv_shader_variant_get_va(shader); if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { - radeon_set_sh_reg_seq(cs, R_00B520_SPI_SHADER_PGM_LO_LS, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B524_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); } else { - radeon_set_sh_reg_seq(cs, R_00B410_SPI_SHADER_PGM_LO_LS, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B414_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); } radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2); @@ -4799,17 +4883,13 @@ radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, gs_state->vgt_esgs_ring_itemsize); - va = radv_buffer_get_va(gs->bo) + gs->bo_offset; + va = radv_shader_variant_get_va(gs); if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) { if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) { - radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B324_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); } else { - radeon_set_sh_reg_seq(cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B214_MEM_BASE(va >> 40)); + radeon_set_sh_reg(cs, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); } radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2); @@ -4903,7 +4983,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, const struct radv } } - if (ps->info.ps.layer_input || ps->info.needs_multiview_view_index) { + if (ps->info.ps.layer_input) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); @@ -4915,11 +4995,8 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, const struct radv if (ps->info.ps.viewport_index_input) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VIEWPORT]; - if (vs_offset != AC_EXP_PARAM_UNDEFINED) - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); - else - 
ps_input_cntl[ps_offset] = - offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false); + assert(vs_offset != AC_EXP_PARAM_UNDEFINED); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false); ++ps_offset; } @@ -5024,7 +5101,7 @@ radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs, struct rade assert(pipeline->shaders[MESA_SHADER_FRAGMENT]); ps = pipeline->shaders[MESA_SHADER_FRAGMENT]; - va = radv_buffer_get_va(ps->bo) + ps->bo_offset; + va = radv_shader_variant_get_va(ps); radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4); radeon_emit(cs, va >> 8); @@ -5337,30 +5414,50 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline, static void radv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline, - const VkGraphicsPipelineCreateInfo *pCreateInfo) + const VkGraphicsPipelineCreateInfo *pCreateInfo, + const struct radv_pipeline_key *key) { const struct radv_shader_info *info = &radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info; - const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState; + if (!key->vs.dynamic_input_state) { + const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState; - for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { - const VkVertexInputBindingDescription *desc = &vi_info->pVertexBindingDescriptions[i]; + for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { + const VkVertexInputBindingDescription *desc = &vi_info->pVertexBindingDescriptions[i]; - pipeline->binding_stride[desc->binding] = desc->stride; - } + pipeline->binding_stride[desc->binding] = desc->stride; + } - for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { - const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; + for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { + const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i]; - uint32_t end = desc->offset + vk_format_get_blocksize(desc->format); - pipeline->attrib_ends[desc->location] = end; - if (pipeline->binding_stride[desc->binding]) - pipeline->attrib_index_offset[desc->location] = - desc->offset / pipeline->binding_stride[desc->binding]; - pipeline->attrib_bindings[desc->location] = desc->binding; + uint32_t end = desc->offset + vk_format_get_blocksize(desc->format); + pipeline->attrib_ends[desc->location] = end; + if (pipeline->binding_stride[desc->binding]) + pipeline->attrib_index_offset[desc->location] = + desc->offset / pipeline->binding_stride[desc->binding]; + pipeline->attrib_bindings[desc->location] = desc->binding; + } } pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs; - pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask; + pipeline->last_vertex_attrib_bit = util_last_bit(info->vs.vb_desc_usage_mask); + if (pipeline->shaders[MESA_SHADER_VERTEX]) + pipeline->next_vertex_stage = MESA_SHADER_VERTEX; + else if (pipeline->shaders[MESA_SHADER_TESS_CTRL]) + pipeline->next_vertex_stage = MESA_SHADER_TESS_CTRL; + else + pipeline->next_vertex_stage = MESA_SHADER_GEOMETRY; + if (pipeline->next_vertex_stage == MESA_SHADER_VERTEX) { + const struct radv_shader_variant *vs_shader = pipeline->shaders[MESA_SHADER_VERTEX]; + pipeline->can_use_simple_input = vs_shader->info.is_ngg == pipeline->device->physical_device->use_ngg && + vs_shader->info.wave_size == pipeline->device->physical_device->ge_wave_size; + } else { + pipeline->can_use_simple_input 
= false; + } + if (info->vs.dynamic_inputs) + pipeline->vb_desc_usage_mask = BITFIELD_MASK(pipeline->last_vertex_attrib_bit); + else + pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask; pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16; } @@ -5379,6 +5476,14 @@ radv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline) return NULL; } +static bool +radv_shader_need_indirect_descriptor_sets(struct radv_pipeline *pipeline, gl_shader_stage stage) +{ + struct radv_userdata_info *loc = + radv_lookup_user_sgpr(pipeline, stage, AC_UD_INDIRECT_DESCRIPTOR_SETS); + return loc->sgpr_idx != -1; +} + static void radv_pipeline_init_shader_stages_state(struct radv_pipeline *pipeline) { @@ -5390,7 +5495,7 @@ radv_pipeline_init_shader_stages_state(struct radv_pipeline *pipeline) if (pipeline->shaders[i]) { pipeline->need_indirect_descriptor_sets |= - pipeline->shaders[i]->info.need_indirect_descriptor_sets; + radv_shader_need_indirect_descriptor_sets(pipeline, i); } } @@ -5413,12 +5518,11 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, const VkGraphicsPipelineCreateInfo *pCreateInfo, const struct radv_graphics_pipeline_create_info *extra) { + RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout); VkResult result; pipeline->device = device; - pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout); pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE; - assert(pipeline->layout); struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra); @@ -5443,8 +5547,8 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, struct radv_pipeline_key key = radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend); - result = radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, - pipeline_feedback, stage_feedbacks); + result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages, + pCreateInfo->flags, NULL, pipeline_feedback, stage_feedbacks); if (result != VK_SUCCESS) return result; @@ -5505,7 +5609,7 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, pCreateInfo->pTessellationState->patchControlPoints; } - radv_pipeline_init_vertex_input_state(pipeline, pCreateInfo); + radv_pipeline_init_vertex_input_state(pipeline, pCreateInfo, &key); radv_pipeline_init_binning_state(pipeline, pCreateInfo, &blend); radv_pipeline_init_shader_stages_state(pipeline); radv_pipeline_init_scratch(device, pipeline); @@ -5518,6 +5622,9 @@ radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, pipeline->graphics.is_ngg && pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling; + pipeline->push_constant_size = pipeline_layout->push_constant_size; + pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count; + radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend); return result; @@ -5537,9 +5644,10 @@ radv_graphics_pipeline_create(VkDevice _device, VkPipelineCache _cache, pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); + pipeline->type = RADV_PIPELINE_GRAPHICS; result = radv_pipeline_init(pipeline, device, cache, pCreateInfo, 
extra); if (result != VK_SUCCESS) { @@ -5583,12 +5691,10 @@ static void radv_pipeline_generate_hw_cs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline) { struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE]; - uint64_t va = radv_buffer_get_va(shader->bo) + shader->bo_offset; + uint64_t va = radv_shader_variant_get_va(shader); struct radv_device *device = pipeline->device; - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, va >> 8); - radeon_emit(cs, S_00B834_DATA(va >> 40)); + radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8); radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); radeon_emit(cs, shader->config.rsrc1); @@ -5660,21 +5766,24 @@ radv_generate_compute_pipeline_key(struct radv_pipeline *pipeline, if (subgroup_size) { assert(subgroup_size->requiredSubgroupSize == 32 || subgroup_size->requiredSubgroupSize == 64); - key.compute_subgroup_size = subgroup_size->requiredSubgroupSize; + key.cs.compute_subgroup_size = subgroup_size->requiredSubgroupSize; } else if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) { - key.require_full_subgroups = true; + key.cs.require_full_subgroups = true; } return key; } -static VkResult +VkResult radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkComputePipelineCreateInfo *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline) + const VkAllocationCallbacks *pAllocator, const uint8_t *custom_hash, + struct radv_pipeline_shader_stack_size *rt_stack_sizes, + uint32_t rt_group_count, VkPipeline *pPipeline) { RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache); + RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout); const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = { 0, }; @@ -5684,15 +5793,18 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + if (pipeline == NULL) { + free(rt_stack_sizes); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); + pipeline->type = RADV_PIPELINE_COMPUTE; pipeline->device = device; pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE; - pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout); - assert(pipeline->layout); + pipeline->compute.rt_stack_sizes = rt_stack_sizes; + pipeline->compute.group_count = rt_group_count; const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback = vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT); @@ -5707,8 +5819,8 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo); - result = radv_create_shaders(pipeline, device, cache, &key, pStages, pCreateInfo->flags, - pipeline_feedback, stage_feedbacks); + result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages, + pCreateInfo->flags, custom_hash, pipeline_feedback, stage_feedbacks); if (result != VK_SUCCESS) { radv_pipeline_destroy(device, pipeline, pAllocator); return result; @@ -5717,9 +5829,12 @@ radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, pipeline->user_data_0[MESA_SHADER_COMPUTE] = 
radv_pipeline_stage_to_user_data_0( pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class); pipeline->need_indirect_descriptor_sets |= - pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets; + radv_shader_need_indirect_descriptor_sets(pipeline, MESA_SHADER_COMPUTE); radv_pipeline_init_scratch(device, pipeline); + pipeline->push_constant_size = pipeline_layout->push_constant_size; + pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count; + radv_compute_generate_pm4(pipeline); *pPipeline = radv_pipeline_to_handle(pipeline); @@ -5737,8 +5852,8 @@ radv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uin unsigned i = 0; for (; i < count; i++) { VkResult r; - r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, - &pPipelines[i]); + r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, NULL, + NULL, 0, &pPipelines[i]); if (r != VK_SUCCESS) { result = r; pPipelines[i] = VK_NULL_HANDLE; @@ -5949,14 +6064,6 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, } ++s; - if (s < end) { - desc_copy(s->name, "PrivMem VGPRs"); - desc_copy(s->description, "Number of VGPRs stored in private memory per subgroup"); - s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - s->value.u64 = shader->info.private_mem_vgprs; - } - ++s; - if (s < end) { desc_copy(s->name, "Code size"); desc_copy(s->description, "Code size in bytes"); @@ -6073,7 +6180,7 @@ radv_GetPipelineExecutableInternalRepresentationsKHR( ++p; /* Disassembler */ - if (p < end) { + if (p < end && shader->disasm_string) { p->isText = true; desc_copy(p->name, "Assembly"); desc_copy(p->description, "Final Assembly"); diff --git a/mesa 3D driver/src/amd/vulkan/radv_pipeline_cache.c b/mesa 3D driver/src/amd/vulkan/radv_pipeline_cache.c index f0fb1428d5..709ecbe229 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_pipeline_cache.c +++ b/mesa 3D driver/src/amd/vulkan/radv_pipeline_cache.c @@ -37,6 +37,7 @@ struct cache_entry { uint32_t sha1_dw[5]; }; uint32_t binary_sizes[MESA_SHADER_STAGES]; + uint32_t num_stack_sizes; struct radv_shader_variant *variants[MESA_SHADER_STAGES]; char code[0]; }; @@ -62,6 +63,8 @@ radv_pipeline_cache_unlock(struct radv_pipeline_cache *cache) void radv_pipeline_cache_init(struct radv_pipeline_cache *cache, struct radv_device *device) { + vk_object_base_init(&device->vk, &cache->base, VK_OBJECT_TYPE_PIPELINE_CACHE); + cache->device = device; mtx_init(&cache->mutex, mtx_plain); cache->flags = 0; @@ -95,6 +98,8 @@ radv_pipeline_cache_finish(struct radv_pipeline_cache *cache) } mtx_destroy(&cache->mutex); free(cache->hash_table); + + vk_object_base_finish(&cache->base); } static uint32_t @@ -139,6 +144,39 @@ radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **s _mesa_sha1_final(&ctx, hash); } +void +radv_hash_rt_shaders(unsigned char *hash, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + uint32_t flags) +{ + RADV_FROM_HANDLE(radv_pipeline_layout, layout, pCreateInfo->layout); + struct mesa_sha1 ctx; + + _mesa_sha1_init(&ctx); + if (layout) + _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1)); + + for (uint32_t i = 0; i < pCreateInfo->stageCount; ++i) { + RADV_FROM_HANDLE(vk_shader_module, module, pCreateInfo->pStages[i].module); + const VkSpecializationInfo *spec_info = pCreateInfo->pStages[i].pSpecializationInfo; + + _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1)); + _mesa_sha1_update(&ctx, 
pCreateInfo->pStages[i].pName, strlen(pCreateInfo->pStages[i].pName)); + if (spec_info && spec_info->mapEntryCount) { + _mesa_sha1_update(&ctx, spec_info->pMapEntries, + spec_info->mapEntryCount * sizeof spec_info->pMapEntries[0]); + _mesa_sha1_update(&ctx, spec_info->pData, spec_info->dataSize); + } + } + + _mesa_sha1_update(&ctx, pCreateInfo->pGroups, + pCreateInfo->groupCount * sizeof(*pCreateInfo->pGroups)); + + if (!radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo)) + _mesa_sha1_update(&ctx, &pCreateInfo->maxPipelineRayRecursionDepth, 4); + _mesa_sha1_update(&ctx, &flags, 4); + _mesa_sha1_final(&ctx, hash); +} + static struct cache_entry * radv_pipeline_cache_search_unlocked(struct radv_pipeline_cache *cache, const unsigned char *sha1) { @@ -209,7 +247,7 @@ radv_pipeline_cache_grow(struct radv_pipeline_cache *cache) table = malloc(byte_size); if (table == NULL) - return vk_error(cache->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(cache, VK_ERROR_OUT_OF_HOST_MEMORY); cache->hash_table = table; cache->table_size = table_size; @@ -253,11 +291,10 @@ radv_is_cache_disabled(struct radv_device *device) } bool -radv_create_shader_variants_from_pipeline_cache(struct radv_device *device, - struct radv_pipeline_cache *cache, - const unsigned char *sha1, - struct radv_shader_variant **variants, - bool *found_in_application_cache) +radv_create_shader_variants_from_pipeline_cache( + struct radv_device *device, struct radv_pipeline_cache *cache, const unsigned char *sha1, + struct radv_shader_variant **variants, struct radv_pipeline_shader_stack_size **stack_sizes, + uint32_t *num_stack_sizes, bool *found_in_application_cache) { struct cache_entry *entry; @@ -316,7 +353,7 @@ radv_create_shader_variants_from_pipeline_cache(struct radv_device *device, memcpy(binary, p, entry->binary_sizes[i]); p += entry->binary_sizes[i]; - entry->variants[i] = radv_shader_variant_create(device, binary, false); + entry->variants[i] = radv_shader_variant_create(device, binary, false, true); free(binary); } else if (entry->binary_sizes[i]) { p += entry->binary_sizes[i]; @@ -325,6 +362,14 @@ radv_create_shader_variants_from_pipeline_cache(struct radv_device *device, memcpy(variants, entry->variants, sizeof(entry->variants)); + if (num_stack_sizes) { + *num_stack_sizes = entry->num_stack_sizes; + if (entry->num_stack_sizes) { + *stack_sizes = malloc(entry->num_stack_sizes * sizeof(**stack_sizes)); + memcpy(*stack_sizes, p, entry->num_stack_sizes * sizeof(**stack_sizes)); + } + } + if (device->instance->debug_flags & RADV_DEBUG_NO_MEMORY_CACHE && cache == device->mem_cache) vk_free(&cache->alloc, entry); else { @@ -340,7 +385,9 @@ radv_create_shader_variants_from_pipeline_cache(struct radv_device *device, void radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipeline_cache *cache, const unsigned char *sha1, struct radv_shader_variant **variants, - struct radv_shader_binary *const *binaries) + struct radv_shader_binary *const *binaries, + const struct radv_pipeline_shader_stack_size *stack_sizes, + uint32_t num_stack_sizes) { if (!cache) cache = device->mem_cache; @@ -370,7 +417,7 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel return; } - size_t size = sizeof(*entry); + size_t size = sizeof(*entry) + sizeof(*stack_sizes) * num_stack_sizes; for (int i = 0; i < MESA_SHADER_STAGES; ++i) if (variants[i]) size += binaries[i]->total_size; @@ -398,6 +445,12 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel p += 
binaries[i]->total_size; } + if (num_stack_sizes) { + memcpy(p, stack_sizes, sizeof(*stack_sizes) * num_stack_sizes); + p += sizeof(*stack_sizes) * num_stack_sizes; + } + entry->num_stack_sizes = num_stack_sizes; + // Make valgrind happy by filling the alignment hole at the end. assert(p == (char *)entry + size_without_align); assert(sizeof(*entry) + (p - entry->code) == size_without_align); @@ -406,8 +459,10 @@ radv_pipeline_cache_insert_shaders(struct radv_device *device, struct radv_pipel /* Always add cache items to disk. This will allow collection of * compiled shaders by third parties such as steam, even if the app * implements its own pipeline cache. + * + * Make sure to exclude meta shaders because they are stored in a different cache file. */ - if (device->physical_device->disk_cache) { + if (device->physical_device->disk_cache && cache != &device->meta_state.cache) { uint8_t disk_sha1[20]; disk_cache_compute_key(device->physical_device->disk_cache, sha1, 20, disk_sha1); @@ -494,9 +549,7 @@ radv_CreatePipelineCache(VkDevice _device, const VkPipelineCacheCreateInfo *pCre cache = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*cache), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cache == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - - vk_object_base_init(&device->vk, &cache->base, VK_OBJECT_TYPE_PIPELINE_CACHE); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (pAllocator) cache->alloc = *pAllocator; @@ -524,9 +577,8 @@ radv_DestroyPipelineCache(VkDevice _device, VkPipelineCache _cache, if (!cache) return; - radv_pipeline_cache_finish(cache); - vk_object_base_finish(&cache->base); + radv_pipeline_cache_finish(cache); vk_free2(&device->vk.alloc, pAllocator, cache); } diff --git a/mesa 3D driver/src/amd/vulkan/radv_pipeline_rt.c b/mesa 3D driver/src/amd/vulkan/radv_pipeline_rt.c new file mode 100644 index 0000000000..bb9daf5caf --- /dev/null +++ b/mesa 3D driver/src/amd/vulkan/radv_pipeline_rt.c @@ -0,0 +1,2354 @@ +/* + * Copyright © 2021 Google + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "radv_acceleration_structure.h" +#include "radv_debug.h" +#include "radv_private.h" +#include "radv_shader.h" + +#include "nir/nir.h" +#include "nir/nir_builder.h" +#include "nir/nir_builtin_builder.h" + +static VkRayTracingPipelineCreateInfoKHR +radv_create_merged_rt_create_info(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo) +{ + VkRayTracingPipelineCreateInfoKHR local_create_info = *pCreateInfo; + uint32_t total_stages = pCreateInfo->stageCount; + uint32_t total_groups = pCreateInfo->groupCount; + + if (pCreateInfo->pLibraryInfo) { + for (unsigned i = 0; i < pCreateInfo->pLibraryInfo->libraryCount; ++i) { + RADV_FROM_HANDLE(radv_pipeline, library, pCreateInfo->pLibraryInfo->pLibraries[i]); + total_stages += library->library.stage_count; + total_groups += library->library.group_count; + } + } + VkPipelineShaderStageCreateInfo *stages = NULL; + VkRayTracingShaderGroupCreateInfoKHR *groups = NULL; + local_create_info.stageCount = total_stages; + local_create_info.groupCount = total_groups; + local_create_info.pStages = stages = + malloc(sizeof(VkPipelineShaderStageCreateInfo) * total_stages); + local_create_info.pGroups = groups = + malloc(sizeof(VkRayTracingShaderGroupCreateInfoKHR) * total_groups); + if (!local_create_info.pStages || !local_create_info.pGroups) + return local_create_info; + + total_stages = pCreateInfo->stageCount; + total_groups = pCreateInfo->groupCount; + for (unsigned j = 0; j < pCreateInfo->stageCount; ++j) + stages[j] = pCreateInfo->pStages[j]; + for (unsigned j = 0; j < pCreateInfo->groupCount; ++j) + groups[j] = pCreateInfo->pGroups[j]; + + if (pCreateInfo->pLibraryInfo) { + for (unsigned i = 0; i < pCreateInfo->pLibraryInfo->libraryCount; ++i) { + RADV_FROM_HANDLE(radv_pipeline, library, pCreateInfo->pLibraryInfo->pLibraries[i]); + for (unsigned j = 0; j < library->library.stage_count; ++j) + stages[total_stages + j] = library->library.stages[j]; + for (unsigned j = 0; j < library->library.group_count; ++j) { + VkRayTracingShaderGroupCreateInfoKHR *dst = &groups[total_groups + j]; + *dst = library->library.groups[j]; + if (dst->generalShader != VK_SHADER_UNUSED_KHR) + dst->generalShader += total_stages; + if (dst->closestHitShader != VK_SHADER_UNUSED_KHR) + dst->closestHitShader += total_stages; + if (dst->anyHitShader != VK_SHADER_UNUSED_KHR) + dst->anyHitShader += total_stages; + if (dst->intersectionShader != VK_SHADER_UNUSED_KHR) + dst->intersectionShader += total_stages; + } + total_stages += library->library.stage_count; + total_groups += library->library.group_count; + } + } + return local_create_info; +} + +static VkResult +radv_rt_pipeline_library_create(VkDevice _device, VkPipelineCache _cache, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + struct radv_pipeline *pipeline; + + pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (pipeline == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE); + pipeline->type = RADV_PIPELINE_LIBRARY; + + VkRayTracingPipelineCreateInfoKHR local_create_info = + radv_create_merged_rt_create_info(pCreateInfo); + if (!local_create_info.pStages || !local_create_info.pGroups) + goto fail; + + if (local_create_info.stageCount) { + size_t size = sizeof(VkPipelineShaderStageCreateInfo) * local_create_info.stageCount; + 
pipeline->library.stage_count = local_create_info.stageCount;
+ pipeline->library.stages = malloc(size);
+ if (!pipeline->library.stages)
+ goto fail;
+ memcpy(pipeline->library.stages, local_create_info.pStages, size);
+ }
+
+ if (local_create_info.groupCount) {
+ size_t size = sizeof(VkRayTracingShaderGroupCreateInfoKHR) * local_create_info.groupCount;
+ pipeline->library.group_count = local_create_info.groupCount;
+ pipeline->library.groups = malloc(size);
+ if (!pipeline->library.groups)
+ goto fail;
+ memcpy(pipeline->library.groups, local_create_info.pGroups, size);
+ }
+
+ *pPipeline = radv_pipeline_to_handle(pipeline);
+
+ free((void *)local_create_info.pGroups);
+ free((void *)local_create_info.pStages);
+ return VK_SUCCESS;
+fail:
+ free(pipeline->library.groups);
+ free(pipeline->library.stages);
+ free((void *)local_create_info.pGroups);
+ free((void *)local_create_info.pStages);
+ vk_object_base_finish(&pipeline->base);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+}
+
+/*
+ * Global variables for an RT pipeline
+ */
+struct rt_variables {
+ /* idx of the next shader to run in the next iteration of the main loop */
+ nir_variable *idx;
+
+ /* scratch offset of the argument area relative to stack_ptr */
+ nir_variable *arg;
+
+ nir_variable *stack_ptr;
+
+ /* global address of the SBT entry used for the shader */
+ nir_variable *shader_record_ptr;
+
+ /* trace_ray arguments */
+ nir_variable *accel_struct;
+ nir_variable *flags;
+ nir_variable *cull_mask;
+ nir_variable *sbt_offset;
+ nir_variable *sbt_stride;
+ nir_variable *miss_index;
+ nir_variable *origin;
+ nir_variable *tmin;
+ nir_variable *direction;
+ nir_variable *tmax;
+
+ /* from the BLAS instance currently being visited */
+ nir_variable *custom_instance_and_mask;
+
+ /* Properties of the primitive currently being visited. */
+ nir_variable *primitive_id;
+ nir_variable *geometry_id_and_flags;
+ nir_variable *instance_id;
+ nir_variable *instance_addr;
+ nir_variable *hit_kind;
+ nir_variable *opaque;
+
+ /* Safeguard to ensure we don't end up in an infinite loop if a non-existent case is
+ * selected. Should not be needed but is extra anti-hang safety during bring-up. */
+ nir_variable *main_loop_case_visited;
+
+ /* Output variable for intersection & anyhit shaders. */
+ nir_variable *ahit_status;
+
+ /* Array of stack size structs for recording the max stack size for each group.
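+ * One entry per shader group, indexed by group_idx below; the recursive and
+ * non-recursive maxima get updated as the group's shaders are inlined.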
*/
+ struct radv_pipeline_shader_stack_size *stack_sizes;
+ unsigned group_idx;
+};
+
+static struct rt_variables
+create_rt_variables(nir_shader *shader, struct radv_pipeline_shader_stack_size *stack_sizes)
+{
+ struct rt_variables vars = {
+ NULL,
+ };
+ vars.idx = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "idx");
+ vars.arg = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "arg");
+ vars.stack_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "stack_ptr");
+ vars.shader_record_ptr =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "shader_record_ptr");
+
+ const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
+ vars.accel_struct =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "accel_struct");
+ vars.flags = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "ray_flags");
+ vars.cull_mask = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "cull_mask");
+ vars.sbt_offset =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "sbt_offset");
+ vars.sbt_stride =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "sbt_stride");
+ vars.miss_index =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "miss_index");
+ vars.origin = nir_variable_create(shader, nir_var_shader_temp, vec3_type, "ray_origin");
+ vars.tmin = nir_variable_create(shader, nir_var_shader_temp, glsl_float_type(), "ray_tmin");
+ vars.direction = nir_variable_create(shader, nir_var_shader_temp, vec3_type, "ray_direction");
+ vars.tmax = nir_variable_create(shader, nir_var_shader_temp, glsl_float_type(), "ray_tmax");
+
+ vars.custom_instance_and_mask = nir_variable_create(
+ shader, nir_var_shader_temp, glsl_uint_type(), "custom_instance_and_mask");
+ vars.primitive_id =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "primitive_id");
+ vars.geometry_id_and_flags =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "geometry_id_and_flags");
+ vars.instance_id =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "instance_id");
+ vars.instance_addr =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "instance_addr");
+ vars.hit_kind = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "hit_kind");
+ vars.opaque = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "opaque");
+
+ vars.main_loop_case_visited =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "main_loop_case_visited");
+ vars.ahit_status =
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "ahit_status");
+
+ vars.stack_sizes = stack_sizes;
+ return vars;
+}
+
+/*
+ * Remap all the variables between the two rt_variables structs for inlining.
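+ * The resulting remap table is consumed by nir_inline_function_impl, which
+ * rewrites each use of a source variable to the corresponding destination variable.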
+ */
+static void
+map_rt_variables(struct hash_table *var_remap, struct rt_variables *src,
+ const struct rt_variables *dst)
+{
+ _mesa_hash_table_insert(var_remap, src->idx, dst->idx);
+ _mesa_hash_table_insert(var_remap, src->arg, dst->arg);
+ _mesa_hash_table_insert(var_remap, src->stack_ptr, dst->stack_ptr);
+ _mesa_hash_table_insert(var_remap, src->shader_record_ptr, dst->shader_record_ptr);
+
+ _mesa_hash_table_insert(var_remap, src->accel_struct, dst->accel_struct);
+ _mesa_hash_table_insert(var_remap, src->flags, dst->flags);
+ _mesa_hash_table_insert(var_remap, src->cull_mask, dst->cull_mask);
+ _mesa_hash_table_insert(var_remap, src->sbt_offset, dst->sbt_offset);
+ _mesa_hash_table_insert(var_remap, src->sbt_stride, dst->sbt_stride);
+ _mesa_hash_table_insert(var_remap, src->miss_index, dst->miss_index);
+ _mesa_hash_table_insert(var_remap, src->origin, dst->origin);
+ _mesa_hash_table_insert(var_remap, src->tmin, dst->tmin);
+ _mesa_hash_table_insert(var_remap, src->direction, dst->direction);
+ _mesa_hash_table_insert(var_remap, src->tmax, dst->tmax);
+
+ _mesa_hash_table_insert(var_remap, src->custom_instance_and_mask, dst->custom_instance_and_mask);
+ _mesa_hash_table_insert(var_remap, src->primitive_id, dst->primitive_id);
+ _mesa_hash_table_insert(var_remap, src->geometry_id_and_flags, dst->geometry_id_and_flags);
+ _mesa_hash_table_insert(var_remap, src->instance_id, dst->instance_id);
+ _mesa_hash_table_insert(var_remap, src->instance_addr, dst->instance_addr);
+ _mesa_hash_table_insert(var_remap, src->hit_kind, dst->hit_kind);
+ _mesa_hash_table_insert(var_remap, src->opaque, dst->opaque);
+ _mesa_hash_table_insert(var_remap, src->ahit_status, dst->ahit_status);
+
+ src->stack_sizes = dst->stack_sizes;
+ src->group_idx = dst->group_idx;
+}
+
+/*
+ * Create a copy of the global rt variables where the primitive/instance related variables are
+ * independent. This is needed as we need to keep the old values of the global variables around
+ * in case e.g. an anyhit shader rejects the collision. So there are inner variables that get copied
+ * to the outer variables once we commit to a better hit.
+ */
+static struct rt_variables
+create_inner_vars(nir_builder *b, const struct rt_variables *vars)
+{
+ struct rt_variables inner_vars = *vars;
+ inner_vars.idx =
+ nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_idx");
+ inner_vars.shader_record_ptr = nir_variable_create(
+ b->shader, nir_var_shader_temp, glsl_uint64_t_type(), "inner_shader_record_ptr");
+ inner_vars.primitive_id =
+ nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_primitive_id");
+ inner_vars.geometry_id_and_flags = nir_variable_create(
+ b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_geometry_id_and_flags");
+ inner_vars.tmax =
+ nir_variable_create(b->shader, nir_var_shader_temp, glsl_float_type(), "inner_tmax");
+ inner_vars.instance_id =
+ nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_instance_id");
+ inner_vars.instance_addr = nir_variable_create(b->shader, nir_var_shader_temp,
+ glsl_uint64_t_type(), "inner_instance_addr");
+ inner_vars.hit_kind =
+ nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_hit_kind");
+ inner_vars.custom_instance_and_mask = nir_variable_create(
+ b->shader, nir_var_shader_temp, glsl_uint_type(), "inner_custom_instance_and_mask");
+
+ return inner_vars;
+}
+
+/* The hit attributes are stored on the stack.
This is the offset compared to the current stack + * pointer of where the hit attrib is stored. */ +const uint32_t RADV_HIT_ATTRIB_OFFSET = -(16 + RADV_MAX_HIT_ATTRIB_SIZE); + +static void +insert_rt_return(nir_builder *b, const struct rt_variables *vars) +{ + nir_store_var(b, vars->stack_ptr, + nir_iadd(b, nir_load_var(b, vars->stack_ptr), nir_imm_int(b, -16)), 1); + nir_store_var(b, vars->idx, + nir_load_scratch(b, 1, 32, nir_load_var(b, vars->stack_ptr), .align_mul = 16), 1); +} + +enum sbt_type { + SBT_RAYGEN, + SBT_MISS, + SBT_HIT, + SBT_CALLABLE, +}; + +static nir_ssa_def * +get_sbt_ptr(nir_builder *b, nir_ssa_def *idx, enum sbt_type binding) +{ + nir_ssa_def *desc = nir_load_sbt_amd(b, 4, .binding = binding); + nir_ssa_def *base_addr = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3)); + nir_ssa_def *stride = nir_channel(b, desc, 2); + + nir_ssa_def *ret = nir_imul(b, idx, stride); + ret = nir_iadd(b, base_addr, nir_u2u64(b, ret)); + + return ret; +} + +static void +load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_ssa_def *idx, + enum sbt_type binding, unsigned offset) +{ + nir_ssa_def *addr = get_sbt_ptr(b, idx, binding); + + nir_ssa_def *load_addr = addr; + if (offset) + load_addr = nir_iadd(b, load_addr, nir_imm_int64(b, offset)); + nir_ssa_def *v_idx = + nir_build_load_global(b, 1, 32, load_addr, .align_mul = 4, .align_offset = 0); + + nir_store_var(b, vars->idx, v_idx, 1); + + nir_ssa_def *record_addr = nir_iadd(b, addr, nir_imm_int64(b, RADV_RT_HANDLE_SIZE)); + nir_store_var(b, vars->shader_record_ptr, record_addr, 1); +} + +static nir_ssa_def * +nir_build_vec3_mat_mult(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *matrix[], bool translation) +{ + nir_ssa_def *result_components[3] = { + nir_channel(b, matrix[0], 3), + nir_channel(b, matrix[1], 3), + nir_channel(b, matrix[2], 3), + }; + for (unsigned i = 0; i < 3; ++i) { + for (unsigned j = 0; j < 3; ++j) { + nir_ssa_def *v = + nir_fmul(b, nir_channels(b, vec, 1 << j), nir_channels(b, matrix[i], 1 << j)); + result_components[i] = (translation || j) ? nir_fadd(b, result_components[i], v) : v; + } + } + return nir_vec(b, result_components, 3); +} + +static nir_ssa_def * +nir_build_vec3_mat_mult_pre(nir_builder *b, nir_ssa_def *vec, nir_ssa_def *matrix[]) +{ + nir_ssa_def *result_components[3] = { + nir_channel(b, matrix[0], 3), + nir_channel(b, matrix[1], 3), + nir_channel(b, matrix[2], 3), + }; + return nir_build_vec3_mat_mult(b, nir_fsub(b, vec, nir_vec(b, result_components, 3)), matrix, + false); +} + +static void +nir_build_wto_matrix_load(nir_builder *b, nir_ssa_def *instance_addr, nir_ssa_def **out) +{ + unsigned offset = offsetof(struct radv_bvh_instance_node, wto_matrix); + for (unsigned i = 0; i < 3; ++i) { + out[i] = nir_build_load_global(b, 4, 32, + nir_iadd(b, instance_addr, nir_imm_int64(b, offset + i * 16)), + .align_mul = 64, .align_offset = offset + i * 16); + } +} + +/* This lowers all the RT instructions that we do not want to pass on to the combined shader and + * that we can implement using the variables from the shader we are going to inline into. 
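+ * Calls to traceRay and executeCallable become a scratch push of the return
+ * index plus an update of the idx variable so the main loop can dispatch the
+ * next shader, while ray system values turn into plain reads of these variables.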
*/ +static void +lower_rt_instructions(nir_shader *shader, struct rt_variables *vars, unsigned call_idx_base) +{ + nir_builder b_shader; + nir_builder_init(&b_shader, nir_shader_get_entrypoint(shader)); + + nir_foreach_block (block, nir_shader_get_entrypoint(shader)) { + nir_foreach_instr_safe (instr, block) { + switch (instr->type) { + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + switch (intr->intrinsic) { + case nir_intrinsic_rt_execute_callable: { + uint32_t size = align(nir_intrinsic_stack_size(intr), 16) + RADV_MAX_HIT_ATTRIB_SIZE; + uint32_t ret = call_idx_base + nir_intrinsic_call_idx(intr) + 1; + b_shader.cursor = nir_instr_remove(instr); + + nir_store_var(&b_shader, vars->stack_ptr, + nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), + nir_imm_int(&b_shader, size)), + 1); + nir_store_scratch(&b_shader, nir_imm_int(&b_shader, ret), + nir_load_var(&b_shader, vars->stack_ptr), .align_mul = 16, + .write_mask = 1); + + nir_store_var(&b_shader, vars->stack_ptr, + nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), + nir_imm_int(&b_shader, 16)), + 1); + load_sbt_entry(&b_shader, vars, intr->src[0].ssa, SBT_CALLABLE, 0); + + nir_store_var( + &b_shader, vars->arg, + nir_isub(&b_shader, intr->src[1].ssa, nir_imm_int(&b_shader, size + 16)), 1); + + vars->stack_sizes[vars->group_idx].recursive_size = + MAX2(vars->stack_sizes[vars->group_idx].recursive_size, size + 16); + break; + } + case nir_intrinsic_rt_trace_ray: { + uint32_t size = align(nir_intrinsic_stack_size(intr), 16) + RADV_MAX_HIT_ATTRIB_SIZE; + uint32_t ret = call_idx_base + nir_intrinsic_call_idx(intr) + 1; + b_shader.cursor = nir_instr_remove(instr); + + nir_store_var(&b_shader, vars->stack_ptr, + nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), + nir_imm_int(&b_shader, size)), + 1); + nir_store_scratch(&b_shader, nir_imm_int(&b_shader, ret), + nir_load_var(&b_shader, vars->stack_ptr), .align_mul = 16, + .write_mask = 1); + + nir_store_var(&b_shader, vars->stack_ptr, + nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), + nir_imm_int(&b_shader, 16)), + 1); + + nir_store_var(&b_shader, vars->idx, nir_imm_int(&b_shader, 1), 1); + nir_store_var( + &b_shader, vars->arg, + nir_isub(&b_shader, intr->src[10].ssa, nir_imm_int(&b_shader, size + 16)), 1); + + vars->stack_sizes[vars->group_idx].recursive_size = + MAX2(vars->stack_sizes[vars->group_idx].recursive_size, size + 16); + + /* Per the SPIR-V extension spec we have to ignore some bits for some arguments. 
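+ * (cull mask: 8 bits, SBT offset and stride: 4 bits each, miss index: 16 bits)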
*/ + nir_store_var(&b_shader, vars->accel_struct, intr->src[0].ssa, 0x1); + nir_store_var(&b_shader, vars->flags, intr->src[1].ssa, 0x1); + nir_store_var(&b_shader, vars->cull_mask, + nir_iand(&b_shader, intr->src[2].ssa, nir_imm_int(&b_shader, 0xff)), + 0x1); + nir_store_var(&b_shader, vars->sbt_offset, + nir_iand(&b_shader, intr->src[3].ssa, nir_imm_int(&b_shader, 0xf)), + 0x1); + nir_store_var(&b_shader, vars->sbt_stride, + nir_iand(&b_shader, intr->src[4].ssa, nir_imm_int(&b_shader, 0xf)), + 0x1); + nir_store_var(&b_shader, vars->miss_index, + nir_iand(&b_shader, intr->src[5].ssa, nir_imm_int(&b_shader, 0xffff)), + 0x1); + nir_store_var(&b_shader, vars->origin, intr->src[6].ssa, 0x7); + nir_store_var(&b_shader, vars->tmin, intr->src[7].ssa, 0x1); + nir_store_var(&b_shader, vars->direction, intr->src[8].ssa, 0x7); + nir_store_var(&b_shader, vars->tmax, intr->src[9].ssa, 0x1); + break; + } + case nir_intrinsic_rt_resume: { + uint32_t size = align(nir_intrinsic_stack_size(intr), 16) + RADV_MAX_HIT_ATTRIB_SIZE; + b_shader.cursor = nir_instr_remove(instr); + + nir_store_var(&b_shader, vars->stack_ptr, + nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), + nir_imm_int(&b_shader, -size)), + 1); + break; + } + case nir_intrinsic_rt_return_amd: { + b_shader.cursor = nir_instr_remove(instr); + + if (shader->info.stage == MESA_SHADER_RAYGEN) { + nir_store_var(&b_shader, vars->idx, nir_imm_int(&b_shader, 0), 1); + break; + } + insert_rt_return(&b_shader, vars); + break; + } + case nir_intrinsic_load_scratch: { + b_shader.cursor = nir_before_instr(instr); + nir_instr_rewrite_src_ssa( + instr, &intr->src[0], + nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), intr->src[0].ssa)); + break; + } + case nir_intrinsic_store_scratch: { + b_shader.cursor = nir_before_instr(instr); + nir_instr_rewrite_src_ssa( + instr, &intr->src[1], + nir_iadd(&b_shader, nir_load_var(&b_shader, vars->stack_ptr), intr->src[1].ssa)); + break; + } + case nir_intrinsic_load_rt_arg_scratch_offset_amd: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->arg); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_shader_record_ptr: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->shader_record_ptr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_launch_id: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_global_invocation_id(&b_shader, 32); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_t_min: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->tmin); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_t_max: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->tmax); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_world_origin: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->origin); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_world_direction: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->direction); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_instance_custom_index: { + b_shader.cursor = 
nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->custom_instance_and_mask); + ret = nir_iand(&b_shader, ret, nir_imm_int(&b_shader, 0xFFFFFF)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_primitive_id: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->primitive_id); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_geometry_index: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->geometry_id_and_flags); + ret = nir_iand(&b_shader, ret, nir_imm_int(&b_shader, 0xFFFFFFF)); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_instance_id: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->instance_id); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_flags: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->flags); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_hit_kind: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->hit_kind); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_load_ray_world_to_object: { + unsigned c = nir_intrinsic_column(intr); + nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr); + nir_ssa_def *wto_matrix[3]; + nir_build_wto_matrix_load(&b_shader, instance_node_addr, wto_matrix); + + nir_ssa_def *vals[3]; + for (unsigned i = 0; i < 3; ++i) + vals[i] = nir_channel(&b_shader, wto_matrix[i], c); + + nir_ssa_def *val = nir_vec(&b_shader, vals, 3); + if (c == 3) + val = nir_fneg(&b_shader, + nir_build_vec3_mat_mult(&b_shader, val, wto_matrix, false)); + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, val); + break; + } + case nir_intrinsic_load_ray_object_to_world: { + unsigned c = nir_intrinsic_column(intr); + nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr); + nir_ssa_def *val; + if (c == 3) { + nir_ssa_def *wto_matrix[3]; + nir_build_wto_matrix_load(&b_shader, instance_node_addr, wto_matrix); + + nir_ssa_def *vals[3]; + for (unsigned i = 0; i < 3; ++i) + vals[i] = nir_channel(&b_shader, wto_matrix[i], c); + + val = nir_vec(&b_shader, vals, 3); + } else { + val = nir_build_load_global( + &b_shader, 3, 32, + nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 92 + c * 12)), + .align_mul = 4, .align_offset = 0); + } + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, val); + break; + } + case nir_intrinsic_load_ray_object_origin: { + nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr); + nir_ssa_def *wto_matrix[] = { + nir_build_load_global( + &b_shader, 4, 32, + nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 16)), + .align_mul = 64, .align_offset = 16), + nir_build_load_global( + &b_shader, 4, 32, + nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 32)), + .align_mul = 64, .align_offset = 32), + nir_build_load_global( + &b_shader, 4, 32, + nir_iadd(&b_shader, instance_node_addr, nir_imm_int64(&b_shader, 48)), + .align_mul = 64, .align_offset = 48)}; + nir_ssa_def *val = nir_build_vec3_mat_mult_pre( + &b_shader, nir_load_var(&b_shader, vars->origin), wto_matrix); + 
b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, val); + break; + } + case nir_intrinsic_load_ray_object_direction: { + nir_ssa_def *instance_node_addr = nir_load_var(&b_shader, vars->instance_addr); + nir_ssa_def *wto_matrix[3]; + nir_build_wto_matrix_load(&b_shader, instance_node_addr, wto_matrix); + nir_ssa_def *val = nir_build_vec3_mat_mult( + &b_shader, nir_load_var(&b_shader, vars->direction), wto_matrix, false); + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, val); + break; + } + case nir_intrinsic_load_intersection_opaque_amd: { + b_shader.cursor = nir_instr_remove(instr); + nir_ssa_def *ret = nir_load_var(&b_shader, vars->opaque); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, ret); + break; + } + case nir_intrinsic_ignore_ray_intersection: { + b_shader.cursor = nir_instr_remove(instr); + nir_store_var(&b_shader, vars->ahit_status, nir_imm_int(&b_shader, 1), 1); + + /* The if is a workaround to avoid having to fix up control flow manually */ + nir_push_if(&b_shader, nir_imm_true(&b_shader)); + nir_jump(&b_shader, nir_jump_return); + nir_pop_if(&b_shader, NULL); + break; + } + case nir_intrinsic_terminate_ray: { + b_shader.cursor = nir_instr_remove(instr); + nir_store_var(&b_shader, vars->ahit_status, nir_imm_int(&b_shader, 2), 1); + + /* The if is a workaround to avoid having to fix up control flow manually */ + nir_push_if(&b_shader, nir_imm_true(&b_shader)); + nir_jump(&b_shader, nir_jump_return); + nir_pop_if(&b_shader, NULL); + break; + } + case nir_intrinsic_report_ray_intersection: { + b_shader.cursor = nir_instr_remove(instr); + nir_push_if( + &b_shader, + nir_iand( + &b_shader, + nir_flt(&b_shader, intr->src[0].ssa, nir_load_var(&b_shader, vars->tmax)), + nir_fge(&b_shader, intr->src[0].ssa, nir_load_var(&b_shader, vars->tmin)))); + { + nir_store_var(&b_shader, vars->ahit_status, nir_imm_int(&b_shader, 0), 1); + nir_store_var(&b_shader, vars->tmax, intr->src[0].ssa, 1); + nir_store_var(&b_shader, vars->hit_kind, intr->src[1].ssa, 1); + } + nir_pop_if(&b_shader, NULL); + break; + } + default: + break; + } + break; + } + case nir_instr_type_jump: { + nir_jump_instr *jump = nir_instr_as_jump(instr); + if (jump->type == nir_jump_halt) { + b_shader.cursor = nir_instr_remove(instr); + nir_jump(&b_shader, nir_jump_return); + } + break; + } + default: + break; + } + } + } + + nir_metadata_preserve(nir_shader_get_entrypoint(shader), nir_metadata_none); +} + +static void +insert_rt_case(nir_builder *b, nir_shader *shader, const struct rt_variables *vars, + nir_ssa_def *idx, uint32_t call_idx_base, uint32_t call_idx) +{ + struct hash_table *var_remap = _mesa_pointer_hash_table_create(NULL); + + nir_opt_dead_cf(shader); + + struct rt_variables src_vars = create_rt_variables(shader, vars->stack_sizes); + map_rt_variables(var_remap, &src_vars, vars); + + NIR_PASS_V(shader, lower_rt_instructions, &src_vars, call_idx_base); + + NIR_PASS_V(shader, nir_opt_remove_phis); + NIR_PASS_V(shader, nir_lower_returns); + NIR_PASS_V(shader, nir_opt_dce); + + if (b->shader->info.stage == MESA_SHADER_ANY_HIT || + b->shader->info.stage == MESA_SHADER_INTERSECTION) { + src_vars.stack_sizes[src_vars.group_idx].non_recursive_size = + MAX2(src_vars.stack_sizes[src_vars.group_idx].non_recursive_size, shader->scratch_size); + } else { + src_vars.stack_sizes[src_vars.group_idx].recursive_size = + MAX2(src_vars.stack_sizes[src_vars.group_idx].recursive_size, shader->scratch_size); + } + + nir_push_if(b, nir_ieq(b, idx, nir_imm_int(b, 
call_idx))); + nir_store_var(b, vars->main_loop_case_visited, nir_imm_bool(b, true), 1); + nir_inline_function_impl(b, nir_shader_get_entrypoint(shader), NULL, var_remap); + nir_pop_if(b, NULL); + + /* Adopt the instructions from the source shader, since they are merely moved, not cloned. */ + ralloc_adopt(ralloc_context(b->shader), ralloc_context(shader)); + + ralloc_free(var_remap); +} + +static bool +lower_rt_derefs(nir_shader *shader) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + bool progress = false; + + nir_builder b; + nir_builder_init(&b, impl); + + b.cursor = nir_before_cf_list(&impl->body); + nir_ssa_def *arg_offset = nir_load_rt_arg_scratch_offset_amd(&b); + + nir_foreach_block (block, impl) { + nir_foreach_instr_safe (instr, block) { + switch (instr->type) { + case nir_instr_type_deref: { + if (instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + if (nir_deref_mode_is(deref, nir_var_shader_call_data)) { + deref->modes = nir_var_function_temp; + if (deref->deref_type == nir_deref_type_var) { + b.cursor = nir_before_instr(&deref->instr); + nir_deref_instr *cast = nir_build_deref_cast( + &b, arg_offset, nir_var_function_temp, deref->var->type, 0); + nir_ssa_def_rewrite_uses(&deref->dest.ssa, &cast->dest.ssa); + nir_instr_remove(&deref->instr); + } + progress = true; + } else if (nir_deref_mode_is(deref, nir_var_ray_hit_attrib)) { + deref->modes = nir_var_function_temp; + if (deref->deref_type == nir_deref_type_var) { + b.cursor = nir_before_instr(&deref->instr); + nir_deref_instr *cast = + nir_build_deref_cast(&b, nir_imm_int(&b, RADV_HIT_ATTRIB_OFFSET), + nir_var_function_temp, deref->type, 0); + nir_ssa_def_rewrite_uses(&deref->dest.ssa, &cast->dest.ssa); + nir_instr_remove(&deref->instr); + } + progress = true; + } + break; + } + default: + break; + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } + + return progress; +} + +static gl_shader_stage +convert_rt_stage(VkShaderStageFlagBits vk_stage) +{ + switch (vk_stage) { + case VK_SHADER_STAGE_RAYGEN_BIT_KHR: + return MESA_SHADER_RAYGEN; + case VK_SHADER_STAGE_ANY_HIT_BIT_KHR: + return MESA_SHADER_ANY_HIT; + case VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR: + return MESA_SHADER_CLOSEST_HIT; + case VK_SHADER_STAGE_MISS_BIT_KHR: + return MESA_SHADER_MISS; + case VK_SHADER_STAGE_INTERSECTION_BIT_KHR: + return MESA_SHADER_INTERSECTION; + case VK_SHADER_STAGE_CALLABLE_BIT_KHR: + return MESA_SHADER_CALLABLE; + default: + unreachable("Unhandled RT stage"); + } +} + +static nir_shader * +parse_rt_stage(struct radv_device *device, struct radv_pipeline_layout *layout, + const VkPipelineShaderStageCreateInfo *stage) +{ + struct radv_pipeline_key key; + memset(&key, 0, sizeof(key)); + + nir_shader *shader = radv_shader_compile_to_nir( + device, vk_shader_module_from_handle(stage->module), stage->pName, + convert_rt_stage(stage->stage), stage->pSpecializationInfo, layout, &key); + + if (shader->info.stage == MESA_SHADER_RAYGEN || shader->info.stage == MESA_SHADER_CLOSEST_HIT || + shader->info.stage == MESA_SHADER_CALLABLE || shader->info.stage == MESA_SHADER_MISS) { + nir_block *last_block = nir_impl_last_block(nir_shader_get_entrypoint(shader)); + nir_builder b_inner; + nir_builder_init(&b_inner, nir_shader_get_entrypoint(shader)); + b_inner.cursor = nir_after_block(last_block); + nir_rt_return_amd(&b_inner); + } + + NIR_PASS_V(shader, 
nir_lower_vars_to_explicit_types,
+ nir_var_function_temp | nir_var_shader_call_data | nir_var_ray_hit_attrib,
+ glsl_get_natural_size_align_bytes);
+
+ NIR_PASS_V(shader, lower_rt_derefs);
+
+ NIR_PASS_V(shader, nir_lower_explicit_io, nir_var_function_temp,
+ nir_address_format_32bit_offset);
+
+ return shader;
+}
+
+static nir_function_impl *
+lower_any_hit_for_intersection(nir_shader *any_hit)
+{
+ nir_function_impl *impl = nir_shader_get_entrypoint(any_hit);
+
+ /* Any-hit shaders need three parameters */
+ assert(impl->function->num_params == 0);
+ nir_parameter params[] = {
+ {
+ /* A pointer to a boolean value for whether or not the hit was
+ * accepted.
+ */
+ .num_components = 1,
+ .bit_size = 32,
+ },
+ {
+ /* The hit T value */
+ .num_components = 1,
+ .bit_size = 32,
+ },
+ {
+ /* The hit kind */
+ .num_components = 1,
+ .bit_size = 32,
+ },
+ };
+ impl->function->num_params = ARRAY_SIZE(params);
+ impl->function->params = ralloc_array(any_hit, nir_parameter, ARRAY_SIZE(params));
+ memcpy(impl->function->params, params, sizeof(params));
+
+ nir_builder build;
+ nir_builder_init(&build, impl);
+ nir_builder *b = &build;
+
+ b->cursor = nir_before_cf_list(&impl->body);
+
+ nir_ssa_def *commit_ptr = nir_load_param(b, 0);
+ nir_ssa_def *hit_t = nir_load_param(b, 1);
+ nir_ssa_def *hit_kind = nir_load_param(b, 2);
+
+ nir_deref_instr *commit =
+ nir_build_deref_cast(b, commit_ptr, nir_var_function_temp, glsl_bool_type(), 0);
+
+ nir_foreach_block_safe (block, impl) {
+ nir_foreach_instr_safe (instr, block) {
+ switch (instr->type) {
+ case nir_instr_type_intrinsic: {
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ switch (intrin->intrinsic) {
+ case nir_intrinsic_ignore_ray_intersection:
+ b->cursor = nir_instr_remove(&intrin->instr);
+ /* We put the newly emitted code inside a dummy if because it's
+ * going to contain a jump instruction and we don't want to
+ * deal with that mess here. It'll get dealt with by our
+ * control-flow optimization passes.
+ */
+ nir_store_deref(b, commit, nir_imm_false(b), 0x1);
+ nir_push_if(b, nir_imm_true(b));
+ nir_jump(b, nir_jump_halt);
+ nir_pop_if(b, NULL);
+ break;
+
+ case nir_intrinsic_terminate_ray:
+ /* The "normal" handling of terminateRay works fine in
+ * intersection shaders.
+ */
+ break;
+
+ case nir_intrinsic_load_ray_t_max:
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, hit_t);
+ nir_instr_remove(&intrin->instr);
+ break;
+
+ case nir_intrinsic_load_ray_hit_kind:
+ nir_ssa_def_rewrite_uses(&intrin->dest.ssa, hit_kind);
+ nir_instr_remove(&intrin->instr);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+ case nir_instr_type_jump: {
+ nir_jump_instr *jump = nir_instr_as_jump(instr);
+ if (jump->type == nir_jump_halt) {
+ b->cursor = nir_instr_remove(instr);
+ nir_jump(b, nir_jump_return);
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+ }
+
+ nir_validate_shader(any_hit, "after initial any-hit lowering");
+
+ nir_lower_returns_impl(impl);
+
+ nir_validate_shader(any_hit, "after lowering returns");
+
+ return impl;
+}
+
+/* Inline the any_hit shader into the intersection shader so we don't have
+ * to implement yet another shader call interface here, nor deal with any recursion.
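+ * The any-hit entrypoint is turned into a function taking a commit pointer,
+ * the hit T and the hit kind as parameters, and each report_ray_intersection
+ * in the intersection shader is expanded into a T-range check, an optional
+ * inlined call of that function and the final commit.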
+ */ +static void +nir_lower_intersection_shader(nir_shader *intersection, nir_shader *any_hit) +{ + void *dead_ctx = ralloc_context(intersection); + + nir_function_impl *any_hit_impl = NULL; + struct hash_table *any_hit_var_remap = NULL; + if (any_hit) { + any_hit = nir_shader_clone(dead_ctx, any_hit); + NIR_PASS_V(any_hit, nir_opt_dce); + any_hit_impl = lower_any_hit_for_intersection(any_hit); + any_hit_var_remap = _mesa_pointer_hash_table_create(dead_ctx); + } + + nir_function_impl *impl = nir_shader_get_entrypoint(intersection); + + nir_builder build; + nir_builder_init(&build, impl); + nir_builder *b = &build; + + b->cursor = nir_before_cf_list(&impl->body); + + nir_variable *commit = nir_local_variable_create(impl, glsl_bool_type(), "ray_commit"); + nir_store_var(b, commit, nir_imm_false(b), 0x1); + + nir_foreach_block_safe (block, impl) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_report_ray_intersection) + continue; + + b->cursor = nir_instr_remove(&intrin->instr); + nir_ssa_def *hit_t = nir_ssa_for_src(b, intrin->src[0], 1); + nir_ssa_def *hit_kind = nir_ssa_for_src(b, intrin->src[1], 1); + nir_ssa_def *min_t = nir_load_ray_t_min(b); + nir_ssa_def *max_t = nir_load_ray_t_max(b); + + /* bool commit_tmp = false; */ + nir_variable *commit_tmp = nir_local_variable_create(impl, glsl_bool_type(), "commit_tmp"); + nir_store_var(b, commit_tmp, nir_imm_false(b), 0x1); + + nir_push_if(b, nir_iand(b, nir_fge(b, hit_t, min_t), nir_fge(b, max_t, hit_t))); + { + /* Any-hit defaults to commit */ + nir_store_var(b, commit_tmp, nir_imm_true(b), 0x1); + + if (any_hit_impl != NULL) { + nir_push_if(b, nir_inot(b, nir_load_intersection_opaque_amd(b))); + { + nir_ssa_def *params[] = { + &nir_build_deref_var(b, commit_tmp)->dest.ssa, + hit_t, + hit_kind, + }; + nir_inline_function_impl(b, any_hit_impl, params, any_hit_var_remap); + } + nir_pop_if(b, NULL); + } + + nir_push_if(b, nir_load_var(b, commit_tmp)); + { + nir_report_ray_intersection(b, 1, hit_t, hit_kind); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + + nir_ssa_def *accepted = nir_load_var(b, commit_tmp); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, accepted); + } + } + + /* We did some inlining; have to re-index SSA defs */ + nir_index_ssa_defs(impl); + + /* Eliminate the casts introduced for the commit return of the any-hit shader. */ + NIR_PASS_V(intersection, nir_opt_deref); + + ralloc_free(dead_ctx); +} + +/* Variables only used internally to ray traversal. This is data that describes + * the current state of the traversal vs. what we'd give to a shader. e.g. what + * is the instance we're currently visiting vs. what is the instance of the + * closest hit. 
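+ * These are candidate values while walking the BVH; the variables in struct
+ * rt_variables are only updated once a hit is actually committed.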
*/ +struct rt_traversal_vars { + nir_variable *origin; + nir_variable *dir; + nir_variable *inv_dir; + nir_variable *sbt_offset_and_flags; + nir_variable *instance_id; + nir_variable *custom_instance_and_mask; + nir_variable *instance_addr; + nir_variable *should_return; + nir_variable *bvh_base; + nir_variable *stack; + nir_variable *top_stack; +}; + +static struct rt_traversal_vars +init_traversal_vars(nir_builder *b) +{ + const struct glsl_type *vec3_type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); + struct rt_traversal_vars ret; + + ret.origin = nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "traversal_origin"); + ret.dir = nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "traversal_dir"); + ret.inv_dir = + nir_variable_create(b->shader, nir_var_shader_temp, vec3_type, "traversal_inv_dir"); + ret.sbt_offset_and_flags = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), + "traversal_sbt_offset_and_flags"); + ret.instance_id = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), + "traversal_instance_id"); + ret.custom_instance_and_mask = nir_variable_create( + b->shader, nir_var_shader_temp, glsl_uint_type(), "traversal_custom_instance_and_mask"); + ret.instance_addr = + nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint64_t_type(), "instance_addr"); + ret.should_return = nir_variable_create(b->shader, nir_var_shader_temp, glsl_bool_type(), + "traversal_should_return"); + ret.bvh_base = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint64_t_type(), + "traversal_bvh_base"); + ret.stack = + nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "traversal_stack_ptr"); + ret.top_stack = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), + "traversal_top_stack_ptr"); + return ret; +} + +static nir_ssa_def * +build_addr_to_node(nir_builder *b, nir_ssa_def *addr) +{ + const uint64_t bvh_size = 1ull << 42; + nir_ssa_def *node = nir_ushr(b, addr, nir_imm_int(b, 3)); + return nir_iand(b, node, nir_imm_int64(b, (bvh_size - 1) << 3)); +} + +static nir_ssa_def * +build_node_to_addr(struct radv_device *device, nir_builder *b, nir_ssa_def *node) +{ + nir_ssa_def *addr = nir_iand(b, node, nir_imm_int64(b, ~7ull)); + addr = nir_ishl(b, addr, nir_imm_int(b, 3)); + /* Assumes everything is in the top half of address space, which is true in + * GFX9+ for now. */ + return device->physical_device->rad_info.chip_class >= GFX9 + ? nir_ior(b, addr, nir_imm_int64(b, 0xffffull << 48)) + : addr; +} + +/* When a hit is opaque the any_hit shader is skipped for this hit and the hit + * is assumed to be an actual hit. 
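+ * Opacity is resolved in precedence order: the per-geometry opaque flag can be
+ * overridden by the instance force-opaque/no-opaque flags, and those in turn by
+ * the RayFlagsOpaque/RayFlagsNoOpaque ray flags.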
*/ +static nir_ssa_def * +hit_is_opaque(nir_builder *b, const struct rt_variables *vars, + const struct rt_traversal_vars *trav_vars, nir_ssa_def *geometry_id_and_flags) +{ + nir_ssa_def *geom_force_opaque = nir_ine( + b, nir_iand(b, geometry_id_and_flags, nir_imm_int(b, 1u << 28 /* VK_GEOMETRY_OPAQUE_BIT */)), + nir_imm_int(b, 0)); + nir_ssa_def *instance_force_opaque = + nir_ine(b, + nir_iand(b, nir_load_var(b, trav_vars->sbt_offset_and_flags), + nir_imm_int(b, 4 << 24 /* VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT */)), + nir_imm_int(b, 0)); + nir_ssa_def *instance_force_non_opaque = + nir_ine(b, + nir_iand(b, nir_load_var(b, trav_vars->sbt_offset_and_flags), + nir_imm_int(b, 8 << 24 /* VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT */)), + nir_imm_int(b, 0)); + + nir_ssa_def *opaque = geom_force_opaque; + opaque = nir_bcsel(b, instance_force_opaque, nir_imm_bool(b, true), opaque); + opaque = nir_bcsel(b, instance_force_non_opaque, nir_imm_bool(b, false), opaque); + + nir_ssa_def *ray_force_opaque = + nir_ine(b, nir_iand(b, nir_load_var(b, vars->flags), nir_imm_int(b, 1 /* RayFlagsOpaque */)), + nir_imm_int(b, 0)); + nir_ssa_def *ray_force_non_opaque = nir_ine( + b, nir_iand(b, nir_load_var(b, vars->flags), nir_imm_int(b, 2 /* RayFlagsNoOpaque */)), + nir_imm_int(b, 0)); + + opaque = nir_bcsel(b, ray_force_opaque, nir_imm_bool(b, true), opaque); + opaque = nir_bcsel(b, ray_force_non_opaque, nir_imm_bool(b, false), opaque); + return opaque; +} + +static void +visit_any_hit_shaders(struct radv_device *device, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, nir_builder *b, + struct rt_variables *vars) +{ + RADV_FROM_HANDLE(radv_pipeline_layout, layout, pCreateInfo->layout); + nir_ssa_def *sbt_idx = nir_load_var(b, vars->idx); + + nir_push_if(b, nir_ine(b, sbt_idx, nir_imm_int(b, 0))); + for (unsigned i = 0; i < pCreateInfo->groupCount; ++i) { + const VkRayTracingShaderGroupCreateInfoKHR *group_info = &pCreateInfo->pGroups[i]; + uint32_t shader_id = VK_SHADER_UNUSED_KHR; + + switch (group_info->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + shader_id = group_info->anyHitShader; + break; + default: + break; + } + if (shader_id == VK_SHADER_UNUSED_KHR) + continue; + + const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->pStages[shader_id]; + nir_shader *nir_stage = parse_rt_stage(device, layout, stage); + + vars->group_idx = i; + insert_rt_case(b, nir_stage, vars, sbt_idx, 0, i + 2); + } + nir_pop_if(b, NULL); +} + +static void +insert_traversal_triangle_case(struct radv_device *device, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, nir_builder *b, + nir_ssa_def *result, const struct rt_variables *vars, + const struct rt_traversal_vars *trav_vars, nir_ssa_def *bvh_node) +{ + nir_ssa_def *dist = nir_vector_extract(b, result, nir_imm_int(b, 0)); + nir_ssa_def *div = nir_vector_extract(b, result, nir_imm_int(b, 1)); + dist = nir_fdiv(b, dist, div); + nir_ssa_def *frontface = nir_flt(b, nir_imm_float(b, 0), div); + nir_ssa_def *switch_ccw = nir_ine( + b, + nir_iand( + b, nir_load_var(b, trav_vars->sbt_offset_and_flags), + nir_imm_int(b, 2 << 24 /* VK_GEOMETRY_INSTANCE_TRIANGLE_FRONT_COUNTERCLOCKWISE_BIT */)), + nir_imm_int(b, 0)); + frontface = nir_ixor(b, frontface, switch_ccw); + + nir_ssa_def *not_cull = nir_ieq( + b, nir_iand(b, nir_load_var(b, vars->flags), nir_imm_int(b, 256 /* RayFlagsSkipTriangles */)), + nir_imm_int(b, 0)); + nir_ssa_def *not_facing_cull = nir_ieq( + b, + nir_iand(b, nir_load_var(b, vars->flags), + nir_bcsel(b, frontface, 
nir_imm_int(b, 32 /* RayFlagsCullFrontFacingTriangles */), + nir_imm_int(b, 16 /* RayFlagsCullBackFacingTriangles */))), + nir_imm_int(b, 0)); + + not_cull = nir_iand( + b, not_cull, + nir_ior( + b, not_facing_cull, + nir_ine( + b, + nir_iand( + b, nir_load_var(b, trav_vars->sbt_offset_and_flags), + nir_imm_int(b, 1 << 24 /* VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT */)), + nir_imm_int(b, 0)))); + + nir_push_if(b, nir_iand(b, + nir_iand(b, nir_flt(b, dist, nir_load_var(b, vars->tmax)), + nir_fge(b, dist, nir_load_var(b, vars->tmin))), + not_cull)); + { + + nir_ssa_def *triangle_info = nir_build_load_global( + b, 2, 32, + nir_iadd(b, build_node_to_addr(device, b, bvh_node), + nir_imm_int64(b, offsetof(struct radv_bvh_triangle_node, triangle_id))), + .align_mul = 4, .align_offset = 0); + nir_ssa_def *primitive_id = nir_channel(b, triangle_info, 0); + nir_ssa_def *geometry_id_and_flags = nir_channel(b, triangle_info, 1); + nir_ssa_def *geometry_id = nir_iand(b, geometry_id_and_flags, nir_imm_int(b, 0xfffffff)); + nir_ssa_def *is_opaque = hit_is_opaque(b, vars, trav_vars, geometry_id_and_flags); + + not_cull = + nir_ieq(b, + nir_iand(b, nir_load_var(b, vars->flags), + nir_bcsel(b, is_opaque, nir_imm_int(b, 0x40), nir_imm_int(b, 0x80))), + nir_imm_int(b, 0)); + nir_push_if(b, not_cull); + { + nir_ssa_def *sbt_idx = + nir_iadd(b, + nir_iadd(b, nir_load_var(b, vars->sbt_offset), + nir_iand(b, nir_load_var(b, trav_vars->sbt_offset_and_flags), + nir_imm_int(b, 0xffffff))), + nir_imul(b, nir_load_var(b, vars->sbt_stride), geometry_id)); + nir_ssa_def *divs[2] = {div, div}; + nir_ssa_def *ij = nir_fdiv(b, nir_channels(b, result, 0xc), nir_vec(b, divs, 2)); + nir_ssa_def *hit_kind = + nir_bcsel(b, frontface, nir_imm_int(b, 0xFE), nir_imm_int(b, 0xFF)); + + nir_store_scratch( + b, ij, + nir_iadd(b, nir_load_var(b, vars->stack_ptr), nir_imm_int(b, RADV_HIT_ATTRIB_OFFSET)), + .align_mul = 16, .write_mask = 3); + + nir_store_var(b, vars->ahit_status, nir_imm_int(b, 0), 1); + + nir_push_if(b, nir_ine(b, is_opaque, nir_imm_bool(b, true))); + { + struct rt_variables inner_vars = create_inner_vars(b, vars); + + nir_store_var(b, inner_vars.primitive_id, primitive_id, 1); + nir_store_var(b, inner_vars.geometry_id_and_flags, geometry_id_and_flags, 1); + nir_store_var(b, inner_vars.tmax, dist, 0x1); + nir_store_var(b, inner_vars.instance_id, nir_load_var(b, trav_vars->instance_id), 0x1); + nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, trav_vars->instance_addr), + 0x1); + nir_store_var(b, inner_vars.hit_kind, hit_kind, 0x1); + nir_store_var(b, inner_vars.custom_instance_and_mask, + nir_load_var(b, trav_vars->custom_instance_and_mask), 0x1); + + load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, 4); + + visit_any_hit_shaders(device, pCreateInfo, b, &inner_vars); + + nir_push_if(b, nir_ieq(b, nir_load_var(b, vars->ahit_status), nir_imm_int(b, 1))); + { + nir_jump(b, nir_jump_continue); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + + nir_store_var(b, vars->primitive_id, primitive_id, 1); + nir_store_var(b, vars->geometry_id_and_flags, geometry_id_and_flags, 1); + nir_store_var(b, vars->tmax, dist, 0x1); + nir_store_var(b, vars->instance_id, nir_load_var(b, trav_vars->instance_id), 0x1); + nir_store_var(b, vars->instance_addr, nir_load_var(b, trav_vars->instance_addr), 0x1); + nir_store_var(b, vars->hit_kind, hit_kind, 0x1); + nir_store_var(b, vars->custom_instance_and_mask, + nir_load_var(b, trav_vars->custom_instance_and_mask), 0x1); + + load_sbt_entry(b, vars, sbt_idx, SBT_HIT, 0); 
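+ /* The hit is committed at this point: skip the closest-hit shader if the
+ * SkipClosestHitShader ray flag is set or the SBT entry has no shader
+ * (idx == 0), and stop traversal for TerminateOnFirstHit rays or when an
+ * any-hit shader terminated the ray (ahit_status == 2). */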
+ + nir_store_var(b, trav_vars->should_return, + nir_ior(b, + nir_ine(b, + nir_iand(b, nir_load_var(b, vars->flags), + nir_imm_int(b, 8 /* SkipClosestHitShader */)), + nir_imm_int(b, 0)), + nir_ieq(b, nir_load_var(b, vars->idx), nir_imm_int(b, 0))), + 1); + + nir_ssa_def *terminate_on_first_hit = + nir_ine(b, + nir_iand(b, nir_load_var(b, vars->flags), + nir_imm_int(b, 4 /* TerminateOnFirstHitKHR */)), + nir_imm_int(b, 0)); + nir_ssa_def *ray_terminated = + nir_ieq(b, nir_load_var(b, vars->ahit_status), nir_imm_int(b, 2)); + nir_push_if(b, nir_ior(b, terminate_on_first_hit, ray_terminated)); + { + nir_jump(b, nir_jump_break); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); +} + +static void +insert_traversal_aabb_case(struct radv_device *device, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, nir_builder *b, + const struct rt_variables *vars, + const struct rt_traversal_vars *trav_vars, nir_ssa_def *bvh_node) +{ + RADV_FROM_HANDLE(radv_pipeline_layout, layout, pCreateInfo->layout); + + nir_ssa_def *node_addr = build_node_to_addr(device, b, bvh_node); + nir_ssa_def *triangle_info = nir_build_load_global( + b, 2, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 24)), .align_mul = 4, .align_offset = 0); + nir_ssa_def *primitive_id = nir_channel(b, triangle_info, 0); + nir_ssa_def *geometry_id_and_flags = nir_channel(b, triangle_info, 1); + nir_ssa_def *geometry_id = nir_iand(b, geometry_id_and_flags, nir_imm_int(b, 0xfffffff)); + nir_ssa_def *is_opaque = hit_is_opaque(b, vars, trav_vars, geometry_id_and_flags); + + nir_ssa_def *not_cull = + nir_ieq(b, + nir_iand(b, nir_load_var(b, vars->flags), + nir_bcsel(b, is_opaque, nir_imm_int(b, 0x40), nir_imm_int(b, 0x80))), + nir_imm_int(b, 0)); + nir_push_if(b, not_cull); + { + nir_ssa_def *sbt_idx = + nir_iadd(b, + nir_iadd(b, nir_load_var(b, vars->sbt_offset), + nir_iand(b, nir_load_var(b, trav_vars->sbt_offset_and_flags), + nir_imm_int(b, 0xffffff))), + nir_imul(b, nir_load_var(b, vars->sbt_stride), geometry_id)); + + struct rt_variables inner_vars = create_inner_vars(b, vars); + + /* For AABBs the intersection shader writes the hit kind, and only does it if it is the + * next closest hit candidate. 
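+ * That is why inner_vars.hit_kind below aliases the outer variable instead of
+ * getting an independent inner copy.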
*/ + inner_vars.hit_kind = vars->hit_kind; + + nir_store_var(b, inner_vars.primitive_id, primitive_id, 1); + nir_store_var(b, inner_vars.geometry_id_and_flags, geometry_id_and_flags, 1); + nir_store_var(b, inner_vars.tmax, nir_load_var(b, vars->tmax), 0x1); + nir_store_var(b, inner_vars.instance_id, nir_load_var(b, trav_vars->instance_id), 0x1); + nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, trav_vars->instance_addr), 0x1); + nir_store_var(b, inner_vars.custom_instance_and_mask, + nir_load_var(b, trav_vars->custom_instance_and_mask), 0x1); + nir_store_var(b, inner_vars.opaque, is_opaque, 1); + + load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, 4); + + nir_store_var(b, vars->ahit_status, nir_imm_int(b, 1), 1); + + nir_push_if(b, nir_ine(b, nir_load_var(b, inner_vars.idx), nir_imm_int(b, 0))); + for (unsigned i = 0; i < pCreateInfo->groupCount; ++i) { + const VkRayTracingShaderGroupCreateInfoKHR *group_info = &pCreateInfo->pGroups[i]; + uint32_t shader_id = VK_SHADER_UNUSED_KHR; + uint32_t any_hit_shader_id = VK_SHADER_UNUSED_KHR; + + switch (group_info->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: + shader_id = group_info->intersectionShader; + any_hit_shader_id = group_info->anyHitShader; + break; + default: + break; + } + if (shader_id == VK_SHADER_UNUSED_KHR) + continue; + + const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->pStages[shader_id]; + nir_shader *nir_stage = parse_rt_stage(device, layout, stage); + + nir_shader *any_hit_stage = NULL; + if (any_hit_shader_id != VK_SHADER_UNUSED_KHR) { + stage = &pCreateInfo->pStages[any_hit_shader_id]; + any_hit_stage = parse_rt_stage(device, layout, stage); + + nir_lower_intersection_shader(nir_stage, any_hit_stage); + ralloc_free(any_hit_stage); + } + + inner_vars.group_idx = i; + insert_rt_case(b, nir_stage, &inner_vars, nir_load_var(b, inner_vars.idx), 0, i + 2); + } + nir_push_else(b, NULL); + { + nir_ssa_def *vec3_zero = nir_channels(b, nir_imm_vec4(b, 0, 0, 0, 0), 0x7); + nir_ssa_def *vec3_inf = + nir_channels(b, nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, 0), 0x7); + + nir_ssa_def *bvh_lo = + nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 0)), + .align_mul = 4, .align_offset = 0); + nir_ssa_def *bvh_hi = + nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, 12)), + .align_mul = 4, .align_offset = 0); + + bvh_lo = nir_fsub(b, bvh_lo, nir_load_var(b, trav_vars->origin)); + bvh_hi = nir_fsub(b, bvh_hi, nir_load_var(b, trav_vars->origin)); + nir_ssa_def *t_vec = nir_fmin(b, nir_fmul(b, bvh_lo, nir_load_var(b, trav_vars->inv_dir)), + nir_fmul(b, bvh_hi, nir_load_var(b, trav_vars->inv_dir))); + nir_ssa_def *t2_vec = nir_fmax(b, nir_fmul(b, bvh_lo, nir_load_var(b, trav_vars->inv_dir)), + nir_fmul(b, bvh_hi, nir_load_var(b, trav_vars->inv_dir))); + /* If we run parallel to one of the edges the range should be [0, inf) not [0,0] */ + t2_vec = + nir_bcsel(b, nir_feq(b, nir_load_var(b, trav_vars->dir), vec3_zero), vec3_inf, t2_vec); + + nir_ssa_def *t_min = nir_fmax(b, nir_channel(b, t_vec, 0), nir_channel(b, t_vec, 1)); + t_min = nir_fmax(b, t_min, nir_channel(b, t_vec, 2)); + + nir_ssa_def *t_max = nir_fmin(b, nir_channel(b, t2_vec, 0), nir_channel(b, t2_vec, 1)); + t_max = nir_fmin(b, t_max, nir_channel(b, t2_vec, 2)); + + nir_push_if(b, nir_iand(b, nir_flt(b, t_min, nir_load_var(b, vars->tmax)), + nir_fge(b, t_max, nir_load_var(b, vars->tmin)))); + { + nir_store_var(b, vars->ahit_status, nir_imm_int(b, 0), 1); + nir_store_var(b, vars->tmax, 
nir_fmax(b, t_min, nir_load_var(b, vars->tmin)), 1); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + + nir_push_if(b, nir_ine(b, nir_load_var(b, vars->ahit_status), nir_imm_int(b, 1))); + { + nir_store_var(b, vars->primitive_id, primitive_id, 1); + nir_store_var(b, vars->geometry_id_and_flags, geometry_id_and_flags, 1); + nir_store_var(b, vars->tmax, nir_load_var(b, inner_vars.tmax), 0x1); + nir_store_var(b, vars->instance_id, nir_load_var(b, trav_vars->instance_id), 0x1); + nir_store_var(b, vars->instance_addr, nir_load_var(b, trav_vars->instance_addr), 0x1); + nir_store_var(b, vars->custom_instance_and_mask, + nir_load_var(b, trav_vars->custom_instance_and_mask), 0x1); + + load_sbt_entry(b, vars, sbt_idx, SBT_HIT, 0); + + nir_store_var(b, trav_vars->should_return, + nir_ior(b, + nir_ine(b, + nir_iand(b, nir_load_var(b, vars->flags), + nir_imm_int(b, 8 /* SkipClosestHitShader */)), + nir_imm_int(b, 0)), + nir_ieq(b, nir_load_var(b, vars->idx), nir_imm_int(b, 0))), + 1); + + nir_ssa_def *terminate_on_first_hit = + nir_ine(b, + nir_iand(b, nir_load_var(b, vars->flags), + nir_imm_int(b, 4 /* TerminateOnFirstHitKHR */)), + nir_imm_int(b, 0)); + nir_ssa_def *ray_terminated = + nir_ieq(b, nir_load_var(b, vars->ahit_status), nir_imm_int(b, 2)); + nir_push_if(b, nir_ior(b, terminate_on_first_hit, ray_terminated)); + { + nir_jump(b, nir_jump_break); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); +} + +static void +nir_sort_hit_pair(nir_builder *b, nir_variable *var_distances, nir_variable *var_indices, uint32_t chan_1, uint32_t chan_2) +{ + nir_ssa_def *ssa_distances = nir_load_var(b, var_distances); + nir_ssa_def *ssa_indices = nir_load_var(b, var_indices); + /* if (distances[chan_2] < distances[chan_1]) { */ + nir_push_if(b, nir_flt(b, nir_channel(b, ssa_distances, chan_2), nir_channel(b, ssa_distances, chan_1))); + { + /* swap(distances[chan_2], distances[chan_1]); */ + nir_ssa_def *new_distances[4] = {nir_ssa_undef(b, 1, 32), nir_ssa_undef(b, 1, 32), nir_ssa_undef(b, 1, 32), nir_ssa_undef(b, 1, 32)}; + nir_ssa_def *new_indices[4] = {nir_ssa_undef(b, 1, 32), nir_ssa_undef(b, 1, 32), nir_ssa_undef(b, 1, 32), nir_ssa_undef(b, 1, 32)}; + new_distances[chan_2] = nir_channel(b, ssa_distances, chan_1); + new_distances[chan_1] = nir_channel(b, ssa_distances, chan_2); + new_indices[chan_2] = nir_channel(b, ssa_indices, chan_1); + new_indices[chan_1] = nir_channel(b, ssa_indices, chan_2); + nir_store_var(b, var_distances, nir_vec(b, new_distances, 4), (1u << chan_1) | (1u << chan_2)); + nir_store_var(b, var_indices, nir_vec(b, new_indices, 4), (1u << chan_1) | (1u << chan_2)); + } + /* } */ + nir_pop_if(b, NULL); +} + +static nir_ssa_def * +intersect_ray_amd_software_box(struct radv_device *device, + nir_builder *b, nir_ssa_def *bvh_node, + nir_ssa_def *ray_tmax, nir_ssa_def *origin, + nir_ssa_def *dir, nir_ssa_def *inv_dir) +{ + const struct glsl_type *vec4_type = glsl_vector_type(GLSL_TYPE_FLOAT, 4); + const struct glsl_type *uvec4_type = glsl_vector_type(GLSL_TYPE_UINT, 4); + + nir_ssa_def *node_addr = build_node_to_addr(device, b, bvh_node); + + /* vec4 distances = vec4(INF, INF, INF, INF); */ + nir_variable *distances = nir_variable_create(b->shader, nir_var_shader_temp, vec4_type, "distances"); + nir_store_var(b, distances, nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, INFINITY), 0xf); + + /* uvec4 child_indices = uvec4(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); */ + nir_variable *child_indices = nir_variable_create(b->shader, nir_var_shader_temp, 
uvec4_type, "child_indices");
+ nir_store_var(b, child_indices, nir_imm_ivec4(b, 0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu), 0xf);
+
+ /* Need to remove infinities here because otherwise we get nasty NaN propagation
+ * if the direction has 0s in it. */
+ /* inv_dir = clamp(inv_dir, -FLT_MAX, FLT_MAX); */
+ inv_dir = nir_fclamp(b, inv_dir, nir_imm_float(b, -FLT_MAX), nir_imm_float(b, FLT_MAX));
+
+ for (int i = 0; i < 4; i++) {
+ const uint32_t child_offset = offsetof(struct radv_bvh_box32_node, children[i]);
+ const uint32_t coord_offsets[2] = {
+ offsetof(struct radv_bvh_box32_node, coords[i][0][0]),
+ offsetof(struct radv_bvh_box32_node, coords[i][1][0]),
+ };
+
+ /* node->children[i] -> uint */
+ nir_ssa_def *child_index = nir_build_load_global(b, 1, 32, nir_iadd(b, node_addr, nir_imm_int64(b, child_offset)), .align_mul = 64, .align_offset = child_offset % 64 );
+ /* node->coords[i][0], node->coords[i][1] -> vec3 */
+ nir_ssa_def *node_coords[2] = {
+ nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, coord_offsets[0])), .align_mul = 64, .align_offset = coord_offsets[0] % 64 ),
+ nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, coord_offsets[1])), .align_mul = 64, .align_offset = coord_offsets[1] % 64 ),
+ };
+
+ /* If x of the aabb min is NaN, then this is an inactive aabb.
+ * We don't need to care about any other components being NaN as that is UB.
+ * https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap36.html#VkAabbPositionsKHR */
+ nir_ssa_def *min_x = nir_channel(b, node_coords[0], 0);
+ nir_ssa_def *min_x_is_not_nan = nir_inot(b, nir_fneu(b, min_x, min_x)); /* NaN != NaN -> true */
+
+ /* vec3 bound0 = (node->coords[i][0] - origin) * inv_dir; */
+ nir_ssa_def *bound0 = nir_fmul(b, nir_fsub(b, node_coords[0], origin), inv_dir);
+ /* vec3 bound1 = (node->coords[i][1] - origin) * inv_dir; */
+ nir_ssa_def *bound1 = nir_fmul(b, nir_fsub(b, node_coords[1], origin), inv_dir);
+
+ /* float tmin = max(max(min(bound0.x, bound1.x), min(bound0.y, bound1.y)), min(bound0.z, bound1.z)); */
+ nir_ssa_def *tmin = nir_fmax(b, nir_fmax(b,
+ nir_fmin(b, nir_channel(b, bound0, 0), nir_channel(b, bound1, 0)),
+ nir_fmin(b, nir_channel(b, bound0, 1), nir_channel(b, bound1, 1))),
+ nir_fmin(b, nir_channel(b, bound0, 2), nir_channel(b, bound1, 2)));
+
+ /* float tmax = min(min(max(bound0.x, bound1.x), max(bound0.y, bound1.y)), max(bound0.z, bound1.z)); */
+ nir_ssa_def *tmax = nir_fmin(b, nir_fmin(b,
+ nir_fmax(b, nir_channel(b, bound0, 0), nir_channel(b, bound1, 0)),
+ nir_fmax(b, nir_channel(b, bound0, 1), nir_channel(b, bound1, 1))),
+ nir_fmax(b, nir_channel(b, bound0, 2), nir_channel(b, bound1, 2)));
+
+ /* if (!isnan(node->coords[i][0].x) && tmax >= max(0.0f, tmin) && tmin < ray_tmax) { */
+ nir_push_if(b,
+ nir_iand(b,
+ min_x_is_not_nan,
+ nir_iand(b,
+ nir_fge(b, tmax, nir_fmax(b, nir_imm_float(b, 0.0f), tmin)),
+ nir_flt(b, tmin, ray_tmax))));
+ {
+ /* child_indices[i] = node->children[i]; */
+ nir_ssa_def *new_child_indices[4] = {child_index, child_index, child_index, child_index};
+ nir_store_var(b, child_indices, nir_vec(b, new_child_indices, 4), 1u << i);
+
+ /* distances[i] = tmin; */
+ nir_ssa_def *new_distances[4] = {tmin, tmin, tmin, tmin};
+ nir_store_var(b, distances, nir_vec(b, new_distances, 4), 1u << i);
+
+ }
+ /* } */
+ nir_pop_if(b, NULL);
+ }
+
+ /* Sort our distances with a sorting network.
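+ * The five compare-and-swap stages below are the minimum needed to sort the
+ * four child slots by distance.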
*/ + nir_sort_hit_pair(b, distances, child_indices, 0, 1); + nir_sort_hit_pair(b, distances, child_indices, 2, 3); + nir_sort_hit_pair(b, distances, child_indices, 0, 2); + nir_sort_hit_pair(b, distances, child_indices, 1, 3); + nir_sort_hit_pair(b, distances, child_indices, 1, 2); + + return nir_load_var(b, child_indices); +} + +static nir_ssa_def * +intersect_ray_amd_software_tri(struct radv_device *device, + nir_builder *b, nir_ssa_def *bvh_node, + nir_ssa_def *ray_tmax, nir_ssa_def *origin, + nir_ssa_def *dir, nir_ssa_def *inv_dir) +{ + const struct glsl_type *vec4_type = glsl_vector_type(GLSL_TYPE_FLOAT, 4); + + nir_ssa_def *node_addr = build_node_to_addr(device, b, bvh_node); + + const uint32_t coord_offsets[3] = { + offsetof(struct radv_bvh_triangle_node, coords[0]), + offsetof(struct radv_bvh_triangle_node, coords[1]), + offsetof(struct radv_bvh_triangle_node, coords[2]), + }; + + /* node->coords[0], node->coords[1], node->coords[2] -> vec3 */ + nir_ssa_def *node_coords[3] = { + nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, coord_offsets[0])), .align_mul = 64, .align_offset = coord_offsets[0] % 64 ), + nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, coord_offsets[1])), .align_mul = 64, .align_offset = coord_offsets[1] % 64 ), + nir_build_load_global(b, 3, 32, nir_iadd(b, node_addr, nir_imm_int64(b, coord_offsets[2])), .align_mul = 64, .align_offset = coord_offsets[2] % 64 ), + }; + + nir_variable *result = nir_variable_create(b->shader, nir_var_shader_temp, vec4_type, "result"); + nir_store_var(b, result, nir_imm_vec4(b, INFINITY, 1.0f, 0.0f, 0.0f), 0xf); + + /* Based on watertight Ray/Triangle intersection from + * http://jcgt.org/published/0002/01/05/paper.pdf */ + + /* Calculate the dimension where the ray direction is largest */ + nir_ssa_def *abs_dir = nir_fabs(b, dir); + + nir_ssa_def *abs_dirs[3] = { + nir_channel(b, abs_dir, 0), + nir_channel(b, abs_dir, 1), + nir_channel(b, abs_dir, 2), + }; + /* Find index of greatest value of abs_dir and put that as kz. 
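+ * For example, dir = (0.2, -0.9, 0.4) gives kz = 1 because |dir.y| is largest, then + * kx = (kz + 1) % 3 = 2 and ky = (kx + 1) % 3 = 0; since dir.y is negative, kx and ky + * are swapped just below to preserve the winding order.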
*/ + nir_ssa_def *kz = nir_bcsel(b, nir_fge(b, abs_dirs[0], abs_dirs[1]), + nir_bcsel(b, nir_fge(b, abs_dirs[0], abs_dirs[2]), + nir_imm_int(b, 0), nir_imm_int(b, 2)), + nir_bcsel(b, nir_fge(b, abs_dirs[1], abs_dirs[2]), + nir_imm_int(b, 1), nir_imm_int(b, 2))); + nir_ssa_def *kx = nir_imod(b, nir_iadd(b, kz, nir_imm_int(b, 1)), nir_imm_int(b, 3)); + nir_ssa_def *ky = nir_imod(b, nir_iadd(b, kx, nir_imm_int(b, 1)), nir_imm_int(b, 3)); + nir_ssa_def *k_indices[3] = { kx, ky, kz }; + nir_ssa_def *k = nir_vec(b, k_indices, 3); + + /* Swap kx and ky dimensions to preserve winding order */ + unsigned swap_xy_swizzle[4] = {1, 0, 2, 3}; + k = nir_bcsel(b, + nir_flt(b, nir_vector_extract(b, dir, kz), nir_imm_float(b, 0.0f)), + nir_swizzle(b, k, swap_xy_swizzle, 3), + k); + + kx = nir_channel(b, k, 0); + ky = nir_channel(b, k, 1); + kz = nir_channel(b, k, 2); + + /* Calculate shear constants */ + nir_ssa_def *sz = nir_frcp(b, nir_vector_extract(b, dir, kz)); + nir_ssa_def *sx = nir_fmul(b, nir_vector_extract(b, dir, kx), sz); + nir_ssa_def *sy = nir_fmul(b, nir_vector_extract(b, dir, ky), sz); + + /* Calculate vertices relative to ray origin */ + nir_ssa_def *v_a = nir_fsub(b, node_coords[0], origin); + nir_ssa_def *v_b = nir_fsub(b, node_coords[1], origin); + nir_ssa_def *v_c = nir_fsub(b, node_coords[2], origin); + + /* Perform shear and scale */ + nir_ssa_def *ax = nir_fsub(b, nir_vector_extract(b, v_a, kx), nir_fmul(b, sx, nir_vector_extract(b, v_a, kz))); + nir_ssa_def *ay = nir_fsub(b, nir_vector_extract(b, v_a, ky), nir_fmul(b, sy, nir_vector_extract(b, v_a, kz))); + nir_ssa_def *bx = nir_fsub(b, nir_vector_extract(b, v_b, kx), nir_fmul(b, sx, nir_vector_extract(b, v_b, kz))); + nir_ssa_def *by = nir_fsub(b, nir_vector_extract(b, v_b, ky), nir_fmul(b, sy, nir_vector_extract(b, v_b, kz))); + nir_ssa_def *cx = nir_fsub(b, nir_vector_extract(b, v_c, kx), nir_fmul(b, sx, nir_vector_extract(b, v_c, kz))); + nir_ssa_def *cy = nir_fsub(b, nir_vector_extract(b, v_c, ky), nir_fmul(b, sy, nir_vector_extract(b, v_c, kz))); + + nir_ssa_def *u = nir_fsub(b, nir_fmul(b, cx, by), nir_fmul(b, cy, bx)); + nir_ssa_def *v = nir_fsub(b, nir_fmul(b, ax, cy), nir_fmul(b, ay, cx)); + nir_ssa_def *w = nir_fsub(b, nir_fmul(b, bx, ay), nir_fmul(b, by, ax)); + + nir_variable *u_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_float_type(), "u"); + nir_variable *v_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_float_type(), "v"); + nir_variable *w_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_float_type(), "w"); + nir_store_var(b, u_var, u, 0x1); + nir_store_var(b, v_var, v, 0x1); + nir_store_var(b, w_var, w, 0x1); + + /* Fall back to testing edges with double precision... + * + * The Vulkan spec only requires single precision watertightness, + * but we fail dEQP-VK.ray_tracing_pipeline.watertightness.closedFan2.1024 with + * failures = 1 without doing this. 
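+ * (The idea: u, v and w are 2D edge functions, and an exact 0.0f means the hit lies + * on a shared edge or vertex, where neighbouring triangles must make consistent + * inside/outside decisions; recomputing only those cases in double precision keeps + * the signs consistent so the fan stays closed, at the cost of a divergent + * double-precision path.)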
:( */ + nir_ssa_def *cond_retest = nir_ior(b, nir_ior(b, + nir_feq(b, u, nir_imm_float(b, 0.0f)), + nir_feq(b, v, nir_imm_float(b, 0.0f))), + nir_feq(b, w, nir_imm_float(b, 0.0f))); + + nir_push_if(b, cond_retest); + { + ax = nir_f2f64(b, ax); ay = nir_f2f64(b, ay); + bx = nir_f2f64(b, bx); by = nir_f2f64(b, by); + cx = nir_f2f64(b, cx); cy = nir_f2f64(b, cy); + + nir_store_var(b, u_var, nir_f2f32(b, nir_fsub(b, nir_fmul(b, cx, by), nir_fmul(b, cy, bx))), 0x1); + nir_store_var(b, v_var, nir_f2f32(b, nir_fsub(b, nir_fmul(b, ax, cy), nir_fmul(b, ay, cx))), 0x1); + nir_store_var(b, w_var, nir_f2f32(b, nir_fsub(b, nir_fmul(b, bx, ay), nir_fmul(b, by, ax))), 0x1); + } + nir_pop_if(b, NULL); + + u = nir_load_var(b, u_var); + v = nir_load_var(b, v_var); + w = nir_load_var(b, w_var); + + /* Perform edge tests. */ + nir_ssa_def *cond_back = nir_ior(b, nir_ior(b, + nir_flt(b, u, nir_imm_float(b, 0.0f)), + nir_flt(b, v, nir_imm_float(b, 0.0f))), + nir_flt(b, w, nir_imm_float(b, 0.0f))); + + nir_ssa_def *cond_front = nir_ior(b, nir_ior(b, + nir_flt(b, nir_imm_float(b, 0.0f), u), + nir_flt(b, nir_imm_float(b, 0.0f), v)), + nir_flt(b, nir_imm_float(b, 0.0f), w)); + + nir_ssa_def *cond = nir_inot(b, nir_iand(b, cond_back, cond_front)); + + nir_push_if(b, cond); + { + nir_ssa_def *det = nir_fadd(b, u, nir_fadd(b, v, w)); + + nir_ssa_def *az = nir_fmul(b, sz, nir_vector_extract(b, v_a, kz)); + nir_ssa_def *bz = nir_fmul(b, sz, nir_vector_extract(b, v_b, kz)); + nir_ssa_def *cz = nir_fmul(b, sz, nir_vector_extract(b, v_c, kz)); + + nir_ssa_def *t = nir_fadd(b, nir_fadd(b, nir_fmul(b, u, az), nir_fmul(b, v, bz)), nir_fmul(b, w, cz)); + + nir_ssa_def *t_signed = nir_fmul(b, nir_fsign(b, det), t); + + nir_ssa_def *det_cond_front = nir_inot(b, nir_flt(b, t_signed, nir_imm_float(b, 0.0f))); + + nir_push_if(b, det_cond_front); + { + nir_ssa_def *indices[4] = { + t, det, + v, w + }; + nir_store_var(b, result, nir_vec(b, indices, 4), 0xf); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + + return nir_load_var(b, result); +} + +static void +insert_traversal(struct radv_device *device, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + nir_builder *b, const struct rt_variables *vars) +{ + unsigned stack_entry_size = 4; + unsigned lanes = b->shader->info.workgroup_size[0] * b->shader->info.workgroup_size[1] * + b->shader->info.workgroup_size[2]; + unsigned stack_entry_stride = stack_entry_size * lanes; + nir_ssa_def *stack_entry_stride_def = nir_imm_int(b, stack_entry_stride); + nir_ssa_def *stack_base = + nir_iadd(b, nir_imm_int(b, b->shader->info.shared_size), + nir_imul(b, nir_load_subgroup_invocation(b), nir_imm_int(b, stack_entry_size))); + + /* + * A top-level AS can contain 2^24 children and a bottom-level AS can contain 2^24 triangles. At + * a branching factor of 4, that means we may need up to 24 levels of box nodes + 1 triangle node + * + 1 instance node. Furthermore, when processing a box node, worst case we actually push all 4 + * children and remove one, so the DFS stack depth is box nodes * 3 + 2. + */ + b->shader->info.shared_size += stack_entry_stride * 76; + assert(b->shader->info.shared_size <= 32768); + + nir_ssa_def *accel_struct = nir_load_var(b, vars->accel_struct); + + struct rt_traversal_vars trav_vars = init_traversal_vars(b); + + /* Initialize the follow-up shader idx to 0, to be replaced by the miss shader + * if we actually miss. 
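+ * (idx is the case selector of the combined shader's main loop: 0 returns to the + * caller, 1 is this traversal shader and the shader of group i starts at i + 2. A hit + * overwrites it via load_sbt_entry(), so an idx that is still 0 afterwards means no + * closest-hit entry was loaded and the miss shader has to be fetched instead.)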
*/ + nir_store_var(b, vars->idx, nir_imm_int(b, 0), 1); + + nir_store_var(b, trav_vars.should_return, nir_imm_bool(b, false), 1); + + nir_push_if(b, nir_ine(b, accel_struct, nir_imm_int64(b, 0))); + { + nir_store_var(b, trav_vars.bvh_base, build_addr_to_node(b, accel_struct), 1); + + nir_ssa_def *bvh_root = + nir_build_load_global(b, 1, 32, accel_struct, .access = ACCESS_NON_WRITEABLE, + .align_mul = 64, .align_offset = 0); + + /* We create a BVH descriptor that covers the entire memory range. That way we can always + * use the same descriptor, which avoids divergence when different rays hit different + * instances at the cost of having to use 64-bit node ids. */ + const uint64_t bvh_size = 1ull << 42; + nir_ssa_def *desc = nir_imm_ivec4( + b, 0, 1u << 31 /* Enable box sorting */, (bvh_size - 1) & 0xFFFFFFFFu, + ((bvh_size - 1) >> 32) | (1u << 24 /* Return IJ for triangles */) | (1u << 31)); + + nir_ssa_def *vec3ones = nir_channels(b, nir_imm_vec4(b, 1.0, 1.0, 1.0, 1.0), 0x7); + nir_store_var(b, trav_vars.origin, nir_load_var(b, vars->origin), 7); + nir_store_var(b, trav_vars.dir, nir_load_var(b, vars->direction), 7); + nir_store_var(b, trav_vars.inv_dir, nir_fdiv(b, vec3ones, nir_load_var(b, trav_vars.dir)), 7); + nir_store_var(b, trav_vars.sbt_offset_and_flags, nir_imm_int(b, 0), 1); + nir_store_var(b, trav_vars.instance_addr, nir_imm_int64(b, 0), 1); + + nir_store_var(b, trav_vars.stack, nir_iadd(b, stack_base, stack_entry_stride_def), 1); + nir_store_shared(b, bvh_root, stack_base, .base = 0, .write_mask = 0x1, + .align_mul = stack_entry_size, .align_offset = 0); + + nir_store_var(b, trav_vars.top_stack, nir_imm_int(b, 0), 1); + + nir_push_loop(b); + + nir_push_if(b, nir_ieq(b, nir_load_var(b, trav_vars.stack), stack_base)); + nir_jump(b, nir_jump_break); + nir_pop_if(b, NULL); + + nir_push_if( + b, nir_uge(b, nir_load_var(b, trav_vars.top_stack), nir_load_var(b, trav_vars.stack))); + nir_store_var(b, trav_vars.top_stack, nir_imm_int(b, 0), 1); + nir_store_var(b, trav_vars.bvh_base, + build_addr_to_node(b, nir_load_var(b, vars->accel_struct)), 1); + nir_store_var(b, trav_vars.origin, nir_load_var(b, vars->origin), 7); + nir_store_var(b, trav_vars.dir, nir_load_var(b, vars->direction), 7); + nir_store_var(b, trav_vars.inv_dir, nir_fdiv(b, vec3ones, nir_load_var(b, trav_vars.dir)), 7); + nir_store_var(b, trav_vars.instance_addr, nir_imm_int64(b, 0), 1); + + nir_pop_if(b, NULL); + + nir_store_var(b, trav_vars.stack, + nir_isub(b, nir_load_var(b, trav_vars.stack), stack_entry_stride_def), 1); + + nir_ssa_def *bvh_node = nir_load_shared(b, 1, 32, nir_load_var(b, trav_vars.stack), .base = 0, + .align_mul = stack_entry_size, .align_offset = 0); + nir_ssa_def *bvh_node_type = nir_iand(b, bvh_node, nir_imm_int(b, 7)); + + bvh_node = nir_iadd(b, nir_load_var(b, trav_vars.bvh_base), nir_u2u(b, bvh_node, 64)); + nir_ssa_def *intrinsic_result = NULL; + if (device->physical_device->rad_info.chip_class >= GFX10_3 + && !(device->instance->perftest_flags & RADV_PERFTEST_FORCE_EMULATE_RT)) { + intrinsic_result = nir_bvh64_intersect_ray_amd( + b, 32, desc, nir_unpack_64_2x32(b, bvh_node), nir_load_var(b, vars->tmax), + nir_load_var(b, trav_vars.origin), nir_load_var(b, trav_vars.dir), + nir_load_var(b, trav_vars.inv_dir)); + } + + nir_push_if(b, nir_ine(b, nir_iand(b, bvh_node_type, nir_imm_int(b, 4)), nir_imm_int(b, 0))); + { + nir_push_if(b, + nir_ine(b, nir_iand(b, bvh_node_type, nir_imm_int(b, 2)), nir_imm_int(b, 0))); + { + /* custom */ + nir_push_if( + b, nir_ine(b, nir_iand(b, bvh_node_type, 
nir_imm_int(b, 1)), nir_imm_int(b, 0))); + { + insert_traversal_aabb_case(device, pCreateInfo, b, vars, &trav_vars, bvh_node); + } + nir_push_else(b, NULL); + { + /* instance */ + nir_ssa_def *instance_node_addr = build_node_to_addr(device, b, bvh_node); + nir_ssa_def *instance_data = nir_build_load_global( + b, 4, 32, instance_node_addr, .align_mul = 64, .align_offset = 0); + nir_ssa_def *wto_matrix[] = { + nir_build_load_global(b, 4, 32, + nir_iadd(b, instance_node_addr, nir_imm_int64(b, 16)), + .align_mul = 64, .align_offset = 16), + nir_build_load_global(b, 4, 32, + nir_iadd(b, instance_node_addr, nir_imm_int64(b, 32)), + .align_mul = 64, .align_offset = 32), + nir_build_load_global(b, 4, 32, + nir_iadd(b, instance_node_addr, nir_imm_int64(b, 48)), + .align_mul = 64, .align_offset = 48)}; + nir_ssa_def *instance_id = nir_build_load_global( + b, 1, 32, nir_iadd(b, instance_node_addr, nir_imm_int64(b, 88)), .align_mul = 4, + .align_offset = 0); + nir_ssa_def *instance_and_mask = nir_channel(b, instance_data, 2); + nir_ssa_def *instance_mask = nir_ushr(b, instance_and_mask, nir_imm_int(b, 24)); + + nir_push_if(b, + nir_ieq(b, nir_iand(b, instance_mask, nir_load_var(b, vars->cull_mask)), + nir_imm_int(b, 0))); + nir_jump(b, nir_jump_continue); + nir_pop_if(b, NULL); + + nir_store_var(b, trav_vars.top_stack, nir_load_var(b, trav_vars.stack), 1); + nir_store_var(b, trav_vars.bvh_base, + build_addr_to_node( + b, nir_pack_64_2x32(b, nir_channels(b, instance_data, 0x3))), + 1); + nir_store_shared(b, + nir_iand(b, nir_channel(b, instance_data, 0), nir_imm_int(b, 63)), + nir_load_var(b, trav_vars.stack), .base = 0, .write_mask = 0x1, + .align_mul = stack_entry_size, .align_offset = 0); + nir_store_var(b, trav_vars.stack, + nir_iadd(b, nir_load_var(b, trav_vars.stack), stack_entry_stride_def), + 1); + + nir_store_var( + b, trav_vars.origin, + nir_build_vec3_mat_mult_pre(b, nir_load_var(b, vars->origin), wto_matrix), 7); + nir_store_var( + b, trav_vars.dir, + nir_build_vec3_mat_mult(b, nir_load_var(b, vars->direction), wto_matrix, false), + 7); + nir_store_var(b, trav_vars.inv_dir, + nir_fdiv(b, vec3ones, nir_load_var(b, trav_vars.dir)), 7); + nir_store_var(b, trav_vars.custom_instance_and_mask, instance_and_mask, 1); + nir_store_var(b, trav_vars.sbt_offset_and_flags, nir_channel(b, instance_data, 3), + 1); + nir_store_var(b, trav_vars.instance_id, instance_id, 1); + nir_store_var(b, trav_vars.instance_addr, instance_node_addr, 1); + } + nir_pop_if(b, NULL); + } + nir_push_else(b, NULL); + { + /* box */ + nir_ssa_def *result = intrinsic_result; + if (!result) { + /* If we didn't run the intrinsic because the hardware didn't support it, + * emulate ray/box intersection here */ + result = intersect_ray_amd_software_box(device, + b, bvh_node, nir_load_var(b, vars->tmax), nir_load_var(b, trav_vars.origin), + nir_load_var(b, trav_vars.dir), nir_load_var(b, trav_vars.inv_dir)); + } + + for (unsigned i = 4; i-- > 0; ) { + nir_ssa_def *new_node = nir_vector_extract(b, result, nir_imm_int(b, i)); + nir_push_if(b, nir_ine(b, new_node, nir_imm_int(b, 0xffffffff))); + { + nir_store_shared(b, new_node, nir_load_var(b, trav_vars.stack), .base = 0, + .write_mask = 0x1, .align_mul = stack_entry_size, + .align_offset = 0); + nir_store_var( + b, trav_vars.stack, + nir_iadd(b, nir_load_var(b, trav_vars.stack), stack_entry_stride_def), 1); + } + nir_pop_if(b, NULL); + } + } + nir_pop_if(b, NULL); + } + nir_push_else(b, NULL); + { + nir_ssa_def *result = intrinsic_result; + if (!result) { + /* If we didn't run the intrinsic because the hardware didn't support it, + * emulate ray/tri intersection here */ + result = intersect_ray_amd_software_tri(device, + b, bvh_node, nir_load_var(b, vars->tmax), nir_load_var(b, trav_vars.origin), + nir_load_var(b, trav_vars.dir), nir_load_var(b, trav_vars.inv_dir)); + } + insert_traversal_triangle_case(device, pCreateInfo, b, result, vars, &trav_vars, bvh_node); + } + nir_pop_if(b, NULL); + + nir_pop_loop(b, NULL); + } + nir_pop_if(b, NULL); + + /* should_return is set if we had a hit but we won't be calling the closest hit shader and hence + * need to return immediately to the calling shader. */ + nir_push_if(b, nir_load_var(b, trav_vars.should_return)); + { + insert_rt_return(b, vars); + } + nir_push_else(b, NULL); + { + /* Only load the miss shader if we actually miss, which we determine by not having set + * a closest hit shader. It is valid to not specify an SBT pointer for miss shaders if none + * of the rays miss. */ + nir_push_if(b, nir_ieq(b, nir_load_var(b, vars->idx), nir_imm_int(b, 0))); + { + load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, 0); + } + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); +} + +static unsigned +compute_rt_stack_size(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const struct radv_pipeline_shader_stack_size *stack_sizes) +{ + unsigned raygen_size = 0; + unsigned callable_size = 0; + unsigned chit_size = 0; + unsigned miss_size = 0; + unsigned non_recursive_size = 0; + + for (unsigned i = 0; i < pCreateInfo->groupCount; ++i) { + non_recursive_size = MAX2(stack_sizes[i].non_recursive_size, non_recursive_size); + + const VkRayTracingShaderGroupCreateInfoKHR *group_info = &pCreateInfo->pGroups[i]; + uint32_t shader_id = VK_SHADER_UNUSED_KHR; + unsigned size = stack_sizes[i].recursive_size; + + switch (group_info->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: + shader_id = group_info->generalShader; + break; + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + shader_id = group_info->closestHitShader; + break; + default: + break; + } + if (shader_id == VK_SHADER_UNUSED_KHR) + continue; + + const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->pStages[shader_id]; + switch (stage->stage) { + case VK_SHADER_STAGE_RAYGEN_BIT_KHR: + raygen_size = MAX2(raygen_size, size); + break; + case VK_SHADER_STAGE_MISS_BIT_KHR: + miss_size = MAX2(miss_size, size); + break; + case VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR: + chit_size = MAX2(chit_size, size); + break; + case VK_SHADER_STAGE_CALLABLE_BIT_KHR: + callable_size = MAX2(callable_size, size); + break; + default: + unreachable("Invalid stage type in RT shader"); + } + } + return raygen_size + + MIN2(pCreateInfo->maxPipelineRayRecursionDepth, 1) * + MAX2(MAX2(chit_size, miss_size), non_recursive_size) + + MAX2(0, (int)(pCreateInfo->maxPipelineRayRecursionDepth) - 1) * + MAX2(chit_size, miss_size) + + 2 * callable_size; +} + +bool +radv_rt_pipeline_has_dynamic_stack_size(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo) +{ + if (!pCreateInfo->pDynamicState) + return false; + + for (unsigned i = 0; i < pCreateInfo->pDynamicState->dynamicStateCount; ++i) { + if (pCreateInfo->pDynamicState->pDynamicStates[i] == + VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR) + return true; + } + + return false; +} + +static nir_shader * +create_rt_shader(struct radv_device *device, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + struct radv_pipeline_shader_stack_size 
*stack_sizes) +{ + RADV_FROM_HANDLE(radv_pipeline_layout, layout, pCreateInfo->layout); + struct radv_pipeline_key key; + memset(&key, 0, sizeof(key)); + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, "rt_combined"); + + b.shader->info.workgroup_size[0] = 8; + b.shader->info.workgroup_size[1] = 8; + b.shader->info.workgroup_size[2] = 1; + + struct rt_variables vars = create_rt_variables(b.shader, stack_sizes); + load_sbt_entry(&b, &vars, nir_imm_int(&b, 0), SBT_RAYGEN, 0); + nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1); + + nir_store_var(&b, vars.main_loop_case_visited, nir_imm_bool(&b, true), 1); + + nir_loop *loop = nir_push_loop(&b); + + nir_push_if(&b, nir_ior(&b, nir_ieq(&b, nir_load_var(&b, vars.idx), nir_imm_int(&b, 0)), + nir_ine(&b, nir_load_var(&b, vars.main_loop_case_visited), + nir_imm_bool(&b, true)))); + nir_jump(&b, nir_jump_break); + nir_pop_if(&b, NULL); + + nir_store_var(&b, vars.main_loop_case_visited, nir_imm_bool(&b, false), 1); + + nir_push_if(&b, nir_ieq(&b, nir_load_var(&b, vars.idx), nir_imm_int(&b, 1))); + nir_store_var(&b, vars.main_loop_case_visited, nir_imm_bool(&b, true), 1); + insert_traversal(device, pCreateInfo, &b, &vars); + nir_pop_if(&b, NULL); + + nir_ssa_def *idx = nir_load_var(&b, vars.idx); + + /* We do a trick with the indexing of the resume shaders so that the first + * shader of group x always gets id x and the resume shader ids then come after + * groupCount. This makes the shadergroup handles independent of compilation. */ + unsigned call_idx_base = pCreateInfo->groupCount + 1; + for (unsigned i = 0; i < pCreateInfo->groupCount; ++i) { + const VkRayTracingShaderGroupCreateInfoKHR *group_info = &pCreateInfo->pGroups[i]; + uint32_t shader_id = VK_SHADER_UNUSED_KHR; + + switch (group_info->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: + shader_id = group_info->generalShader; + break; + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + shader_id = group_info->closestHitShader; + break; + default: + break; + } + if (shader_id == VK_SHADER_UNUSED_KHR) + continue; + + const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->pStages[shader_id]; + nir_shader *nir_stage = parse_rt_stage(device, layout, stage); + + b.shader->options = nir_stage->options; + + uint32_t num_resume_shaders = 0; + nir_shader **resume_shaders = NULL; + nir_lower_shader_calls(nir_stage, nir_address_format_32bit_offset, 16, &resume_shaders, + &num_resume_shaders, nir_stage); + + vars.group_idx = i; + insert_rt_case(&b, nir_stage, &vars, idx, call_idx_base, i + 2); + for (unsigned j = 0; j < num_resume_shaders; ++j) { + insert_rt_case(&b, resume_shaders[j], &vars, idx, call_idx_base, call_idx_base + 1 + j); + } + call_idx_base += num_resume_shaders; + } + + nir_pop_loop(&b, loop); + + if (radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo)) { + /* Put something so scratch gets enabled in the shader. */ + b.shader->scratch_size = 16; + } else + b.shader->scratch_size = compute_rt_stack_size(pCreateInfo, stack_sizes); + + /* Deal with all the inline functions. 
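+ * (As a concrete example of the id scheme above, with groupCount = 2: id 0 = return, + * id 1 = traversal, ids 2 and 3 = the first shader of each group, and resume shaders + * are numbered from groupCount + 2 = 4 upwards, so adding or removing resume shaders + * never shifts the shader group handles.)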
*/ + nir_index_ssa_defs(nir_shader_get_entrypoint(b.shader)); + nir_metadata_preserve(nir_shader_get_entrypoint(b.shader), nir_metadata_none); + + return b.shader; +} + +static VkResult +radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline) +{ + RADV_FROM_HANDLE(radv_device, device, _device); + VkResult result; + struct radv_pipeline *pipeline = NULL; + struct radv_pipeline_shader_stack_size *stack_sizes = NULL; + uint8_t hash[20]; + nir_shader *shader = NULL; + bool keep_statistic_info = + (pCreateInfo->flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) || + (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) || device->keep_shader_info; + + if (pCreateInfo->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR) + return radv_rt_pipeline_library_create(_device, _cache, pCreateInfo, pAllocator, pPipeline); + + VkRayTracingPipelineCreateInfoKHR local_create_info = + radv_create_merged_rt_create_info(pCreateInfo); + if (!local_create_info.pStages || !local_create_info.pGroups) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + radv_hash_rt_shaders(hash, &local_create_info, radv_get_hash_flags(device, keep_statistic_info)); + struct vk_shader_module module = {.base.type = VK_OBJECT_TYPE_SHADER_MODULE}; + + VkComputePipelineCreateInfo compute_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .pNext = NULL, + .flags = pCreateInfo->flags | VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT, + .stage = + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = vk_shader_module_to_handle(&module), + .pName = "main", + }, + .layout = pCreateInfo->layout, + }; + + /* First check if we can get things from the cache before we take the expensive step of + * generating the nir. 
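+ * The FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT flag set above makes this first + * radv_compute_pipeline_create() call return VK_PIPELINE_COMPILE_REQUIRED_EXT on a + * cache miss instead of compiling anything; only in that case is the combined NIR + * shader built and the create retried below with the caller's original flags.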
*/ + result = radv_compute_pipeline_create(_device, _cache, &compute_info, pAllocator, hash, + stack_sizes, local_create_info.groupCount, pPipeline); + if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT) { + stack_sizes = calloc(sizeof(*stack_sizes), local_create_info.groupCount); + if (!stack_sizes) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto fail; + } + + shader = create_rt_shader(device, &local_create_info, stack_sizes); + module.nir = shader; + compute_info.flags = pCreateInfo->flags; + result = radv_compute_pipeline_create(_device, _cache, &compute_info, pAllocator, hash, + stack_sizes, local_create_info.groupCount, pPipeline); + stack_sizes = NULL; + + if (result != VK_SUCCESS) + goto shader_fail; + } + pipeline = radv_pipeline_from_handle(*pPipeline); + + pipeline->compute.rt_group_handles = + calloc(sizeof(*pipeline->compute.rt_group_handles), local_create_info.groupCount); + if (!pipeline->compute.rt_group_handles) { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + goto shader_fail; + } + + pipeline->compute.dynamic_stack_size = radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo); + + for (unsigned i = 0; i < local_create_info.groupCount; ++i) { + const VkRayTracingShaderGroupCreateInfoKHR *group_info = &local_create_info.pGroups[i]; + switch (group_info->type) { + case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: + if (group_info->generalShader != VK_SHADER_UNUSED_KHR) + pipeline->compute.rt_group_handles[i].handles[0] = i + 2; + break; + case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: + if (group_info->intersectionShader != VK_SHADER_UNUSED_KHR) + pipeline->compute.rt_group_handles[i].handles[1] = i + 2; + FALLTHROUGH; + case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: + if (group_info->closestHitShader != VK_SHADER_UNUSED_KHR) + pipeline->compute.rt_group_handles[i].handles[0] = i + 2; + if (group_info->anyHitShader != VK_SHADER_UNUSED_KHR) + pipeline->compute.rt_group_handles[i].handles[1] = i + 2; + break; + case VK_SHADER_GROUP_SHADER_MAX_ENUM_KHR: + unreachable("VK_SHADER_GROUP_SHADER_MAX_ENUM_KHR"); + } + } + +shader_fail: + if (result != VK_SUCCESS && pipeline) + radv_pipeline_destroy(device, pipeline, pAllocator); + ralloc_free(shader); +fail: + free((void *)local_create_info.pGroups); + free((void *)local_create_info.pStages); + free(stack_sizes); + return result; +} + +VkResult +radv_CreateRayTracingPipelinesKHR(VkDevice _device, VkDeferredOperationKHR deferredOperation, + VkPipelineCache pipelineCache, uint32_t count, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfos, + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines) +{ + VkResult result = VK_SUCCESS; + + unsigned i = 0; + for (; i < count; i++) { + VkResult r; + r = radv_rt_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, + &pPipelines[i]); + if (r != VK_SUCCESS) { + result = r; + pPipelines[i] = VK_NULL_HANDLE; + + if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT) + break; + } + } + + for (; i < count; ++i) + pPipelines[i] = VK_NULL_HANDLE; + + return result; +} + +VkResult +radv_GetRayTracingShaderGroupHandlesKHR(VkDevice device, VkPipeline _pipeline, uint32_t firstGroup, + uint32_t groupCount, size_t dataSize, void *pData) +{ + RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); + char *data = pData; + + STATIC_ASSERT(sizeof(*pipeline->compute.rt_group_handles) <= RADV_RT_HANDLE_SIZE); + + memset(data, 0, groupCount * RADV_RT_HANDLE_SIZE); + + for (uint32_t i = 0; i < groupCount; ++i) { + memcpy(data + i * 
RADV_RT_HANDLE_SIZE, &pipeline->compute.rt_group_handles[firstGroup + i], + sizeof(*pipeline->compute.rt_group_handles)); + } + + return VK_SUCCESS; +} + +VkDeviceSize +radv_GetRayTracingShaderGroupStackSizeKHR(VkDevice device, VkPipeline _pipeline, uint32_t group, + VkShaderGroupShaderKHR groupShader) +{ + RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline); + const struct radv_pipeline_shader_stack_size *stack_size = + &pipeline->compute.rt_stack_sizes[group]; + + if (groupShader == VK_SHADER_GROUP_SHADER_ANY_HIT_KHR || + groupShader == VK_SHADER_GROUP_SHADER_INTERSECTION_KHR) + return stack_size->non_recursive_size; + else + return stack_size->recursive_size; +} diff --git a/mesa 3D driver/src/amd/vulkan/radv_private.h b/mesa 3D driver/src/amd/vulkan/radv_private.h index 970b9907dd..896a7672cc 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_private.h +++ b/mesa 3D driver/src/amd/vulkan/radv_private.h @@ -59,8 +59,11 @@ #include "vk_device.h" #include "vk_format.h" #include "vk_instance.h" +#include "vk_log.h" #include "vk_physical_device.h" #include "vk_shader_module.h" +#include "vk_command_buffer.h" +#include "vk_queue.h" #include "vk_util.h" #include "ac_binary.h" @@ -71,6 +74,7 @@ #include "radv_constants.h" #include "radv_descriptor_set.h" #include "radv_radeon_winsys.h" +#include "radv_shader.h" #include "sid.h" /* Pre-declarations needed for WSI entrypoints */ @@ -89,6 +93,11 @@ typedef uint32_t xcb_window_t; #include "wsi_common.h" +#ifdef __cplusplus +extern "C" +{ +#endif + /* Helper to determine if we should compile * any of the Android AHB support. * @@ -201,39 +210,11 @@ radv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) struct radv_image_view; struct radv_instance; -VkResult __vk_errorv(struct radv_instance *instance, const void *object, - VkDebugReportObjectTypeEXT type, VkResult error, const char *file, int line, - const char *format, va_list args); - -VkResult __vk_errorf(struct radv_instance *instance, const void *object, - VkDebugReportObjectTypeEXT type, VkResult error, const char *file, int line, - const char *format, ...) radv_printflike(7, 8); - -#define vk_error(instance, error) \ - __vk_errorf(instance, NULL, VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, error, __FILE__, __LINE__, \ - NULL); -#define vk_errorf(instance, error, format, ...) \ - __vk_errorf(instance, NULL, VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, error, __FILE__, __LINE__, \ - format, ##__VA_ARGS__); - -void __radv_finishme(const char *file, int line, const char *format, ...) radv_printflike(3, 4); void radv_loge(const char *format, ...) radv_printflike(1, 2); void radv_loge_v(const char *format, va_list va); void radv_logi(const char *format, ...) radv_printflike(1, 2); void radv_logi_v(const char *format, va_list va); -/** - * Print a FINISHME message, including its source location. - */ -#define radv_finishme(format, ...) \ - do { \ - static bool reported = false; \ - if (!reported) { \ - __radv_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \ - reported = true; \ - } \ - } while (0) - /* A non-fatal assert. Useful for debugging. */ #ifdef NDEBUG #define radv_assert(x) \ @@ -282,6 +263,9 @@ struct radv_physical_device { /* Whether to enable NGG. */ bool use_ngg; + /* Whether to enable NGG culling. */ + bool use_ngg_culling; + /* Whether to enable NGG streamout. 
*/ bool use_ngg_streamout; @@ -310,6 +294,8 @@ struct radv_physical_device { dev_t primary_devid; dev_t render_devid; #endif + + nir_shader_compiler_options nir_options; }; struct radv_instance { @@ -356,52 +342,23 @@ struct radv_pipeline_cache { VkAllocationCallbacks alloc; }; -struct radv_pipeline_key { - uint32_t instance_rate_inputs; - uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; - uint8_t vertex_attribute_formats[MAX_VERTEX_ATTRIBS]; - uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS]; - uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS]; - uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS]; - uint8_t vertex_binding_align[MAX_VBS]; - enum ac_fetch_format vertex_alpha_adjust[MAX_VERTEX_ATTRIBS]; - uint32_t vertex_post_shuffle; - unsigned tess_input_vertices; - uint32_t col_format; - uint32_t is_int8; - uint32_t is_int10; - uint8_t log2_ps_iter_samples; - uint8_t num_samples; - uint32_t has_multiview_view_index : 1; - uint32_t optimisations_disabled : 1; - uint32_t provoking_vtx_last : 1; - uint8_t topology; - - /* Non-zero if a required subgroup size is specified via - * VK_EXT_subgroup_size_control. - */ - uint8_t compute_subgroup_size; - bool require_full_subgroups; -}; - struct radv_shader_binary; struct radv_shader_variant; +struct radv_pipeline_shader_stack_size; void radv_pipeline_cache_init(struct radv_pipeline_cache *cache, struct radv_device *device); void radv_pipeline_cache_finish(struct radv_pipeline_cache *cache); bool radv_pipeline_cache_load(struct radv_pipeline_cache *cache, const void *data, size_t size); -bool radv_create_shader_variants_from_pipeline_cache(struct radv_device *device, - struct radv_pipeline_cache *cache, - const unsigned char *sha1, - struct radv_shader_variant **variants, - bool *found_in_application_cache); +bool radv_create_shader_variants_from_pipeline_cache( + struct radv_device *device, struct radv_pipeline_cache *cache, const unsigned char *sha1, + struct radv_shader_variant **variants, struct radv_pipeline_shader_stack_size **stack_sizes, + uint32_t *num_stack_sizes, bool *found_in_application_cache); -void radv_pipeline_cache_insert_shaders(struct radv_device *device, - struct radv_pipeline_cache *cache, - const unsigned char *sha1, - struct radv_shader_variant **variants, - struct radv_shader_binary *const *binaries); +void radv_pipeline_cache_insert_shaders( + struct radv_device *device, struct radv_pipeline_cache *cache, const unsigned char *sha1, + struct radv_shader_variant **variants, struct radv_shader_binary *const *binaries, + const struct radv_pipeline_shader_stack_size *stack_sizes, uint32_t num_stack_sizes); enum radv_blit_ds_layout { RADV_BLIT_DS_LAYOUT_TILE_ENABLE, @@ -486,7 +443,7 @@ struct radv_meta_state { VkDescriptorSetLayout copy_vrs_htile_ds_layout; /* Clear DCC with comp-to-single. 
*/ - VkPipeline clear_dcc_comp_to_single_pipeline; + VkPipeline clear_dcc_comp_to_single_pipeline[2]; /* 0: 1x, 1: 2x/4x/8x */ VkPipelineLayout clear_dcc_comp_to_single_p_layout; VkDescriptorSetLayout clear_dcc_comp_to_single_ds_layout; @@ -630,6 +587,10 @@ struct radv_meta_state { VkRenderPass pass; } depth_decomp[MAX_SAMPLES_LOG2]; + VkDescriptorSetLayout expand_depth_stencil_compute_ds_layout; + VkPipelineLayout expand_depth_stencil_compute_p_layout; + VkPipeline expand_depth_stencil_compute_pipeline; + struct { VkPipelineLayout p_layout; VkPipeline cmask_eliminate_pipeline; @@ -669,7 +630,7 @@ struct radv_meta_state { struct { VkDescriptorSetLayout ds_layout; VkPipelineLayout p_layout; - VkPipeline pipeline; + VkPipeline pipeline[32]; } dcc_retile; struct { @@ -677,6 +638,8 @@ struct radv_meta_state { VkPipeline leaf_pipeline; VkPipelineLayout internal_p_layout; VkPipeline internal_pipeline; + VkPipelineLayout copy_p_layout; + VkPipeline copy_pipeline; } accel_struct_build; }; @@ -697,13 +660,10 @@ struct radv_deferred_queue_submission; enum ring_type radv_queue_family_to_ring(int f); struct radv_queue { - struct vk_object_base base; + struct vk_queue vk; struct radv_device *device; struct radeon_winsys_ctx *hw_ctx; enum radeon_ctx_priority priority; - uint32_t queue_family_index; - int queue_idx; - VkDeviceQueueCreateFlags flags; uint32_t scratch_size_per_wave; uint32_t scratch_waves; @@ -810,8 +770,12 @@ struct radv_device { */ uint32_t image_mrt_offset_counter; uint32_t fmask_mrt_offset_counter; - struct list_head shader_slabs; - mtx_t shader_slab_mutex; + + struct list_head shader_arenas; + uint8_t shader_free_list_mask; + struct list_head shader_free_lists[RADV_SHADER_ALLOC_NUM_FREE_LISTS]; + struct list_head shader_block_obj_pool; + mtx_t shader_arena_mutex; /* For detecting VM faults reported by dmesg. */ uint64_t dmesg_timestamp; @@ -831,6 +795,9 @@ struct radv_device { /* Whether attachment VRS is enabled. */ bool attachment_vrs_enabled; + /* Whether shader image 32-bit float atomics are enabled. */ + bool image_float32_atomics; + /* Whether anisotropy is forced with RADV_TEX_ANISO (-1 is disabled). */ int force_aniso; @@ -862,8 +829,15 @@ struct radv_device { /* Depth image for VRS when not bound by the app. 
*/ struct { struct radv_image *image; + struct radv_buffer *buffer; /* HTILE */ struct radv_device_memory *mem; } vrs; + + struct u_rwlock vs_prologs_lock; + struct hash_table *vs_prologs; + + struct radv_shader_prolog *simple_vs_prologs[MAX_VERTEX_ATTRIBS]; + struct radv_shader_prolog *instance_rate_vs_prologs[816]; }; VkResult _radv_device_set_lost(struct radv_device *device, const char *file, int line, @@ -893,6 +867,10 @@ struct radv_device_memory { #endif }; +void radv_device_memory_init(struct radv_device_memory *mem, struct radv_device *device, + struct radeon_winsys_bo *bo); +void radv_device_memory_finish(struct radv_device_memory *mem); + struct radv_descriptor_range { uint64_t va; uint32_t size; @@ -991,6 +969,10 @@ struct radv_buffer { bool shareable; }; +void radv_buffer_init(struct radv_buffer *buffer, struct radv_device *device, + struct radeon_winsys_bo *bo, uint64_t size, uint64_t offset); +void radv_buffer_finish(struct radv_buffer *buffer); + enum radv_dynamic_state_bits { RADV_DYNAMIC_VIEWPORT = 1ull << 0, RADV_DYNAMIC_SCISSOR = 1ull << 1, @@ -1021,7 +1003,8 @@ enum radv_dynamic_state_bits { RADV_DYNAMIC_LOGIC_OP = 1ull << 26, RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27, RADV_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28, - RADV_DYNAMIC_ALL = (1ull << 29) - 1, + RADV_DYNAMIC_VERTEX_INPUT = 1ull << 29, + RADV_DYNAMIC_ALL = (1ull << 30) - 1, }; enum radv_cmd_dirty_bits { @@ -1056,12 +1039,14 @@ enum radv_cmd_dirty_bits { RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP = 1ull << 26, RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1ull << 27, RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE = 1ull << 28, - RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 29) - 1, - RADV_CMD_DIRTY_PIPELINE = 1ull << 29, - RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 30, - RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 31, - RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 32, - RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 33 + RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT = 1ull << 29, + RADV_CMD_DIRTY_DYNAMIC_ALL = (1ull << 30) - 1, + RADV_CMD_DIRTY_PIPELINE = 1ull << 30, + RADV_CMD_DIRTY_INDEX_BUFFER = 1ull << 31, + RADV_CMD_DIRTY_FRAMEBUFFER = 1ull << 32, + RADV_CMD_DIRTY_VERTEX_BUFFER = 1ull << 33, + RADV_CMD_DIRTY_STREAMOUT_BUFFER = 1ull << 34, + RADV_CMD_DIRTY_VERTEX_STATE = RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT, }; enum radv_cmd_flush_bits { @@ -1297,6 +1282,8 @@ void radv_initialise_color_surface(struct radv_device *device, struct radv_color struct radv_image_view *iview); void radv_initialise_ds_surface(struct radv_device *device, struct radv_ds_buffer_info *ds, struct radv_image_view *iview); +void radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer, + struct radv_ds_buffer_info *ds); /** * Attachment state when recording a renderpass instance. @@ -1371,6 +1358,7 @@ struct radv_cmd_state { struct radv_render_pass *pass; const struct radv_subpass *subpass; struct radv_dynamic_state dynamic; + struct radv_vs_input_state dynamic_vs_input; struct radv_attachment_state *attachments; struct radv_streamout_state streamout; VkRect2D render_area; @@ -1408,6 +1396,9 @@ struct radv_cmd_state { /* Whether CP DMA is busy/idle. */ bool dma_is_busy; + /* Whether any images that are not L2 coherent are dirty from the CB. */ + bool rb_noncoherent_dirty; + /* Conditional rendering info. 
*/ uint8_t predication_op; /* 32-bit or 64-bit predicate value */ int predication_type; /* -1: disabled, 0: normal, 1: inverted */ @@ -1434,6 +1425,14 @@ struct radv_cmd_state { /* Whether DRAW_{INDEX}_INDIRECT_MULTI is emitted. */ bool uses_draw_indirect_multi; + + uint32_t rt_stack_size; + + struct radv_shader_prolog *emitted_vs_prolog; + uint32_t *emitted_vs_prolog_key; + uint32_t emitted_vs_prolog_key_hash; + uint32_t vbo_misaligned_mask; + uint32_t vbo_bound_mask; }; struct radv_cmd_pool { @@ -1461,7 +1460,7 @@ enum radv_cmd_buffer_status { }; struct radv_cmd_buffer { - struct vk_object_base base; + struct vk_command_buffer vk; struct radv_device *device; @@ -1551,10 +1550,17 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va, uin void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer); void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer); + +unsigned radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs); +uint32_t radv_hash_vs_prolog(const void *key_); +bool radv_cmp_vs_prolog(const void *a_, const void *b_); + bool radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned *out_offset, void **ptr); void radv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass); +void radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer, + const struct radv_subpass *subpass); bool radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data, unsigned *out_offset); @@ -1571,7 +1577,7 @@ void radv_depth_stencil_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer, void radv_emit_default_sample_locations(struct radeon_cmdbuf *cs, int nr_samples); unsigned radv_get_default_max_sample_dist(int log_samples); void radv_device_init_msaa(struct radv_device *device); -VkResult radv_device_init_vrs_image(struct radv_device *device); +VkResult radv_device_init_vrs_state(struct radv_device *device); void radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview, @@ -1663,30 +1669,38 @@ radv_get_viewport_xform(const VkViewport *viewport, float scale[3], float transl void radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z); +void radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, + uint64_t va); + struct radv_event { struct vk_object_base base; struct radeon_winsys_bo *bo; uint64_t *map; }; -#define RADV_HASH_SHADER_NO_NGG (1 << 0) #define RADV_HASH_SHADER_CS_WAVE32 (1 << 1) #define RADV_HASH_SHADER_PS_WAVE32 (1 << 2) #define RADV_HASH_SHADER_GE_WAVE32 (1 << 3) #define RADV_HASH_SHADER_LLVM (1 << 4) -#define RADV_HASH_SHADER_DISCARD_TO_DEMOTE (1 << 5) -#define RADV_HASH_SHADER_MRT_NAN_FIXUP (1 << 6) -#define RADV_HASH_SHADER_INVARIANT_GEOM (1 << 7) #define RADV_HASH_SHADER_KEEP_STATISTICS (1 << 8) -#define RADV_HASH_SHADER_FORCE_VRS_2x2 (1 << 9) -#define RADV_HASH_SHADER_FORCE_VRS_2x1 (1 << 10) -#define RADV_HASH_SHADER_FORCE_VRS_1x2 (1 << 11) -#define RADV_HASH_SHADER_FORCE_NGG_CULLING (1 << 13) +#define RADV_HASH_SHADER_USE_NGG_CULLING (1 << 13) +#define RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS (1 << 14) +#define RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2 (1 << 15) +#define RADV_HASH_SHADER_FORCE_EMULATE_RT (1 << 16) + +struct radv_pipeline_key; void radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **stages, const struct radv_pipeline_layout *layout, const struct 
radv_pipeline_key *key, uint32_t flags); +void radv_hash_rt_shaders(unsigned char *hash, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + uint32_t flags); + +uint32_t radv_get_hash_flags(const struct radv_device *device, bool stats); + +bool radv_rt_pipeline_has_dynamic_stack_size(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo); + #define RADV_STAGE_MASK ((1 << MESA_SHADER_STAGES) - 1) #define radv_foreach_stage(stage, stage_bits) \ @@ -1728,13 +1742,31 @@ struct radv_binning_state { #define SI_GS_PER_ES 128 +enum radv_pipeline_type { + RADV_PIPELINE_GRAPHICS, + /* Compute pipeline (incl raytracing pipeline) */ + RADV_PIPELINE_COMPUTE, + /* Pipeline library. This can't actually run and merely is a partial pipeline. */ + RADV_PIPELINE_LIBRARY +}; + +struct radv_pipeline_group_handle { + uint32_t handles[2]; +}; + +struct radv_pipeline_shader_stack_size { + uint32_t recursive_size; + /* anyhit + intersection */ + uint32_t non_recursive_size; +}; + struct radv_pipeline { struct vk_object_base base; + enum radv_pipeline_type type; + struct radv_device *device; struct radv_dynamic_state dynamic_state; - struct radv_pipeline_layout *layout; - bool need_indirect_descriptor_sets; struct radv_shader_variant *shaders[MESA_SHADER_STAGES]; struct radv_shader_variant *gs_copy_shader; @@ -1751,6 +1783,9 @@ struct radv_pipeline { uint32_t attrib_index_offset[MAX_VERTEX_ATTRIBS]; bool use_per_attribute_vb_descs; + bool can_use_simple_input; + uint8_t last_vertex_attrib_bit; + uint8_t next_vertex_stage : 8; uint32_t vb_desc_usage_mask; uint32_t vb_desc_alloc_size; @@ -1790,6 +1825,18 @@ struct radv_pipeline { /* Last pre-PS API stage */ gl_shader_stage last_vgt_api_stage; } graphics; + struct { + struct radv_pipeline_group_handle *rt_group_handles; + struct radv_pipeline_shader_stack_size *rt_stack_sizes; + bool dynamic_stack_size; + uint32_t group_count; + } compute; + struct { + unsigned stage_count; + VkPipelineShaderStageCreateInfo *stages; + unsigned group_count; + VkRayTracingShaderGroupCreateInfoKHR *groups; + } library; }; unsigned max_waves; @@ -1800,6 +1847,10 @@ struct radv_pipeline { /* Unique pipeline hash identifier. */ uint64_t pipeline_hash; + + /* Pipeline layout info. 
*/ + uint32_t push_constant_size; + uint32_t dynamic_offset_count; }; static inline bool @@ -1828,8 +1879,6 @@ struct radv_graphics_pipeline_create_info { bool use_rectlist; bool db_depth_clear; bool db_stencil_clear; - bool db_depth_disable_expclear; - bool db_stencil_disable_expclear; bool depth_compress_disable; bool stencil_compress_disable; bool resummarize_enable; @@ -1841,6 +1890,16 @@ VkResult radv_graphics_pipeline_create(VkDevice device, VkPipelineCache cache, const struct radv_graphics_pipeline_create_info *extra, const VkAllocationCallbacks *alloc, VkPipeline *pPipeline); +VkResult radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache, + const VkComputePipelineCreateInfo *pCreateInfo, + const VkAllocationCallbacks *pAllocator, + const uint8_t *custom_hash, + struct radv_pipeline_shader_stack_size *rt_stack_sizes, + uint32_t rt_group_count, VkPipeline *pPipeline); + +void radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline, + const VkAllocationCallbacks *allocator); + struct radv_binning_settings { unsigned context_states_per_bin; /* allowed range: [1, 6] */ unsigned persistent_states_per_bin; /* allowed range: [1, 32] */ @@ -1855,6 +1914,10 @@ uint32_t radv_translate_buffer_dataformat(const struct util_format_description * uint32_t radv_translate_buffer_numformat(const struct util_format_description *desc, int first_non_void); bool radv_is_buffer_format_supported(VkFormat format, bool *scaled); +void radv_translate_vertex_format(const struct radv_physical_device *pdevice, VkFormat format, + const struct util_format_description *desc, unsigned *dfmt, + unsigned *nfmt, bool *post_shuffle, + enum radv_vs_input_alpha_adjust *alpha_adjust); uint32_t radv_translate_colorformat(VkFormat format); uint32_t radv_translate_color_numformat(VkFormat format, const struct util_format_description *desc, int first_non_void); @@ -2271,6 +2334,7 @@ struct radv_image_view_extra_create_info { void radv_image_view_init(struct radv_image_view *view, struct radv_device *device, const VkImageViewCreateInfo *pCreateInfo, const struct radv_image_view_extra_create_info *extra_create_info); +void radv_image_view_finish(struct radv_image_view *iview); VkFormat radv_get_aspect_format(struct radv_image *image, VkImageAspectFlags mask); @@ -2293,6 +2357,7 @@ struct radv_buffer_view { }; void radv_buffer_view_init(struct radv_buffer_view *view, struct radv_device *device, const VkBufferViewCreateInfo *pCreateInfo); +void radv_buffer_view_finish(struct radv_buffer_view *view); static inline struct VkExtent3D radv_sanitize_image_extent(const VkImageType imageType, const struct VkExtent3D imageExtent) @@ -2358,7 +2423,7 @@ struct radv_subpass_barrier { VkAccessFlags dst_access_mask; }; -void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, +void radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier); struct radv_subpass_attachment { @@ -2556,21 +2621,20 @@ struct radv_fence { /* radv_nir_to_llvm.c */ struct radv_shader_args; +struct radv_nir_compiler_options; +struct radv_shader_info; -void llvm_compile_shader(struct radv_device *device, unsigned shader_count, +void llvm_compile_shader(const struct radv_nir_compiler_options *options, + struct radv_shader_info *info, unsigned shader_count, struct nir_shader *const *shaders, struct radv_shader_binary **binary, - struct radv_shader_args *args); - -unsigned radv_nir_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, - const struct nir_shader 
*nir); + const struct radv_shader_args *args); /* radv_shader_info.h */ struct radv_shader_info; -struct radv_shader_variant_key; void radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *nir, const struct radv_pipeline_layout *layout, - const struct radv_shader_variant_key *key, + const struct radv_pipeline_key *pipeline_key, struct radv_shader_info *info); void radv_nir_shader_info_init(struct radv_shader_info *info); @@ -2582,6 +2646,8 @@ bool radv_end_thread_trace(struct radv_queue *queue); bool radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace); void radv_emit_thread_trace_userdata(const struct radv_device *device, struct radeon_cmdbuf *cs, const void *data, uint32_t num_dwords); +bool radv_is_instruction_timing_enabled(void); + /* radv_sqtt_layer_.c */ struct radv_barrier_data { union { @@ -2666,6 +2732,30 @@ si_conv_gl_prim_to_vertices(unsigned gl_prim) } } +static inline uint32_t +si_conv_prim_to_gs_out(enum VkPrimitiveTopology topology) +{ + switch (topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + return V_028A6C_POINTLIST; + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY: + return V_028A6C_LINESTRIP; + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY: + return V_028A6C_TRISTRIP; + default: + assert(0); + return 0; + } +} + struct radv_extra_render_pass_begin_info { bool disable_dcc; }; @@ -2797,60 +2887,63 @@ radv_accel_struct_get_va(const struct radv_acceleration_structure *accel) return radv_buffer_get_va(accel->bo) + accel->mem_offset; } -#define RADV_DEFINE_HANDLE_CASTS(__radv_type, __VkType) \ - \ - static inline struct __radv_type *__radv_type##_from_handle(__VkType _handle) \ - { \ - return (struct __radv_type *)_handle; \ - } \ - \ - static inline __VkType __radv_type##_to_handle(struct __radv_type *_obj) \ - { \ - return (__VkType)_obj; \ - } +#define RADV_FROM_HANDLE(__radv_type, __name, __handle) \ + VK_FROM_HANDLE(__radv_type, __name, __handle) -#define RADV_DEFINE_NONDISP_HANDLE_CASTS(__radv_type, __VkType) \ - \ - static inline struct __radv_type *__radv_type##_from_handle(__VkType _handle) \ - { \ - return (struct __radv_type *)(uintptr_t)_handle; \ - } \ - \ - static inline __VkType __radv_type##_to_handle(struct __radv_type *_obj) \ - { \ - return (__VkType)(uintptr_t)_obj; \ - } +VK_DEFINE_HANDLE_CASTS(radv_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) +VK_DEFINE_HANDLE_CASTS(radv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) +VK_DEFINE_HANDLE_CASTS(radv_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE) +VK_DEFINE_HANDLE_CASTS(radv_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) +VK_DEFINE_HANDLE_CASTS(radv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_acceleration_structure, base, + VkAccelerationStructureKHR, + VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_cmd_pool, base, VkCommandPool, + VK_OBJECT_TYPE_COMMAND_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_buffer, base, VkBuffer, VK_OBJECT_TYPE_BUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_buffer_view, base, VkBufferView, + 
VK_OBJECT_TYPE_BUFFER_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_set, header.base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_set_layout, base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_update_template, base, + VkDescriptorUpdateTemplate, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_device_memory, base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_fence, base, VkFence, VK_OBJECT_TYPE_FENCE) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_framebuffer, base, VkFramebuffer, + VK_OBJECT_TYPE_FRAMEBUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image, base, VkImage, VK_OBJECT_TYPE_IMAGE) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_image_view, base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW); +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_cache, base, VkPipelineCache, + VK_OBJECT_TYPE_PIPELINE_CACHE) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline, base, VkPipeline, + VK_OBJECT_TYPE_PIPELINE) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_layout, base, VkPipelineLayout, + VK_OBJECT_TYPE_PIPELINE_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_query_pool, base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_render_pass, base, VkRenderPass, + VK_OBJECT_TYPE_RENDER_PASS) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_sampler, base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_sampler_ycbcr_conversion, base, + VkSamplerYcbcrConversion, + VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) +VK_DEFINE_NONDISP_HANDLE_CASTS(radv_semaphore, base, VkSemaphore, + VK_OBJECT_TYPE_SEMAPHORE) -#define RADV_FROM_HANDLE(__radv_type, __name, __handle) \ - struct __radv_type *__name = __radv_type##_from_handle(__handle) - -RADV_DEFINE_HANDLE_CASTS(radv_cmd_buffer, VkCommandBuffer) -RADV_DEFINE_HANDLE_CASTS(radv_device, VkDevice) -RADV_DEFINE_HANDLE_CASTS(radv_instance, VkInstance) -RADV_DEFINE_HANDLE_CASTS(radv_physical_device, VkPhysicalDevice) -RADV_DEFINE_HANDLE_CASTS(radv_queue, VkQueue) - -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_acceleration_structure, VkAccelerationStructureKHR) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_cmd_pool, VkCommandPool) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_buffer, VkBuffer) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_buffer_view, VkBufferView) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_pool, VkDescriptorPool) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_set, VkDescriptorSet) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_set_layout, VkDescriptorSetLayout) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_descriptor_update_template, VkDescriptorUpdateTemplate) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_device_memory, VkDeviceMemory) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_fence, VkFence) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_event, VkEvent) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_framebuffer, VkFramebuffer) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_image, VkImage) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_image_view, VkImageView); -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_cache, VkPipelineCache) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline, VkPipeline) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_pipeline_layout, VkPipelineLayout) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_query_pool, 
VkQueryPool) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_render_pass, VkRenderPass) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_sampler, VkSampler) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_sampler_ycbcr_conversion, VkSamplerYcbcrConversion) -RADV_DEFINE_NONDISP_HANDLE_CASTS(radv_semaphore, VkSemaphore) +#ifdef __cplusplus +} +#endif #endif /* RADV_PRIVATE_H */ diff --git a/mesa 3D driver/src/amd/vulkan/radv_query.c b/mesa 3D driver/src/amd/vulkan/radv_query.c index b0b8453cf4..d6ee60ed89 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_query.c +++ b/mesa 3D driver/src/amd/vulkan/radv_query.c @@ -30,6 +30,7 @@ #include "nir/nir_builder.h" #include "util/u_atomic.h" +#include "radv_acceleration_structure.h" #include "radv_cs.h" #include "radv_meta.h" #include "radv_private.h" @@ -148,13 +149,7 @@ build_occlusion_query_shader(struct radv_device *device) nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - global_id = nir_channel(&b, global_id, 0); // We only care about x here. + nir_ssa_def *global_id = get_global_ids(&b, 1); nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16); nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); @@ -289,13 +284,7 @@ build_pipeline_statistics_query_shader(struct radv_device *device) nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0); nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - global_id = nir_channel(&b, global_id, 0); // We only care about x here. + nir_ssa_def *global_id = get_global_ids(&b, 1); nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2); nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id); @@ -440,13 +429,7 @@ build_tfb_query_shader(struct radv_device *device) nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); /* Compute global ID. */ - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - global_id = nir_channel(&b, global_id, 0); // We only care about x here. + nir_ssa_def *global_id = get_global_ids(&b, 1); /* Compute src/dst strides. */ nir_ssa_def *input_stride = nir_imm_int(&b, 32); @@ -570,13 +553,7 @@ build_timestamp_query_shader(struct radv_device *device) nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1); /* Compute global ID. 
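* (get_global_ids(&b, 1) is presumably a shared helper for exactly the computation the removed lines spelled out: workgroup_id * workgroup_size + local_invocation_id, keeping only the first component.)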
*/ - nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b); - nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32); - nir_ssa_def *block_size = - nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], b.shader->info.workgroup_size[1], - b.shader->info.workgroup_size[2], 0); - nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id); - global_id = nir_channel(&b, global_id, 0); // We only care about x here. + nir_ssa_def *global_id = get_global_ids(&b, 1); /* Compute src/dst strides. */ nir_ssa_def *input_stride = nir_imm_int(&b, 8); @@ -834,6 +811,7 @@ radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, { struct radv_device *device = cmd_buffer->device; struct radv_meta_saved_state saved_state; + struct radv_buffer src_buffer, dst_buffer; bool old_predicating; if (!*pipeline) { @@ -854,12 +832,11 @@ radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, old_predicating = cmd_buffer->state.predicating; cmd_buffer->state.predicating = false; - struct radv_buffer dst_buffer = {.bo = dst_bo, .offset = dst_offset, .size = dst_stride * count}; + uint64_t src_buffer_size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset); + uint64_t dst_buffer_size = count == 1 ? src_stride : dst_stride * count; - struct radv_buffer src_buffer = { - .bo = src_bo, - .offset = src_offset, - .size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset)}; + radv_buffer_init(&src_buffer, device, src_bo, src_buffer_size, src_offset); + radv_buffer_init(&dst_buffer, device, dst_bo, dst_buffer_size, dst_offset); radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); @@ -911,6 +888,9 @@ radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, /* Restore conditional rendering. 
*/ cmd_buffer->state.predicating = old_predicating; + radv_buffer_finish(&src_buffer); + radv_buffer_finish(&dst_buffer); + radv_meta_restore(&saved_state, cmd_buffer); } @@ -948,7 +928,7 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL); @@ -960,6 +940,8 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, pool->stride = pipelinestat_block_size * 2; break; case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: pool->stride = 8; break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: @@ -981,13 +963,13 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo); if (result != VK_SUCCESS) { radv_destroy_query_pool(device, pAllocator, pool); - return vk_error(device->instance, result); + return vk_error(device, result); } pool->ptr = device->ws->buffer_map(pool->bo); if (!pool->ptr) { radv_destroy_query_pool(device, pAllocator, pool); - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); } *pQueryPool = radv_query_pool_to_handle(pool); @@ -1026,7 +1008,9 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first uint32_t available; switch (pool->type) { - case VK_QUERY_TYPE_TIMESTAMP: { + case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: { uint64_t const *src64 = (uint64_t const *)src; uint64_t value; @@ -1213,6 +1197,9 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo); dest_va += dst_buffer->offset + dstOffset; + if (!queryCount) + return; + radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo); radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo); @@ -1266,6 +1253,8 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo pool->pipeline_stats_mask, pool->availability_offset + 4 * firstQuery); break; case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: if (flags & VK_QUERY_RESULT_WAIT_BIT) { for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) { unsigned query = firstQuery + i; @@ -1312,13 +1301,26 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo } } +static uint32_t +query_clear_value(VkQueryType type) +{ + switch (type) { + case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + return (uint32_t)TIMESTAMP_NOT_READY; + default: + return 0; + } +} + void radv_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); - uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP ? 
(uint32_t)TIMESTAMP_NOT_READY : 0; + uint32_t value = query_clear_value(pool->type); uint32_t flush_bits = 0; /* Make sure to sync all previous work if the given command buffer has @@ -1348,7 +1350,7 @@ radv_ResetQueryPool(VkDevice _device, VkQueryPool queryPool, uint32_t firstQuery { RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); - uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP ? (uint32_t)TIMESTAMP_NOT_READY : 0; + uint32_t value = query_clear_value(pool->type); uint32_t *data = (uint32_t *)(pool->ptr + firstQuery * pool->stride); uint32_t *data_end = (uint32_t *)(pool->ptr + (firstQuery + queryCount) * pool->stride); @@ -1671,3 +1673,51 @@ radv_CmdWriteTimestamp(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pi assert(cmd_buffer->cs->cdw <= cdw_max); } + +void +radv_CmdWriteAccelerationStructuresPropertiesKHR( + VkCommandBuffer commandBuffer, uint32_t accelerationStructureCount, + const VkAccelerationStructureKHR *pAccelerationStructures, VkQueryType queryType, + VkQueryPool queryPool, uint32_t firstQuery) +{ + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + RADV_FROM_HANDLE(radv_query_pool, pool, queryPool); + struct radeon_cmdbuf *cs = cmd_buffer->cs; + uint64_t pool_va = radv_buffer_get_va(pool->bo); + uint64_t query_va = pool_va + pool->stride * firstQuery; + + radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo); + + emit_query_flush(cmd_buffer, pool); + + ASSERTED unsigned cdw_max = + radeon_check_space(cmd_buffer->device->ws, cs, 6 * accelerationStructureCount); + + for (uint32_t i = 0; i < accelerationStructureCount; ++i) { + RADV_FROM_HANDLE(radv_acceleration_structure, accel_struct, pAccelerationStructures[i]); + uint64_t va = radv_accel_struct_get_va(accel_struct); + + switch (queryType) { + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR: + va += offsetof(struct radv_accel_struct_header, compacted_size); + break; + case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: + va += offsetof(struct radv_accel_struct_header, serialization_size); + break; + default: + unreachable("Unhandled accel struct query type."); + } + + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, query_va); + radeon_emit(cs, query_va >> 32); + + query_va += pool->stride; + } + + assert(cmd_buffer->cs->cdw <= cdw_max); +} diff --git a/mesa 3D driver/src/amd/vulkan/radv_shader.c b/mesa 3D driver/src/amd/vulkan/radv_shader.c index 82dade3dee..9652447516 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_shader.c +++ b/mesa 3D driver/src/amd/vulkan/radv_shader.c @@ -35,63 +35,72 @@ #include "radv_debug.h" #include "radv_private.h" #include "radv_shader_args.h" -#include "radv_shader_helper.h" #include "util/debug.h" #include "ac_binary.h" #include "ac_exp_param.h" -#include "ac_llvm_util.h" #include "ac_nir.h" #include "ac_rtld.h" #include "aco_interface.h" #include "sid.h" #include "vk_format.h" -static const struct nir_shader_compiler_options nir_options = { - .vertex_id_zero_based = true, - .lower_scmp = true, - .lower_flrp16 = true, - .lower_flrp32 = true, - .lower_flrp64 = true, - .lower_device_index_to_zero = true, - .lower_fdiv = true, - .lower_fmod = true, - .lower_ineg = true, - .lower_bitfield_insert_to_bitfield_select = true, - .lower_bitfield_extract = true, - .lower_pack_snorm_2x16 = true, - .lower_pack_snorm_4x8 = true, -
.lower_pack_unorm_2x16 = true, - .lower_pack_unorm_4x8 = true, - .lower_pack_half_2x16 = true, - .lower_pack_64_2x32 = true, - .lower_pack_64_4x16 = true, - .lower_pack_32_2x16 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_snorm_4x8 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_unpack_half_2x16 = true, - .lower_ffma16 = true, - .lower_ffma32 = true, - .lower_ffma64 = true, - .lower_fpow = true, - .lower_mul_2x32_64 = true, - .lower_rotate = true, - .has_fsub = true, - .has_isub = true, - .use_scoped_barrier = true, - .max_unroll_iterations = 32, - .max_unroll_iterations_aggressive = 128, - .use_interpolated_input_intrinsics = true, - .vectorize_vec2_16bit = true, - /* nir_lower_int64() isn't actually called for the LLVM backend, but - * this helps the loop unrolling heuristics. */ - .lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 | - nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64, - .lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv, - .divergence_analysis_options = nir_divergence_view_index_uniform, -}; +#ifdef LLVM_AVAILABLE +#include "ac_llvm_util.h" +#endif + +void +radv_get_nir_options(struct radv_physical_device *device) +{ + device->nir_options = (nir_shader_compiler_options){ + .vertex_id_zero_based = true, + .lower_scmp = true, + .lower_flrp16 = true, + .lower_flrp32 = true, + .lower_flrp64 = true, + .lower_device_index_to_zero = true, + .lower_fdiv = true, + .lower_fmod = true, + .lower_ineg = true, + .lower_bitfield_insert_to_bitfield_select = true, + .lower_bitfield_extract = true, + .lower_pack_snorm_2x16 = true, + .lower_pack_snorm_4x8 = true, + .lower_pack_unorm_2x16 = true, + .lower_pack_unorm_4x8 = true, + .lower_pack_half_2x16 = true, + .lower_pack_64_2x32 = true, + .lower_pack_64_4x16 = true, + .lower_pack_32_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_snorm_4x8 = true, + .lower_unpack_unorm_2x16 = true, + .lower_unpack_unorm_4x8 = true, + .lower_unpack_half_2x16 = true, + .lower_ffma16 = true, + .lower_ffma32 = true, + .lower_ffma64 = true, + .lower_fpow = true, + .lower_mul_2x32_64 = true, + .lower_rotate = true, + .lower_iadd_sat = device->rad_info.chip_class <= GFX8, + .has_fsub = true, + .has_isub = true, + .has_dot_4x8 = device->rad_info.has_accelerated_dot_product, + .has_dot_2x16 = device->rad_info.has_accelerated_dot_product, + .use_scoped_barrier = true, + .max_unroll_iterations = 32, + .max_unroll_iterations_aggressive = 128, + .use_interpolated_input_intrinsics = true, + .vectorize_vec2_16bit = true, + /* nir_lower_int64() isn't actually called for the LLVM backend, + * but this helps the loop unrolling heuristics. */ + .lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 | + nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64, + .lower_doubles_options = nir_lower_drcp | nir_lower_dsqrt | nir_lower_drsq | nir_lower_ddiv, + .divergence_analysis_options = nir_divergence_view_index_uniform, + }; +} bool radv_can_dump_shader(struct radv_device *device, struct vk_shader_module *module, @@ -117,9 +126,6 @@ radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader, bool optimize_conservatively, bool allow_copies) { bool progress; - unsigned lower_flrp = (shader->options->lower_flrp16 ? 16 : 0) | - (shader->options->lower_flrp32 ? 32 : 0) | - (shader->options->lower_flrp64 ? 
64 : 0); do { progress = false; @@ -162,21 +168,6 @@ radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader, NIR_PASS(progress, shader, nir_opt_constant_folding); NIR_PASS(progress, shader, nir_opt_algebraic); - if (lower_flrp != 0) { - bool lower_flrp_progress = false; - NIR_PASS(lower_flrp_progress, shader, nir_lower_flrp, lower_flrp, - false /* always_precise */); - if (lower_flrp_progress) { - NIR_PASS(progress, shader, nir_opt_constant_folding); - progress = true; - } - - /* Nothing should rematerialize any flrps, so we only - * need to do this lowering once. - */ - lower_flrp = 0; - } - NIR_PASS(progress, shader, nir_opt_undef); NIR_PASS(progress, shader, nir_opt_shrink_vectors, !device->instance->disable_shrink_image_store); @@ -305,14 +296,13 @@ lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key, if (nir_intrinsic_desc_type(intrin) == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR) { nir_ssa_def *addr = convert_pointer_to_64(&b, pdev, - nir_iadd(&b, nir_channels(&b, intrin->src[0].ssa, 1), - nir_channels(&b, intrin->src[0].ssa, 2))); + nir_iadd(&b, nir_channel(&b, intrin->src[0].ssa, 0), + nir_channel(&b, intrin->src[0].ssa, 1))); def = nir_build_load_global(&b, 1, 64, addr, .access = ACCESS_NON_WRITEABLE, .align_mul = 8, .align_offset = 0); } else { - def = nir_vec3(&b, nir_channel(&b, intrin->src[0].ssa, 0), - nir_channel(&b, intrin->src[0].ssa, 1), nir_imm_int(&b, 0)); + def = nir_vector_insert_imm(&b, intrin->src[0].ssa, nir_imm_int(&b, 0), 2); } break; case nir_intrinsic_vulkan_resource_index: { @@ -323,8 +313,6 @@ lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key, nir_ssa_def *new_res = nir_vulkan_resource_index( &b, 3, 32, intrin->src[0].ssa, .desc_set = desc_set, .binding = binding, .desc_type = nir_intrinsic_desc_type(intrin)); - nir_ssa_def *set_ptr = nir_channel(&b, new_res, 0); - nir_ssa_def *binding_ptr = nir_channel(&b, new_res, 1); nir_ssa_def *stride; if (desc_layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || @@ -333,15 +321,14 @@ lower_intrinsics(nir_shader *nir, const struct radv_pipeline_key *key, } else { stride = nir_imm_int(&b, desc_layout->binding[binding].size); } - def = nir_vec3(&b, set_ptr, binding_ptr, stride); + def = nir_vector_insert_imm(&b, new_res, stride, 2); break; } case nir_intrinsic_vulkan_resource_reindex: { - nir_ssa_def *set_ptr = nir_channel(&b, intrin->src[0].ssa, 0); nir_ssa_def *binding_ptr = nir_channel(&b, intrin->src[0].ssa, 1); nir_ssa_def *stride = nir_channel(&b, intrin->src[0].ssa, 2); binding_ptr = nir_iadd(&b, binding_ptr, nir_imul(&b, intrin->src[1].ssa, stride)); - def = nir_vec3(&b, set_ptr, binding_ptr, stride); + def = nir_vector_insert_imm(&b, intrin->src[0].ssa, binding_ptr, 1); break; } case nir_intrinsic_is_sparse_texels_resident: @@ -428,18 +415,18 @@ radv_lower_primitive_shading_rate(nir_shader *nir) nir_shader * radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *module, const char *entrypoint_name, gl_shader_stage stage, - const VkSpecializationInfo *spec_info, const VkPipelineCreateFlags flags, + const VkSpecializationInfo *spec_info, const struct radv_pipeline_layout *layout, const struct radv_pipeline_key *key) { unsigned subgroup_size = 64, ballot_bit_size = 64; - if (key->compute_subgroup_size) { + if (key->cs.compute_subgroup_size) { /* Only compute shaders currently support requiring a * specific subgroup size. 
*/ assert(stage == MESA_SHADER_COMPUTE); - subgroup_size = key->compute_subgroup_size; - ballot_bit_size = key->compute_subgroup_size; + subgroup_size = key->cs.compute_subgroup_size; + ballot_bit_size = key->cs.compute_subgroup_size; } nir_shader *nir; @@ -449,7 +436,7 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * * shader directly. In that case, we just ignore the SPIR-V entirely * and just use the NIR shader */ nir = module->nir; - nir->options = &nir_options; + nir->options = &device->physical_device->nir_options; nir_validate_shader(nir, "in internal shader"); assert(exec_list_length(&nir->functions) == 1); @@ -491,6 +478,7 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * .float64 = true, .float64_atomic_min_max = true, .geometry_streams = true, + .groups = true, .image_atomic_int64 = true, .image_ms_array = true, .image_read_without_format = true, @@ -503,6 +491,7 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * .multiview = true, .physical_storage_buffer_address = true, .post_depth_coverage = true, + .ray_tracing = true, .runtime_descriptor_array = true, .shader_clock = true, .shader_viewport_index_layer = true, @@ -531,7 +520,7 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * .phys_ssbo_addr_format = nir_address_format_64bit_global, .push_const_addr_format = nir_address_format_logical, .shared_addr_format = nir_address_format_32bit_offset, - .frag_coord_is_sysval = true, + .constant_addr_format = nir_address_format_64bit_global, .use_deref_buffer_array_length = true, .debug = { @@ -540,12 +529,17 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * }, }; nir = spirv_to_nir(spirv, module->size / 4, spec_entries, num_spec_entries, stage, - entrypoint_name, &spirv_options, &nir_options); + entrypoint_name, &spirv_options, &device->physical_device->nir_options); assert(nir->info.stage == stage); nir_validate_shader(nir, "after spirv_to_nir"); free(spec_entries); + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .point_coord = true, + }; + NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + /* We have to lower away local constant initializers right before we * inline functions. That way they get properly initialized at the top * of the function and not at the top of its caller. 
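The constant-initializer comment above encodes an ordering constraint that is easy to get wrong, so a minimal sketch of the usual NIR sequence follows. The pass names are real NIR passes; the exact list radv runs around this point is abbreviated here and may differ in detail.

/* Lower initializers of function-local variables to explicit stores first,
 * so every variable is initialized at the top of its own function; inlining
 * afterwards replays those stores at each call site instead of running the
 * initializer once in the caller. */
NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS_V(nir, nir_lower_returns); /* turn returns into jumps so bodies can be spliced */
NIR_PASS_V(nir, nir_inline_functions);
NIR_PASS_V(nir, nir_copy_prop); /* clean up the parameter copies inlining introduces */
NIR_PASS_V(nir, nir_opt_deref);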
@@ -601,16 +595,11 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * NIR_PASS_V(nir, nir_lower_global_vars_to_local); NIR_PASS_V(nir, nir_lower_vars_to_ssa); - NIR_PASS_V(nir, nir_propagate_invariant, - device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM); - - NIR_PASS_V(nir, nir_lower_system_values); - NIR_PASS_V(nir, nir_lower_compute_system_values, NULL); + NIR_PASS_V(nir, nir_propagate_invariant, key->invariant_geom); NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays); - NIR_PASS_V(nir, nir_lower_discard_or_demote, - device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE); + NIR_PASS_V(nir, nir_lower_discard_or_demote, key->ps.lower_discard_to_demote); nir_lower_doubles_options lower_doubles = nir->options->lower_doubles_options; @@ -625,6 +614,9 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * NIR_PASS_V(nir, nir_lower_doubles, NULL, lower_doubles); } + NIR_PASS_V(nir, nir_lower_system_values); + NIR_PASS_V(nir, nir_lower_compute_system_values, NULL); + /* Vulkan uses the separate-shader linking model */ nir->info.separate_shader = true; @@ -633,7 +625,7 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * if (nir->info.stage == MESA_SHADER_GEOMETRY) { unsigned nir_gs_flags = nir_lower_gs_intrinsics_per_stream; - if (device->physical_device->use_ngg && !radv_use_llvm_for_stage(device, stage)) { + if (key->use_ngg && !radv_use_llvm_for_stage(device, stage)) { /* ACO needs NIR to do some of the hard lifting */ nir_gs_flags |= nir_lower_gs_intrinsics_count_primitives | nir_lower_gs_intrinsics_count_vertices_per_primitive | @@ -647,6 +639,7 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * .lower_txp = ~0, .lower_tg4_offsets = true, .lower_txs_cube_array = true, + .lower_to_fragment_fetch_amd = true, }; nir_lower_tex(nir, &tex_options); @@ -683,12 +676,11 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * .lower_quad_broadcast_dynamic = 1, .lower_quad_broadcast_dynamic_to_const = gfx7minus, .lower_shuffle_to_swizzle_amd = 1, - .lower_elect = radv_use_llvm_for_stage(device, stage), }); nir_lower_load_const_to_scalar(nir); - if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)) + if (!key->optimisations_disabled) radv_optimize_nir(device, nir, false, true); /* call radv_nir_lower_ycbcr_textures() late as there might still be @@ -700,6 +692,14 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * */ nir_lower_var_copies(nir); + unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) | + (nir->options->lower_flrp32 ? 32 : 0) | + (nir->options->lower_flrp64 ? 64 : 0); + if (lower_flrp != 0) { + if (nir_lower_flrp(nir, lower_flrp, false /* always_precise */)) + NIR_PASS_V(nir, nir_opt_constant_folding); + } + const nir_opt_access_options opt_access_options = { .is_vulkan = true, .infer_non_readable = true, @@ -727,7 +727,8 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * } } - nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global); + nir_lower_explicit_io(nir, nir_var_mem_global | nir_var_mem_constant, + nir_address_format_64bit_global); /* Lower large variables that are always constant with load_constant * intrinsics, which get turned into PC-relative loads from a data @@ -748,8 +749,7 @@ radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module * * considered too large for unrolling. 
*/ if (ac_nir_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class) && - !(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT) && - nir->info.stage != MESA_SHADER_COMPUTE) { + !key->optimisations_disabled && nir->info.stage != MESA_SHADER_COMPUTE) { /* Optimize the lowered code before the linking optimizations. */ radv_optimize_nir(device, nir, false, false); } @@ -843,7 +843,7 @@ radv_lower_io(struct radv_device *device, nir_shader *nir) bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir, - struct radv_shader_info *info, const struct radv_pipeline_key *pl_key) + const struct radv_shader_info *info, const struct radv_pipeline_key *pl_key) { if (nir->info.stage == MESA_SHADER_VERTEX) { if (info->vs.as_ls) { @@ -862,7 +862,7 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir, nir, device->physical_device->rad_info.chip_class, info->tcs.tes_reads_tess_factors, info->tcs.tes_inputs_read, info->tcs.tes_patch_inputs_read, info->tcs.num_linked_inputs, info->tcs.num_linked_outputs, info->tcs.num_linked_patch_outputs, true); - ac_nir_lower_tess_to_const(nir, pl_key->tess_input_vertices, info->num_tess_patches, + ac_nir_lower_tess_to_const(nir, pl_key->tcs.tess_input_vertices, info->num_tess_patches, ac_nir_lower_patch_vtx_in | ac_nir_lower_num_patches); return true; @@ -888,8 +888,8 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir, } bool -radv_consider_culling(struct radv_device *device, struct nir_shader *nir, - uint64_t ps_inputs_read) +radv_consider_culling(struct radv_device *device, struct nir_shader *nir, uint64_t ps_inputs_read, + unsigned num_vertices_per_primitive, const struct radv_shader_info *info) { /* Culling doesn't make sense for meta shaders. */ if (!!nir->info.name) @@ -899,10 +899,11 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir, if (nir->info.outputs_written & (VARYING_BIT_VIEWPORT | VARYING_BIT_VIEWPORT_MASK)) return false; - /* TODO: enable by default on GFX10.3 when we're confident about performance. */ - bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC; + /* We don't support culling with vertex shader prologs. */ + if (info->vs.has_prolog) + return false; - if (!culling_enabled) + if (!device->physical_device->use_ngg_culling) return false; /* Shader based culling efficiency can depend on PS throughput. @@ -912,23 +913,40 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir, unsigned max_render_backends = device->physical_device->rad_info.max_render_backends; unsigned max_se = device->physical_device->rad_info.max_se; - if (max_render_backends < 2) - return false; /* Don't use NGG culling on 1 RB chips. */ - else if (max_render_backends / max_se == 4) + if (max_render_backends / max_se == 4) max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */ else max_ps_params = 4; /* Navi 1x. */ /* TODO: consider other heuristics here, such as PS execution time */ + if (util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) > max_ps_params) + return false; - return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params; + /* Only triangle culling is supported. */ + if (num_vertices_per_primitive != 3) + return false; + + /* When the shader writes memory, it is difficult to guarantee correctness. 
+ * Future work: + * - if only write-only SSBOs are used + * - if we can prove that non-position outputs don't rely on memory stores + * then it may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd. + */ + if (nir->info.writes_memory) + return false; + + /* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after culling. + * Future work: try to save this to LDS and reload, but it can still be broken in subtle ways. + */ + if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION)) + return false; + + return true; } void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, - struct radv_shader_info *info, - const struct radv_pipeline_key *pl_key, - struct radv_shader_variant_key *key, - bool consider_culling) + const struct radv_shader_info *info, + const struct radv_pipeline_key *pl_key) { /* TODO: support the LLVM backend with the NIR lowering */ assert(!radv_use_llvm_for_stage(device, nir->info.stage)); @@ -937,9 +955,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, nir->info.stage == MESA_SHADER_TESS_EVAL || nir->info.stage == MESA_SHADER_GEOMETRY); - ac_nir_ngg_config out_conf = {0}; const struct gfx10_ngg_info *ngg_info = &info->ngg_info; - unsigned num_gs_invocations = (nir->info.stage != MESA_SHADER_GEOMETRY || ngg_info->max_vert_out_per_gs_instance) ? 1 : info->gs.invocations; unsigned num_vertices_per_prim = 3; /* Get the number of vertices per input primitive */ @@ -950,15 +966,15 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, num_vertices_per_prim = 2; /* Manually mark the primitive ID used, so the shader can repack it. */ - if (key->vs_common_out.export_prim_id) + if (info->tes.outinfo.export_prim_id) BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); } else if (nir->info.stage == MESA_SHADER_VERTEX) { /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */ - num_vertices_per_prim = key->vs.outprim + 1; + num_vertices_per_prim = si_conv_prim_to_gs_out(pl_key->vs.topology) + 1; /* Manually mark the instance ID used, so the shader can repack it. */ - if (key->vs.instance_rate_inputs) + if (pl_key->vs.instance_rate_inputs) BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { @@ -969,133 +985,307 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, /* Invocations that process an input vertex */ unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ?
ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims); - /* Invocations that export an output vertex */ - unsigned max_vtx_out = ngg_info->max_out_verts; - /* Invocations that process an input primitive */ - unsigned max_prm_in = ngg_info->max_gsprims * num_gs_invocations; - /* Invocations that produce an output primitive */ - unsigned max_prm_out = ngg_info->max_gsprims * num_gs_invocations * ngg_info->prim_amp_factor; - - unsigned max_workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prm_in, max_prm_out); - - /* Maximum HW limit for NGG workgroups */ - max_workgroup_size = MIN2(256, max_workgroup_size); if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL) { - assert(key->vs_common_out.as_ngg); + bool export_prim_id; - if (consider_culling) + assert(info->is_ngg); + + if (info->has_ngg_culling) radv_optimize_nir_algebraic(nir, false); - out_conf = - ac_nir_lower_ngg_nogs( - nir, - max_vtx_in, - num_vertices_per_prim, - max_workgroup_size, - info->wave_size, - consider_culling, - key->vs_common_out.as_ngg_passthrough, - key->vs_common_out.export_prim_id, - key->vs.provoking_vtx_last); + if (nir->info.stage == MESA_SHADER_VERTEX) { + export_prim_id = info->vs.outinfo.export_prim_id; + } else { + export_prim_id = info->tes.outinfo.export_prim_id; + } - info->has_ngg_culling = out_conf.can_cull; - info->has_ngg_early_prim_export = out_conf.early_prim_export; - info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity); - info->is_ngg_passthrough = out_conf.passthrough; - key->vs_common_out.as_ngg_passthrough = out_conf.passthrough; + ac_nir_lower_ngg_nogs( + nir, + max_vtx_in, + num_vertices_per_prim, + info->workgroup_size, + info->wave_size, + info->has_ngg_culling, + info->has_ngg_early_prim_export, + info->is_ngg_passthrough, + export_prim_id, + pl_key->vs.provoking_vtx_last, + false, + pl_key->vs.instance_rate_inputs); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { assert(info->is_ngg); ac_nir_lower_ngg_gs( - nir, info->wave_size, max_workgroup_size, + nir, info->wave_size, info->workgroup_size, info->ngg_info.esgs_ring_size, info->gs.gsvs_vertex_size, info->ngg_info.ngg_emit_size * 4u, - key->vs.provoking_vtx_last); + pl_key->vs.provoking_vtx_last); } else { unreachable("invalid SW stage passed to radv_lower_ngg"); } } +static unsigned +get_size_class(unsigned size, bool round_up) +{ + size = round_up ? 
util_logbase2_ceil(size) : util_logbase2(size); + unsigned size_class = + MAX2(size, RADV_SHADER_ALLOC_MIN_SIZE_CLASS) - RADV_SHADER_ALLOC_MIN_SIZE_CLASS; + return MIN2(size_class, RADV_SHADER_ALLOC_NUM_FREE_LISTS - 1); +} + +static void +remove_hole(struct radv_device *device, union radv_shader_arena_block *hole) +{ + unsigned size_class = get_size_class(hole->size, false); + list_del(&hole->freelist); + if (list_is_empty(&device->shader_free_lists[size_class])) + device->shader_free_list_mask &= ~(1u << size_class); +} + +static void +add_hole(struct radv_device *device, union radv_shader_arena_block *hole) +{ + unsigned size_class = get_size_class(hole->size, false); + list_addtail(&hole->freelist, &device->shader_free_lists[size_class]); + device->shader_free_list_mask |= 1u << size_class; +} + +static union radv_shader_arena_block * +alloc_block_obj(struct radv_device *device) +{ + if (!list_is_empty(&device->shader_block_obj_pool)) { + union radv_shader_arena_block *block = + list_first_entry(&device->shader_block_obj_pool, union radv_shader_arena_block, pool); + list_del(&block->pool); + return block; + } + + return malloc(sizeof(union radv_shader_arena_block)); +} + +static void +free_block_obj(struct radv_device *device, union radv_shader_arena_block *block) +{ + list_add(&block->pool, &device->shader_block_obj_pool); +} + +/* Segregated fit allocator, implementing a good-fit allocation policy. + * + * This is a variation of sequential fit allocation with several lists of free blocks ("holes") + * instead of one. Each list of holes only contains holes of a certain range of sizes, so holes that + * are too small can easily be ignored while allocating. Because this also ignores holes that are + * larger than necessary (approximating best-fit allocation), this could be described as a + * "good-fit" allocator. + * + * Typically, shaders are allocated and only freed when the device is destroyed. For this pattern, + * this should allocate blocks for shaders quickly and with no fragmentation, while still allowing + * freed memory to be re-used. + */ +static union radv_shader_arena_block * +alloc_shader_memory(struct radv_device *device, uint32_t size, void *ptr) +{ + size = align(size, RADV_SHADER_ALLOC_ALIGNMENT); + + mtx_lock(&device->shader_arena_mutex); + + /* Try to use an existing hole. Unless the shader is very large, this should only have to look + * at the first one available.
+ */ + unsigned free_list_mask = BITFIELD_MASK(RADV_SHADER_ALLOC_NUM_FREE_LISTS); + unsigned size_class = + ffs(device->shader_free_list_mask & (free_list_mask << get_size_class(size, true))); + if (size_class) { + size_class--; + + list_for_each_entry(union radv_shader_arena_block, hole, + &device->shader_free_lists[size_class], freelist) + { + if (hole->size < size) + continue; + + assert(hole->offset % RADV_SHADER_ALLOC_ALIGNMENT == 0); + + if (size == hole->size) { + remove_hole(device, hole); + hole->freelist.next = ptr; + mtx_unlock(&device->shader_arena_mutex); + return hole; + } else { + union radv_shader_arena_block *alloc = alloc_block_obj(device); + if (!alloc) { + mtx_unlock(&device->shader_arena_mutex); + return NULL; + } + list_addtail(&alloc->list, &hole->list); + alloc->freelist.prev = NULL; + alloc->freelist.next = ptr; + alloc->arena = hole->arena; + alloc->offset = hole->offset; + alloc->size = size; + + remove_hole(device, hole); + hole->offset += size; + hole->size -= size; + add_hole(device, hole); + + mtx_unlock(&device->shader_arena_mutex); + return alloc; + } + } + } + + /* Allocate a new shader arena. */ + struct radv_shader_arena *arena = calloc(1, sizeof(struct radv_shader_arena)); + union radv_shader_arena_block *alloc = NULL, *hole = NULL; + if (!arena) + goto fail; + + unsigned arena_size = MAX2(RADV_SHADER_ALLOC_MIN_ARENA_SIZE, size); + VkResult result = device->ws->buffer_create( + device->ws, arena_size, RADV_SHADER_ALLOC_ALIGNMENT, RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT | + (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 0 + : RADEON_FLAG_READ_ONLY), + RADV_BO_PRIORITY_SHADER, 0, &arena->bo); + if (result != VK_SUCCESS) + goto fail; + + list_inithead(&arena->entries); + + arena->ptr = (char *)device->ws->buffer_map(arena->bo); + if (!arena->ptr) + goto fail; + + alloc = alloc_block_obj(device); + hole = arena_size - size > 0 ? alloc_block_obj(device) : alloc; + if (!alloc || !hole) + goto fail; + list_addtail(&alloc->list, &arena->entries); + alloc->freelist.prev = NULL; + alloc->freelist.next = ptr; + alloc->arena = arena; + alloc->offset = 0; + alloc->size = size; + + if (hole != alloc) { + hole->arena = arena; + hole->offset = size; + hole->size = arena_size - size; + + list_addtail(&hole->list, &arena->entries); + add_hole(device, hole); + } + + list_addtail(&arena->list, &device->shader_arenas); + + mtx_unlock(&device->shader_arena_mutex); + return alloc; + +fail: + mtx_unlock(&device->shader_arena_mutex); + free(alloc); + free(hole); + if (arena && arena->bo) + device->ws->buffer_destroy(device->ws, arena->bo); + free(arena); + return NULL; +} + +static union radv_shader_arena_block * +get_hole(struct radv_shader_arena *arena, struct list_head *head) +{ + if (head == &arena->entries) + return NULL; + + union radv_shader_arena_block *hole = LIST_ENTRY(union radv_shader_arena_block, head, list); + return hole->freelist.prev ? 
hole : NULL; +} + +static void +free_shader_memory(struct radv_device *device, union radv_shader_arena_block *alloc) +{ + mtx_lock(&device->shader_arena_mutex); + + union radv_shader_arena_block *hole_prev = get_hole(alloc->arena, alloc->list.prev); + union radv_shader_arena_block *hole_next = get_hole(alloc->arena, alloc->list.next); + + union radv_shader_arena_block *hole = alloc; + + /* merge with previous hole */ + if (hole_prev) { + remove_hole(device, hole_prev); + + hole_prev->size += hole->size; + list_del(&hole->list); + free_block_obj(device, hole); + + hole = hole_prev; + } + + /* merge with next hole */ + if (hole_next) { + remove_hole(device, hole_next); + + hole_next->offset -= hole->size; + hole_next->size += hole->size; + list_del(&hole->list); + free_block_obj(device, hole); + + hole = hole_next; + } + + if (list_is_singular(&hole->list)) { + struct radv_shader_arena *arena = hole->arena; + free_block_obj(device, hole); + + device->ws->buffer_destroy(device->ws, arena->bo); + list_del(&arena->list); + free(arena); + } else { + add_hole(device, hole); + } + + mtx_unlock(&device->shader_arena_mutex); +} + static void * radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader) { - mtx_lock(&device->shader_slab_mutex); - list_for_each_entry(struct radv_shader_slab, slab, &device->shader_slabs, slabs) - { - uint64_t offset = 0; - -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wshadow" -#endif - list_for_each_entry(struct radv_shader_variant, s, &slab->shaders, slab_list) - { -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - if (s->bo_offset - offset >= shader->code_size) { - shader->bo = slab->bo; - shader->bo_offset = offset; - list_addtail(&shader->slab_list, &s->slab_list); - mtx_unlock(&device->shader_slab_mutex); - return slab->ptr + offset; - } - offset = align_u64(s->bo_offset + s->code_size, 256); - } - if (offset <= slab->size && slab->size - offset >= shader->code_size) { - shader->bo = slab->bo; - shader->bo_offset = offset; - list_addtail(&shader->slab_list, &slab->shaders); - mtx_unlock(&device->shader_slab_mutex); - return slab->ptr + offset; - } - } - - mtx_unlock(&device->shader_slab_mutex); - struct radv_shader_slab *slab = calloc(1, sizeof(struct radv_shader_slab)); - - slab->size = MAX2(256 * 1024, shader->code_size); - VkResult result = device->ws->buffer_create( - device->ws, slab->size, 256, RADEON_DOMAIN_VRAM, - RADEON_FLAG_NO_INTERPROCESS_SHARING | - (device->physical_device->rad_info.cpdma_prefetch_writes_memory ? 
0 - : RADEON_FLAG_READ_ONLY), - RADV_BO_PRIORITY_SHADER, 0, &slab->bo); - if (result != VK_SUCCESS) { - free(slab); + shader->alloc = alloc_shader_memory(device, shader->code_size, shader); + if (!shader->alloc) return NULL; - } - - slab->ptr = (char *)device->ws->buffer_map(slab->bo); - if (!slab->ptr) { - device->ws->buffer_destroy(device->ws, slab->bo); - free(slab); - return NULL; - } - - list_inithead(&slab->shaders); - - mtx_lock(&device->shader_slab_mutex); - list_add(&slab->slabs, &device->shader_slabs); - - shader->bo = slab->bo; - shader->bo_offset = 0; - list_add(&shader->slab_list, &slab->shaders); - mtx_unlock(&device->shader_slab_mutex); - return slab->ptr; + shader->bo = shader->alloc->arena->bo; + return shader->alloc->arena->ptr + shader->alloc->offset; } void -radv_destroy_shader_slabs(struct radv_device *device) +radv_init_shader_arenas(struct radv_device *device) { - list_for_each_entry_safe(struct radv_shader_slab, slab, &device->shader_slabs, slabs) + mtx_init(&device->shader_arena_mutex, mtx_plain); + + device->shader_free_list_mask = 0; + + list_inithead(&device->shader_arenas); + list_inithead(&device->shader_block_obj_pool); + for (unsigned i = 0; i < RADV_SHADER_ALLOC_NUM_FREE_LISTS; i++) + list_inithead(&device->shader_free_lists[i]); +} + +void +radv_destroy_shader_arenas(struct radv_device *device) +{ + list_for_each_entry_safe(union radv_shader_arena_block, block, &device->shader_block_obj_pool, + pool) free(block); + + list_for_each_entry_safe(struct radv_shader_arena, arena, &device->shader_arenas, list) { - device->ws->buffer_destroy(device->ws, slab->bo); - free(slab); + device->ws->buffer_destroy(device->ws, arena->bo); + free(arena); } - mtx_destroy(&device->shader_slab_mutex); + mtx_destroy(&device->shader_arena_mutex); } /* For the UMR disassembler. */ @@ -1197,7 +1387,7 @@ radv_postprocess_config(const struct radv_device *device, const struct ac_shader config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1) | S_00B12C_EXCP_EN(excp_en); } else { - bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; + bool enable_prim_id = info->tes.outinfo.export_prim_id || info->uses_prim_id; vgpr_comp_cnt = enable_prim_id ? 3 : 2; config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); @@ -1248,7 +1438,7 @@ radv_postprocess_config(const struct radv_device *device, const struct ac_shader */ if (info->vs.needs_instance_id && pdevice->rad_info.chip_class >= GFX10) { vgpr_comp_cnt = 3; - } else if (info->vs.export_prim_id) { + } else if (info->vs.outinfo.export_prim_id) { vgpr_comp_cnt = 2; } else if (info->vs.needs_instance_id) { vgpr_comp_cnt = 1; @@ -1264,7 +1454,7 @@ radv_postprocess_config(const struct radv_device *device, const struct ac_shader case MESA_SHADER_FRAGMENT: config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks) | - S_00B02C_TRAP_PRESENT(1) | S_00B02C_EXCP_EN(excp_en); + S_00B02C_EXCP_EN(excp_en); break; case MESA_SHADER_GEOMETRY: config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10); @@ -1302,18 +1492,20 @@ radv_postprocess_config(const struct radv_device *device, const struct ac_shader if (es_stage == MESA_SHADER_VERTEX) { es_vgpr_comp_cnt = info->vs.needs_instance_id ? 
3 : 0; } else if (es_stage == MESA_SHADER_TESS_EVAL) { - bool enable_prim_id = info->tes.export_prim_id || info->uses_prim_id; + bool enable_prim_id = info->tes.outinfo.export_prim_id || info->uses_prim_id; es_vgpr_comp_cnt = enable_prim_id ? 3 : 2; } else unreachable("Unexpected ES shader stage"); + bool nggc = info->has_ngg_culling; /* Culling uses GS vertex offsets 0, 1, 2. */ bool tes_triangles = stage == MESA_SHADER_TESS_EVAL && info->tes.primitive_mode >= 4; /* GL_TRIANGLES */ - if (info->uses_invocation_id || stage == MESA_SHADER_VERTEX) { + if (info->uses_invocation_id) { gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ - } else if (info->uses_prim_id) { + } else if (info->uses_prim_id || (es_stage == MESA_SHADER_VERTEX && + info->vs.outinfo.export_prim_id)) { gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ - } else if (info->gs.vertices_in >= 3 || tes_triangles) { + } else if (info->gs.vertices_in >= 3 || tes_triangles || nggc) { gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ } else { gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ @@ -1371,7 +1563,7 @@ radv_postprocess_config(const struct radv_device *device, const struct ac_shader struct radv_shader_variant * radv_shader_variant_create(struct radv_device *device, const struct radv_shader_binary *binary, - bool keep_shader_info) + bool keep_shader_info, bool from_cache) { struct ac_shader_config config = {0}; struct ac_rtld_binary rtld_binary = {0}; @@ -1438,14 +1630,20 @@ radv_shader_variant_create(struct radv_device *device, const struct radv_shader_ variant->exec_size = rtld_binary.exec_size; } else { assert(binary->type == RADV_BINARY_TYPE_LEGACY); - config = ((struct radv_shader_binary_legacy *)binary)->config; + config = ((struct radv_shader_binary_legacy *)binary)->base.config; variant->code_size = radv_get_shader_binary_size(((struct radv_shader_binary_legacy *)binary)->code_size); variant->exec_size = ((struct radv_shader_binary_legacy *)binary)->exec_size; } variant->info = binary->info; - radv_postprocess_config(device, &config, &binary->info, binary->stage, &variant->config); + + if (from_cache) { + /* Copy the shader binary configuration from the cache. 
*/ + memcpy(&variant->config, &binary->config, sizeof(variant->config)); + } else { + radv_postprocess_config(device, &config, &binary->info, binary->stage, &variant->config); + } void *dest_ptr = radv_alloc_shader_memory(device, variant); if (!dest_ptr) { @@ -1459,7 +1657,7 @@ radv_shader_variant_create(struct radv_device *device, const struct radv_shader_ struct radv_shader_binary_rtld *bin = (struct radv_shader_binary_rtld *)binary; struct ac_rtld_upload_info info = { .binary = &rtld_binary, - .rx_va = radv_buffer_get_va(variant->bo) + variant->bo_offset, + .rx_va = radv_shader_variant_get_va(variant), .rx_ptr = dest_ptr, }; @@ -1560,18 +1758,16 @@ shader_variant_compile(struct radv_device *device, struct vk_shader_module *modu options->record_ir = keep_shader_info; options->record_stats = keep_statistic_info; options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR; - options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size; options->address32_hi = device->physical_device->rad_info.address32_hi; options->has_ls_vgpr_init_bug = device->physical_device->rad_info.has_ls_vgpr_init_bug; - options->use_ngg_streamout = device->physical_device->use_ngg_streamout; options->enable_mrt_output_nan_fixup = - module && !module->nir && device->instance->enable_mrt_output_nan_fixup; + module && !module->nir && options->key.ps.enable_mrt_output_nan_fixup; options->adjust_frag_coord_z = device->adjust_frag_coord_z; options->has_image_load_dcc_bug = device->physical_device->rad_info.has_image_load_dcc_bug; options->debug.func = radv_compiler_debug; options->debug.private_data = &debug_data; - switch (device->force_vrs) { + switch (options->key.ps.force_vrs) { case RADV_FORCE_VRS_2x2: options->force_vrs_rates = (1u << 2) | (1u << 4); break; @@ -1586,29 +1782,31 @@ shader_variant_compile(struct radv_device *device, struct vk_shader_module *modu } struct radv_shader_args args = {0}; - args.options = options; - args.shader_info = info; args.is_gs_copy_shader = gs_copy_shader; args.is_trap_handler_shader = trap_handler_shader; - radv_declare_shader_args( - &args, gs_copy_shader ? MESA_SHADER_VERTEX : shaders[shader_count - 1]->info.stage, + radv_declare_shader_args(options, info, + gs_copy_shader ? MESA_SHADER_VERTEX : shaders[shader_count - 1]->info.stage, shader_count >= 2, - shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX); + shader_count >= 2 ? shaders[shader_count - 2]->info.stage : MESA_SHADER_VERTEX, &args); +#ifdef LLVM_AVAILABLE if (radv_use_llvm_for_stage(device, stage) || options->dump_shader || options->record_ir) ac_init_llvm_once(); if (radv_use_llvm_for_stage(device, stage)) { - llvm_compile_shader(device, shader_count, shaders, &binary, &args); + llvm_compile_shader(options, info, shader_count, shaders, &binary, &args); +#else + if (false) { +#endif } else { - aco_compile_shader(shader_count, shaders, &binary, &args); + aco_compile_shader(options, info, shader_count, shaders, &args, &binary); } binary->info = *info; struct radv_shader_variant *variant = - radv_shader_variant_create(device, binary, keep_shader_info); + radv_shader_variant_create(device, binary, keep_shader_info, false); if (!variant) { free(binary); return NULL; @@ -1637,6 +1835,9 @@ shader_variant_compile(struct radv_device *device, struct vk_shader_module *modu } } + /* Copy the shader binary configuration to store it in the cache. 
*/ + memcpy(&binary->config, &variant->config, sizeof(binary->config)); + if (binary_out) *binary_out = binary; else @@ -1649,9 +1850,9 @@ struct radv_shader_variant * radv_shader_variant_compile(struct radv_device *device, struct vk_shader_module *module, struct nir_shader *const *shaders, int shader_count, struct radv_pipeline_layout *layout, - const struct radv_shader_variant_key *key, + const struct radv_pipeline_key *key, struct radv_shader_info *info, bool keep_shader_info, - bool keep_statistic_info, bool disable_optimizations, + bool keep_statistic_info, struct radv_shader_binary **binary_out) { gl_shader_stage stage = shaders[shader_count - 1]->info.stage; @@ -1662,8 +1863,8 @@ radv_shader_variant_compile(struct radv_device *device, struct vk_shader_module options.key = *key; options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage); + options.remap_spi_ps_input = !radv_use_llvm_for_stage(device, stage); options.robust_buffer_access = device->robust_buffer_access; - options.disable_optimizations = disable_optimizations; options.wgp_mode = radv_should_use_wgp_mode(device, stage, info); return shader_variant_compile(device, module, shaders, shader_count, stage, info, &options, @@ -1680,8 +1881,9 @@ radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *shader gl_shader_stage stage = MESA_SHADER_VERTEX; options.explicit_scratch_args = !radv_use_llvm_for_stage(device, stage); + options.remap_spi_ps_input = !radv_use_llvm_for_stage(device, stage); options.key.has_multiview_view_index = multiview; - options.disable_optimizations = disable_optimizations; + options.key.optimisations_disabled = disable_optimizations; return shader_variant_compile(device, NULL, &shader, 1, stage, info, &options, true, false, keep_shader_info, keep_statistic_info, binary_out); @@ -1710,15 +1912,80 @@ radv_create_trap_handler_shader(struct radv_device *device) return shader; } +static struct radv_shader_prolog * +upload_vs_prolog(struct radv_device *device, struct radv_prolog_binary *bin, unsigned wave_size) +{ + struct radv_shader_prolog *prolog = malloc(sizeof(struct radv_shader_prolog)); + if (!prolog) + return NULL; + + prolog->alloc = alloc_shader_memory(device, bin->code_size, NULL); + if (!prolog->alloc) { + free(prolog); + return NULL; + } + + prolog->bo = prolog->alloc->arena->bo; + char *dest_ptr = prolog->alloc->arena->ptr + prolog->alloc->offset; + + memcpy(dest_ptr, bin->data, bin->code_size); + + prolog->rsrc1 = S_00B848_VGPRS((bin->num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) | + S_00B228_SGPRS((bin->num_sgprs - 1) / 8); + prolog->num_preserved_sgprs = bin->num_preserved_sgprs; + + return prolog; +} + +struct radv_shader_prolog * +radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_key *key) +{ + struct radv_nir_compiler_options options = {0}; + options.explicit_scratch_args = true; + options.family = device->physical_device->rad_info.family; + options.chip_class = device->physical_device->rad_info.chip_class; + options.info = &device->physical_device->rad_info; + options.address32_hi = device->physical_device->rad_info.address32_hi; + options.dump_shader = device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS; + + struct radv_shader_info info = {0}; + info.wave_size = key->wave32 ? 
32 : 64; + info.vs.needs_instance_id = true; + info.vs.needs_base_instance = true; + info.vs.needs_draw_id = true; + info.vs.use_per_attribute_vb_descs = true; + info.vs.vb_desc_usage_mask = BITFIELD_MASK(key->num_attributes); + info.vs.has_prolog = true; + info.vs.as_ls = key->as_ls; + info.is_ngg = key->is_ngg; + + struct radv_shader_args args = {0}; + radv_declare_shader_args(&options, &info, key->next_stage, key->next_stage != MESA_SHADER_VERTEX, + MESA_SHADER_VERTEX, &args); + +#ifdef LLVM_AVAILABLE + if (options.dump_shader) + ac_init_llvm_once(); +#endif + + struct radv_prolog_binary *binary = NULL; + aco_compile_vs_prolog(&options, &info, key, &args, &binary); + struct radv_shader_prolog *prolog = upload_vs_prolog(device, binary, info.wave_size); + if (prolog) { + prolog->nontrivial_divisors = key->state->nontrivial_divisors; + } + free(binary); + + return prolog; +} + void radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant) { if (!p_atomic_dec_zero(&variant->ref_count)) return; - mtx_lock(&device->shader_slab_mutex); - list_del(&variant->slab_list); - mtx_unlock(&device->shader_slab_mutex); + free_shader_memory(device, variant->alloc); free(variant->spirv); free(variant->nir_string); @@ -1728,6 +1995,49 @@ radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_varia free(variant); } +void +radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog) +{ + if (!prolog) + return; + + free_shader_memory(device, prolog->alloc); + free(prolog); +} + +uint64_t +radv_shader_variant_get_va(const struct radv_shader_variant *variant) +{ + return radv_buffer_get_va(variant->bo) + variant->alloc->offset; +} + +struct radv_shader_variant * +radv_find_shader_variant(struct radv_device *device, uint64_t pc) +{ + mtx_lock(&device->shader_arena_mutex); + list_for_each_entry(struct radv_shader_arena, arena, &device->shader_arenas, list) + { +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#endif + list_for_each_entry(union radv_shader_arena_block, block, &arena->entries, list) + { +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + uint64_t start = radv_buffer_get_va(block->arena->bo) + block->offset; + if (!block->freelist.prev && pc >= start && pc < start + block->size) { + mtx_unlock(&device->shader_arena_mutex); + return (struct radv_shader_variant *)block->freelist.next; + } + } + } + + mtx_unlock(&device->shader_arena_mutex); + return NULL; +} + const char * radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage) { @@ -1762,26 +2072,7 @@ radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage) } unsigned -radv_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, - const unsigned *sizes) -{ - switch (stage) { - case MESA_SHADER_TESS_CTRL: - return chip_class >= GFX7 ? 128 : 64; - case MESA_SHADER_GEOMETRY: - return chip_class >= GFX9 ? 
128 : 64; - case MESA_SHADER_COMPUTE: - break; - default: - return 0; - } - - unsigned max_workgroup_size = sizes[0] * sizes[1] * sizes[2]; - return max_workgroup_size; -} - -unsigned -radv_get_max_waves(struct radv_device *device, struct radv_shader_variant *variant, +radv_get_max_waves(const struct radv_device *device, struct radv_shader_variant *variant, gl_shader_stage stage) { struct radeon_info *info = &device->physical_device->rad_info; @@ -1798,8 +2089,7 @@ radv_get_max_waves(struct radv_device *device, struct radv_shader_variant *varia conf->lds_size * info->lds_encode_granularity + variant->info.ps.num_interp * 48; lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity); } else if (stage == MESA_SHADER_COMPUTE) { - unsigned max_workgroup_size = - radv_get_max_workgroup_size(chip_class, stage, variant->info.cs.block_size); + unsigned max_workgroup_size = variant->info.workgroup_size; lds_per_wave = align(conf->lds_size * info->lds_encode_granularity, info->lds_alloc_granularity); lds_per_wave /= DIV_ROUND_UP(max_workgroup_size, wave_size); @@ -1829,6 +2119,56 @@ radv_get_max_waves(struct radv_device *device, struct radv_shader_variant *varia return chip_class >= GFX10 ? max_simd_waves * (wave_size / 32) : max_simd_waves; } +unsigned +radv_compute_spi_ps_input(const struct radv_device *device, + const struct radv_shader_info *info) +{ + unsigned spi_ps_input; + + spi_ps_input = S_0286CC_PERSP_CENTER_ENA(info->ps.reads_persp_center) | + S_0286CC_PERSP_CENTROID_ENA(info->ps.reads_persp_centroid) | + S_0286CC_PERSP_SAMPLE_ENA(info->ps.reads_persp_sample) | + S_0286CC_LINEAR_CENTER_ENA(info->ps.reads_linear_center) | + S_0286CC_LINEAR_CENTROID_ENA(info->ps.reads_linear_centroid) | + S_0286CC_LINEAR_SAMPLE_ENA(info->ps.reads_linear_sample) | + S_0286CC_PERSP_PULL_MODEL_ENA(info->ps.reads_barycentric_model) | + S_0286CC_FRONT_FACE_ENA(info->ps.reads_front_face); + + if (info->ps.reads_frag_coord_mask || + info->ps.reads_sample_pos_mask) { + uint8_t mask = info->ps.reads_frag_coord_mask | info->ps.reads_sample_pos_mask; + + for (unsigned i = 0; i < 4; i++) { + if (mask & (1 << i)) + spi_ps_input |= S_0286CC_POS_X_FLOAT_ENA(1) << i; + } + + if (device->adjust_frag_coord_z && info->ps.reads_frag_coord_mask & (1 << 2)) { + spi_ps_input |= S_0286CC_ANCILLARY_ENA(1); + } + } + + if (info->ps.reads_sample_id || info->ps.reads_frag_shading_rate || info->ps.reads_sample_mask_in) { + spi_ps_input |= S_0286CC_ANCILLARY_ENA(1); + } + + if (info->ps.reads_sample_mask_in) { + spi_ps_input |= S_0286CC_SAMPLE_COVERAGE_ENA(1); + } + + if (G_0286CC_POS_W_FLOAT_ENA(spi_ps_input)) { + /* If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be enabled too */ + spi_ps_input |= S_0286CC_PERSP_CENTER_ENA(1); + } + + if (!(spi_ps_input & 0x7F)) { + /* At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled */ + spi_ps_input |= S_0286CC_PERSP_CENTER_ENA(1); + } + + return spi_ps_input; +} + VkResult radv_GetShaderInfoAMD(VkDevice _device, VkPipeline _pipeline, VkShaderStageFlagBits shaderStage, VkShaderInfoTypeAMD infoType, size_t *pInfoSize, void *pInfo) @@ -1842,7 +2182,7 @@ radv_GetShaderInfoAMD(VkDevice _device, VkPipeline _pipeline, VkShaderStageFlagB /* Spec doesn't indicate what to do if the stage is invalid, so just * return no info for this.
*/ if (!variant) - return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT); switch (infoType) { case VK_SHADER_INFO_TYPE_STATISTICS_AMD: @@ -1862,7 +2202,7 @@ radv_GetShaderInfoAMD(VkDevice _device, VkPipeline _pipeline, VkShaderStageFlagB if (stage == MESA_SHADER_COMPUTE) { unsigned *local_size = variant->info.cs.block_size; - unsigned workgroup_size = local_size[0] * local_size[1] * local_size[2]; + unsigned workgroup_size = pipeline->shaders[MESA_SHADER_COMPUTE]->info.workgroup_size; statistics.numAvailableVgprs = statistics.numPhysicalVgprs / @@ -1900,7 +2240,9 @@ radv_GetShaderInfoAMD(VkDevice _device, VkPipeline _pipeline, VkShaderStageFlagB fprintf(memf, "%s:\n", radv_get_shader_name(&variant->info, stage)); fprintf(memf, "%s\n\n", variant->ir_string); - fprintf(memf, "%s\n\n", variant->disasm_string); + if (variant->disasm_string) { + fprintf(memf, "%s\n\n", variant->disasm_string); + } radv_dump_shader_stats(device, pipeline, stage, memf); u_memstream_close(&mem); diff --git a/mesa 3D driver/src/amd/vulkan/radv_shader.h b/mesa 3D driver/src/amd/vulkan/radv_shader.h index cabf6845a8..9cabbfc21c 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_shader.h +++ b/mesa 3D driver/src/amd/vulkan/radv_shader.h @@ -41,81 +41,64 @@ #define RADV_VERT_ATTRIB_MAX MAX2(VERT_ATTRIB_MAX, VERT_ATTRIB_GENERIC0 + MAX_VERTEX_ATTRIBS) +struct radv_physical_device; struct radv_device; struct radv_pipeline; struct radv_pipeline_cache; struct radv_pipeline_key; +struct radv_vs_input_state; -struct radv_vs_out_key { - uint32_t as_es : 1; - uint32_t as_ls : 1; - uint32_t as_ngg : 1; - uint32_t as_ngg_passthrough : 1; - uint32_t export_prim_id : 1; - uint32_t export_layer_id : 1; - uint32_t export_clip_dists : 1; - uint32_t export_viewport_index : 1; +enum radv_vs_input_alpha_adjust { + ALPHA_ADJUST_NONE = 0, + ALPHA_ADJUST_SNORM = 1, + ALPHA_ADJUST_SSCALED = 2, + ALPHA_ADJUST_SINT = 3, }; -struct radv_vs_variant_key { - struct radv_vs_out_key out; +struct radv_pipeline_key { + uint32_t has_multiview_view_index : 1; + uint32_t optimisations_disabled : 1; + uint32_t invariant_geom : 1; + uint32_t use_ngg : 1; - uint32_t instance_rate_inputs; - uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; - uint8_t vertex_attribute_formats[MAX_VERTEX_ATTRIBS]; - uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS]; - uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS]; - uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS]; - uint8_t vertex_binding_align[MAX_VBS]; + struct { + uint32_t instance_rate_inputs; + uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; + uint8_t vertex_attribute_formats[MAX_VERTEX_ATTRIBS]; + uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS]; + uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS]; + uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS]; + uint8_t vertex_binding_align[MAX_VBS]; + enum radv_vs_input_alpha_adjust vertex_alpha_adjust[MAX_VERTEX_ATTRIBS]; + uint32_t vertex_post_shuffle; + uint32_t provoking_vtx_last : 1; + uint32_t dynamic_input_state : 1; + uint8_t topology; + } vs; - /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. - * so we may need to fix it up. */ - enum ac_fetch_format alpha_adjust[MAX_VERTEX_ATTRIBS]; + struct { + unsigned tess_input_vertices; + } tcs; - /* For some formats the channels have to be shuffled. 
*/ - uint32_t post_shuffle; + struct { + uint32_t col_format; + uint32_t is_int8; + uint32_t is_int10; + uint8_t log2_ps_iter_samples; + uint8_t num_samples; - /* Output primitive type. */ - uint8_t outprim; + bool lower_discard_to_demote; + bool enable_mrt_output_nan_fixup; + uint8_t force_vrs; + } ps; - /* Provoking vertex mode. */ - bool provoking_vtx_last; -}; - -struct radv_tes_variant_key { - struct radv_vs_out_key out; -}; - -struct radv_tcs_variant_key { - struct radv_vs_variant_key vs_key; - unsigned primitive_mode; - unsigned input_vertices; -}; - -struct radv_fs_variant_key { - uint32_t col_format; - uint8_t log2_ps_iter_samples; - uint8_t num_samples; - uint32_t is_int8; - uint32_t is_int10; -}; - -struct radv_cs_variant_key { - uint8_t subgroup_size; -}; - -struct radv_shader_variant_key { - union { - struct radv_vs_variant_key vs; - struct radv_fs_variant_key fs; - struct radv_tes_variant_key tes; - struct radv_tcs_variant_key tcs; - struct radv_cs_variant_key cs; - - /* A common prefix of the vs and tes keys. */ - struct radv_vs_out_key vs_common_out; - }; - bool has_multiview_view_index; + struct { + /* Non-zero if a required subgroup size is specified via + * VK_EXT_subgroup_size_control. + */ + uint8_t compute_subgroup_size; + bool require_full_subgroups; + } cs; }; enum radv_compiler_debug_level { @@ -125,9 +108,8 @@ enum radv_compiler_debug_level { struct radv_nir_compiler_options { struct radv_pipeline_layout *layout; - struct radv_shader_variant_key key; + struct radv_pipeline_key key; bool explicit_scratch_args; - bool clamp_shadow_reference; bool robust_buffer_access; bool adjust_frag_coord_z; bool dump_shader; @@ -137,14 +119,12 @@ struct radv_nir_compiler_options { bool check_ir; bool has_ls_vgpr_init_bug; bool has_image_load_dcc_bug; - bool use_ngg_streamout; bool enable_mrt_output_nan_fixup; - bool disable_optimizations; /* only used by ACO */ bool wgp_mode; + bool remap_spi_ps_input; enum radeon_family family; enum chip_class chip_class; const struct radeon_info *info; - uint32_t tess_offchip_block_dw_size; uint32_t address32_hi; uint8_t force_vrs_rates; @@ -167,10 +147,12 @@ enum radv_ud_index { AC_UD_SHADER_START = 9, AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, AC_UD_VS_BASE_VERTEX_START_INSTANCE, + AC_UD_VS_PROLOG_INPUTS, AC_UD_VS_MAX_UD, AC_UD_PS_MAX_UD, AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START, AC_UD_CS_SBT_DESCRIPTORS, + AC_UD_CS_RAY_LAUNCH_SIZE, AC_UD_CS_MAX_UD, AC_UD_GS_MAX_UD, AC_UD_TCS_MAX_UD, @@ -214,6 +196,7 @@ struct radv_vs_output_info { bool writes_viewport_index; bool writes_primitive_shading_rate; bool export_prim_id; + bool export_clip_dists; unsigned pos_exports; }; @@ -247,10 +230,8 @@ struct radv_shader_info { uint8_t max_push_constant_used; bool has_only_32bit_push_constants; bool has_indirect_push_constants; - uint8_t num_inline_push_consts; - uint8_t base_inline_push_consts; uint32_t desc_set_used_mask; - bool needs_multiview_view_index; + bool uses_view_index; bool uses_invocation_id; bool uses_prim_id; uint8_t wave_size; @@ -259,14 +240,13 @@ struct radv_shader_info { unsigned num_user_sgprs; unsigned num_input_sgprs; unsigned num_input_vgprs; - unsigned private_mem_vgprs; - bool need_indirect_descriptor_sets; bool is_ngg; bool is_ngg_passthrough; bool has_ngg_culling; bool has_ngg_early_prim_export; uint32_t num_lds_blocks_when_not_culling; uint32_t num_tess_patches; + unsigned workgroup_size; struct { uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX]; uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; @@ -276,13 +256,14 @@ struct 
radv_shader_info { struct radv_es_output_info es_info; bool as_es; bool as_ls; - bool export_prim_id; bool tcs_in_out_eq; uint64_t tcs_temp_only_input_mask; uint8_t num_linked_outputs; bool needs_base_instance; bool use_per_attribute_vb_descs; uint32_t vb_desc_usage_mask; + bool has_prolog; + bool dynamic_inputs; } vs; struct { uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; @@ -307,7 +288,6 @@ struct radv_shader_info { enum gl_tess_spacing spacing; bool ccw; bool point_mode; - bool export_prim_id; uint8_t num_linked_inputs; uint8_t num_linked_patch_inputs; uint8_t num_linked_outputs; @@ -333,9 +313,21 @@ struct radv_shader_info { bool early_fragment_test; bool post_depth_coverage; bool reads_sample_mask_in; + bool reads_front_face; + bool reads_sample_id; + bool reads_frag_shading_rate; + bool reads_barycentric_model; + bool reads_persp_sample; + bool reads_persp_center; + bool reads_persp_centroid; + bool reads_linear_sample; + bool reads_linear_center; + bool reads_linear_centroid; + uint8_t reads_frag_coord_mask; + uint8_t reads_sample_pos_mask; uint8_t depth_layout; - bool uses_persp_or_linear_interp; bool allow_flat_shading; + unsigned spi_ps_input; } ps; struct { bool uses_grid_size; @@ -344,7 +336,10 @@ struct radv_shader_info { bool uses_local_invocation_idx; unsigned block_size[3]; + uint8_t subgroup_size; + bool uses_sbt; + bool uses_ray_launch_size; } cs; struct { uint64_t tes_inputs_read; @@ -361,8 +356,38 @@ struct radv_shader_info { struct gfx9_gs_info gs_ring_info; struct gfx10_ngg_info ngg_info; +}; - unsigned float_controls_mode; +struct radv_vs_input_state { + uint32_t attribute_mask; + uint32_t misaligned_mask; + uint32_t possibly_misaligned_mask; + + uint32_t instance_rate_inputs; + uint32_t nontrivial_divisors; + uint32_t post_shuffle; + /* Having two separate fields instead of a single uint64_t makes it easier to remove attributes + * using bitwise arithmetic. + */ + uint32_t alpha_adjust_lo; + uint32_t alpha_adjust_hi; + + uint8_t bindings[MAX_VERTEX_ATTRIBS]; + uint32_t divisors[MAX_VERTEX_ATTRIBS]; + uint32_t offsets[MAX_VERTEX_ATTRIBS]; + uint8_t formats[MAX_VERTEX_ATTRIBS]; + uint8_t format_align_req_minus_1[MAX_VERTEX_ATTRIBS]; + uint8_t format_sizes[MAX_VERTEX_ATTRIBS]; +}; + +struct radv_vs_prolog_key { + const struct radv_vs_input_state *state; + unsigned num_attributes; + uint32_t misaligned_mask; + bool as_ls; + bool is_ngg; + bool wave32; + gl_shader_stage next_stage; }; enum radv_shader_binary_type { RADV_BINARY_TYPE_LEGACY, RADV_BINARY_TYPE_RTLD }; @@ -372,6 +397,7 @@ struct radv_shader_binary { gl_shader_stage stage; bool is_gs_copy_shader; + struct ac_shader_config config; struct radv_shader_info info; /* Self-referential size so we avoid consistency issues. */ @@ -380,7 +406,6 @@ struct radv_shader_binary { struct radv_shader_binary_legacy { struct radv_shader_binary base; - struct ac_shader_config config; unsigned code_size; unsigned exec_size; unsigned ir_size; @@ -399,11 +424,41 @@ struct radv_shader_binary_rtld { uint8_t data[0]; }; +struct radv_prolog_binary { + uint8_t num_sgprs; + uint8_t num_vgprs; + uint8_t num_preserved_sgprs; + unsigned code_size; + uint8_t data[0]; +}; + +struct radv_shader_arena { + struct list_head list; + struct list_head entries; + struct radeon_winsys_bo *bo; + char *ptr; +}; + +union radv_shader_arena_block { + struct list_head pool; + struct { + /* List of blocks in the arena, sorted by address. */ + struct list_head list; + /* For holes, a list_head for the free-list. 
For allocations, freelist.prev=NULL and + * freelist.next is a pointer associated with the allocation. + */ + struct list_head freelist; + struct radv_shader_arena *arena; + uint32_t offset; + uint32_t size; + }; +}; + struct radv_shader_variant { uint32_t ref_count; struct radeon_winsys_bo *bo; - uint64_t bo_offset; + union radv_shader_arena_block *alloc; struct ac_shader_config config; uint8_t *code_ptr; uint32_t code_size; @@ -417,16 +472,14 @@ struct radv_shader_variant { char *disasm_string; char *ir_string; uint32_t *statistics; - - struct list_head slab_list; }; -struct radv_shader_slab { - struct list_head slabs; - struct list_head shaders; +struct radv_shader_prolog { struct radeon_winsys_bo *bo; - uint64_t size; - char *ptr; + union radv_shader_arena_block *alloc; + uint32_t rsrc1; + uint8_t num_preserved_sgprs; + bool nontrivial_divisors; }; void radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader, @@ -437,27 +490,29 @@ bool radv_nir_lower_ycbcr_textures(nir_shader *shader, const struct radv_pipelin nir_shader *radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *module, const char *entrypoint_name, gl_shader_stage stage, const VkSpecializationInfo *spec_info, - const VkPipelineCreateFlags flags, const struct radv_pipeline_layout *layout, const struct radv_pipeline_key *key); -void radv_destroy_shader_slabs(struct radv_device *device); +void radv_init_shader_arenas(struct radv_device *device); +void radv_destroy_shader_arenas(struct radv_device *device); -VkResult radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, - struct radv_pipeline_cache *cache, const struct radv_pipeline_key *key, +VkResult radv_create_shaders(struct radv_pipeline *pipeline, + struct radv_pipeline_layout *pipeline_layout, + struct radv_device *device, struct radv_pipeline_cache *cache, + const struct radv_pipeline_key *key, const VkPipelineShaderStageCreateInfo **pStages, - const VkPipelineCreateFlags flags, + const VkPipelineCreateFlags flags, const uint8_t *custom_hash, VkPipelineCreationFeedbackEXT *pipeline_feedback, VkPipelineCreationFeedbackEXT **stage_feedbacks); struct radv_shader_variant *radv_shader_variant_create(struct radv_device *device, const struct radv_shader_binary *binary, - bool keep_shader_info); + bool keep_shader_info, bool from_cache); struct radv_shader_variant *radv_shader_variant_compile( struct radv_device *device, struct vk_shader_module *module, struct nir_shader *const *shaders, - int shader_count, struct radv_pipeline_layout *layout, const struct radv_shader_variant_key *key, + int shader_count, struct radv_pipeline_layout *layout, const struct radv_pipeline_key *key, struct radv_shader_info *info, bool keep_shader_info, bool keep_statistic_info, - bool disable_optimizations, struct radv_shader_binary **binary_out); + struct radv_shader_binary **binary_out); struct radv_shader_variant * radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir, @@ -467,16 +522,24 @@ radv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir, struct radv_shader_variant *radv_create_trap_handler_shader(struct radv_device *device); +struct radv_shader_prolog *radv_create_vs_prolog(struct radv_device *device, + const struct radv_vs_prolog_key *key); + void radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant); -unsigned radv_get_max_waves(struct radv_device *device, struct radv_shader_variant *variant, +void radv_prolog_destroy(struct 
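The union in radv_shader_arena_block packs two roles into one node: on the unused-block pool it is just a list_head, while inside an arena it carries offset/size plus a second list_head whose prev pointer doubles as the hole-versus-allocation discriminator, exactly as the comment above describes. A minimal sketch of that discriminator, assuming a conventional doubly linked list_head:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct sketch_list_head {
   struct sketch_list_head *prev, *next;
};

struct sketch_arena_block {
   struct sketch_list_head list;     /* blocks in the arena, by address */
   struct sketch_list_head freelist; /* hole: free-list node; alloc: prev == NULL */
   uint32_t offset;
   uint32_t size;
};

/* Holes sit on a free-list; allocations set freelist.prev to NULL and reuse
 * freelist.next as an opaque per-allocation pointer.
 */
static bool
sketch_block_is_hole(const struct sketch_arena_block *block)
{
   return block->freelist.prev != NULL;
}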
radv_device *device, struct radv_shader_prolog *prolog); + +uint64_t radv_shader_variant_get_va(const struct radv_shader_variant *variant); +struct radv_shader_variant *radv_find_shader_variant(struct radv_device *device, uint64_t pc); + +unsigned radv_get_max_waves(const struct radv_device *device, struct radv_shader_variant *variant, gl_shader_stage stage); -unsigned radv_get_max_workgroup_size(enum chip_class chip_class, gl_shader_stage stage, - const unsigned *sizes); - const char *radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage); +unsigned radv_compute_spi_ps_input(const struct radv_device *device, + const struct radv_shader_info *info); + bool radv_can_dump_shader(struct radv_device *device, struct vk_shader_module *module, bool meta_shader); @@ -565,15 +628,16 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver void radv_lower_io(struct radv_device *device, nir_shader *nir); bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir, - struct radv_shader_info *info, const struct radv_pipeline_key *pl_key); + const struct radv_shader_info *info, const struct radv_pipeline_key *pl_key); void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, - struct radv_shader_info *info, - const struct radv_pipeline_key *pl_key, - struct radv_shader_variant_key *key, - bool consider_culling); + const struct radv_shader_info *info, + const struct radv_pipeline_key *pl_key); bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir, - uint64_t ps_inputs_read); + uint64_t ps_inputs_read, unsigned num_vertices_per_primitive, + const struct radv_shader_info *info); + +void radv_get_nir_options(struct radv_physical_device *device); #endif diff --git a/mesa 3D driver/src/amd/vulkan/radv_shader_args.c b/mesa 3D driver/src/amd/vulkan/radv_shader_args.c index 773a236456..18efc9f11d 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_shader_args.c +++ b/mesa 3D driver/src/amd/vulkan/radv_shader_args.c @@ -39,26 +39,26 @@ set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs } static void -set_loc_shader(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx, uint8_t num_sgprs) +set_loc_shader(struct radv_shader_info *info, int idx, uint8_t *sgpr_idx, uint8_t num_sgprs) { - struct radv_userdata_info *ud_info = &args->shader_info->user_sgprs_locs.shader_data[idx]; + struct radv_userdata_info *ud_info = &info->user_sgprs_locs.shader_data[idx]; assert(ud_info); set_loc(ud_info, sgpr_idx, num_sgprs); } static void -set_loc_shader_ptr(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) +set_loc_shader_ptr(struct radv_shader_info*info, int idx, uint8_t *sgpr_idx) { bool use_32bit_pointers = idx != AC_UD_SCRATCH_RING_OFFSETS; - set_loc_shader(args, idx, sgpr_idx, use_32bit_pointers ? 1 : 2); + set_loc_shader(info, idx, sgpr_idx, use_32bit_pointers ? 
1 : 2); } static void -set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) +set_loc_desc(struct radv_shader_info *info, int idx, uint8_t *sgpr_idx) { - struct radv_userdata_locations *locs = &args->shader_info->user_sgprs_locs; + struct radv_userdata_locations *locs = &info->user_sgprs_locs; struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx]; assert(ud_info); @@ -70,30 +70,33 @@ set_loc_desc(struct radv_shader_args *args, int idx, uint8_t *sgpr_idx) struct user_sgpr_info { bool indirect_all_descriptor_sets; uint8_t remaining_sgprs; + unsigned num_inline_push_consts; + bool inlined_all_push_consts; }; static bool -needs_view_index_sgpr(struct radv_shader_args *args, gl_shader_stage stage) +needs_view_index_sgpr(const struct radv_nir_compiler_options *options, + const struct radv_shader_info *info, gl_shader_stage stage) { switch (stage) { case MESA_SHADER_VERTEX: - if (args->shader_info->needs_multiview_view_index || - (!args->options->key.vs_common_out.as_es && !args->options->key.vs_common_out.as_ls && - args->options->key.has_multiview_view_index)) + if (info->uses_view_index || + (!info->vs.as_es && !info->vs.as_ls && + options->key.has_multiview_view_index)) return true; break; case MESA_SHADER_TESS_EVAL: - if (args->shader_info->needs_multiview_view_index || - (!args->options->key.vs_common_out.as_es && args->options->key.has_multiview_view_index)) + if (info->uses_view_index || + (!info->tes.as_es && options->key.has_multiview_view_index)) return true; break; case MESA_SHADER_TESS_CTRL: - if (args->shader_info->needs_multiview_view_index) + if (info->uses_view_index) return true; break; case MESA_SHADER_GEOMETRY: - if (args->shader_info->needs_multiview_view_index || - (args->options->key.vs_common_out.as_ngg && args->options->key.has_multiview_view_index)) + if (info->uses_view_index || + (info->is_ngg && options->key.has_multiview_view_index)) return true; break; default: @@ -103,79 +106,79 @@ needs_view_index_sgpr(struct radv_shader_args *args, gl_shader_stage stage) } static uint8_t -count_vs_user_sgprs(struct radv_shader_args *args) +count_vs_user_sgprs(const struct radv_shader_info *info) { uint8_t count = 1; /* vertex offset */ - if (args->shader_info->vs.vb_desc_usage_mask) + if (info->vs.vb_desc_usage_mask) count++; - if (args->shader_info->vs.needs_draw_id) + if (info->vs.needs_draw_id) count++; - if (args->shader_info->vs.needs_base_instance) + if (info->vs.needs_base_instance) count++; return count; } static unsigned -count_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage) +count_ngg_sgprs(const struct radv_shader_info *info, bool has_api_gs) { unsigned count = 0; - if (stage == MESA_SHADER_GEOMETRY) + if (has_api_gs) count += 1; /* ngg_gs_state */ - if (args->shader_info->has_ngg_culling) + if (info->has_ngg_culling) count += 5; /* ngg_culling_settings + 4x ngg_viewport_* */ return count; } static void -allocate_inline_push_consts(struct radv_shader_args *args, struct user_sgpr_info *user_sgpr_info) +allocate_inline_push_consts(const struct radv_shader_info *info, + struct user_sgpr_info *user_sgpr_info) { uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; /* Only supported if shaders use push constants. */ - if (args->shader_info->min_push_constant_used == UINT8_MAX) + if (info->min_push_constant_used == UINT8_MAX) return; /* Only supported if shaders don't have indirect push constants. 
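The set_loc helpers above are simple cursor bookkeeping: each piece of user data records which user SGPR slot it starts at and how many dwords it occupies, and the cursor advances accordingly. A self-contained sketch with invented names (the real struct lives in the radv userdata headers):

#include <stdint.h>

struct sketch_userdata_info {
   int8_t sgpr_idx;   /* -1 while unassigned */
   uint8_t num_sgprs;
};

/* Assign the next free user SGPR slot(s) and advance the cursor. */
static void
sketch_set_loc(struct sketch_userdata_info *ud, uint8_t *sgpr_idx,
               uint8_t num_sgprs)
{
   ud->sgpr_idx = (int8_t)*sgpr_idx;
   ud->num_sgprs = num_sgprs;
   *sgpr_idx += num_sgprs;
}

/* 32-bit descriptor pointers take one SGPR; the 64-bit scratch/ring-offsets
 * pointer takes two, mirroring set_loc_shader_ptr above.
 */
static void
sketch_set_loc_ptr(struct sketch_userdata_info *ud, uint8_t *sgpr_idx,
                   int use_32bit_pointer)
{
   sketch_set_loc(ud, sgpr_idx, use_32bit_pointer ? 1 : 2);
}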
*/ - if (args->shader_info->has_indirect_push_constants) + if (info->has_indirect_push_constants) return; /* Only supported for 32-bit push constants. */ - if (!args->shader_info->has_only_32bit_push_constants) + if (!info->has_only_32bit_push_constants) return; uint8_t num_push_consts = - (args->shader_info->max_push_constant_used - args->shader_info->min_push_constant_used) / 4; + (info->max_push_constant_used - info->min_push_constant_used) / 4; /* Check if the number of user SGPRs is large enough. */ if (num_push_consts < remaining_sgprs) { - args->shader_info->num_inline_push_consts = num_push_consts; + user_sgpr_info->num_inline_push_consts = num_push_consts; } else { - args->shader_info->num_inline_push_consts = remaining_sgprs; + user_sgpr_info->num_inline_push_consts = remaining_sgprs; } /* Clamp to the maximum number of allowed inlined push constants. */ - if (args->shader_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) - args->shader_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; + if (user_sgpr_info->num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) + user_sgpr_info->num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; - if (args->shader_info->num_inline_push_consts == num_push_consts && - !args->shader_info->loads_dynamic_offsets) { + if (user_sgpr_info->num_inline_push_consts == num_push_consts && + !info->loads_dynamic_offsets) { /* Disable the default push constants path if all constants are * inlined and if shaders don't use dynamic descriptors. */ - args->shader_info->loads_push_constants = false; + user_sgpr_info->inlined_all_push_consts = true; } - - args->shader_info->base_inline_push_consts = args->shader_info->min_push_constant_used / 4; } static void -allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool has_previous_stage, - gl_shader_stage previous_stage, bool needs_view_index, - struct user_sgpr_info *user_sgpr_info) +allocate_user_sgprs(const struct radv_nir_compiler_options *options, + const struct radv_shader_info *info, gl_shader_stage stage, + bool has_previous_stage, gl_shader_stage previous_stage, bool needs_view_index, + bool has_api_gs, bool is_gs_copy_shader, struct user_sgpr_info *user_sgpr_info) { uint8_t user_sgpr_count = 0; @@ -184,39 +187,40 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h /* 2 user sgprs will always be allocated for scratch/rings */ user_sgpr_count += 2; + /* prolog inputs */ + if (info->vs.has_prolog) + user_sgpr_count += 2; + switch (stage) { case MESA_SHADER_COMPUTE: - if (args->shader_info->cs.uses_sbt) + if (info->cs.uses_sbt) user_sgpr_count += 1; - if (args->shader_info->cs.uses_grid_size) + if (info->cs.uses_grid_size) + user_sgpr_count += 3; + if (info->cs.uses_ray_launch_size) user_sgpr_count += 3; break; case MESA_SHADER_FRAGMENT: - user_sgpr_count += args->shader_info->ps.needs_sample_positions; break; case MESA_SHADER_VERTEX: - if (!args->is_gs_copy_shader) - user_sgpr_count += count_vs_user_sgprs(args); - if (args->options->key.vs_common_out.as_ngg) - user_sgpr_count += count_ngg_sgprs(args, stage); + if (!is_gs_copy_shader) + user_sgpr_count += count_vs_user_sgprs(info); break; case MESA_SHADER_TESS_CTRL: if (has_previous_stage) { if (previous_stage == MESA_SHADER_VERTEX) - user_sgpr_count += count_vs_user_sgprs(args); + user_sgpr_count += count_vs_user_sgprs(info); } break; case MESA_SHADER_TESS_EVAL: - if (args->options->key.vs_common_out.as_ngg) - user_sgpr_count += count_ngg_sgprs(args, stage); break; case MESA_SHADER_GEOMETRY: if 
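The inline-push-constant decision now lives in user_sgpr_info rather than being written back into shader_info. A sketch of the sizing flow, assuming the 32-bit-only and no-indirect-access preconditions have already been checked, and with SKETCH_MAX_INLINE_PUSH_CONSTS standing in for AC_MAX_INLINE_PUSH_CONSTS (whose value is not shown in this hunk):

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_MAX_INLINE_PUSH_CONSTS 8 /* stand-in, see AC_MAX_INLINE_PUSH_CONSTS */

struct sketch_inline_pc {
   unsigned num_inline;
   bool inlined_all;
};

static struct sketch_inline_pc
sketch_alloc_inline_push_consts(uint8_t min_used, uint8_t max_used,
                                uint8_t remaining_sgprs,
                                bool loads_dynamic_offsets)
{
   struct sketch_inline_pc r = {0, false};
   unsigned num_push_consts = (max_used - min_used) / 4;

   /* Take as many dwords as fit in the leftover user SGPRs... */
   r.num_inline = num_push_consts < remaining_sgprs ? num_push_consts
                                                    : remaining_sgprs;
   /* ...capped by the backend limit. */
   if (r.num_inline > SKETCH_MAX_INLINE_PUSH_CONSTS)
      r.num_inline = SKETCH_MAX_INLINE_PUSH_CONSTS;

   /* The pointer-based path is only dropped when everything was inlined and
    * no dynamic descriptor offsets are loaded through it.
    */
   r.inlined_all = r.num_inline == num_push_consts && !loads_dynamic_offsets;
   return r;
}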
(has_previous_stage) { - if (args->options->key.vs_common_out.as_ngg) - user_sgpr_count += count_ngg_sgprs(args, stage); + if (info->is_ngg) + user_sgpr_count += count_ngg_sgprs(info, has_api_gs); if (previous_stage == MESA_SHADER_VERTEX) { - user_sgpr_count += count_vs_user_sgprs(args); + user_sgpr_count += count_vs_user_sgprs(info); } } break; @@ -227,16 +231,16 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h if (needs_view_index) user_sgpr_count++; - if (args->shader_info->loads_push_constants) + if (info->loads_push_constants) user_sgpr_count++; - if (args->shader_info->so.num_outputs) + if (info->so.num_outputs) user_sgpr_count++; uint32_t available_sgprs = - args->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; + options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16; uint32_t remaining_sgprs = available_sgprs - user_sgpr_count; - uint32_t num_desc_set = util_bitcount(args->shader_info->desc_set_used_mask); + uint32_t num_desc_set = util_bitcount(info->desc_set_used_mask); if (remaining_sgprs < num_desc_set) { user_sgpr_info->indirect_all_descriptor_sets = true; @@ -245,16 +249,17 @@ allocate_user_sgprs(struct radv_shader_args *args, gl_shader_stage stage, bool h user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set; } - allocate_inline_push_consts(args, user_sgpr_info); + allocate_inline_push_consts(info, user_sgpr_info); } static void -declare_global_input_sgprs(struct radv_shader_args *args, - const struct user_sgpr_info *user_sgpr_info) +declare_global_input_sgprs(const struct radv_shader_info *info, + const struct user_sgpr_info *user_sgpr_info, + struct radv_shader_args *args) { /* 1 for each descriptor set */ if (!user_sgpr_info->indirect_all_descriptor_sets) { - uint32_t mask = args->shader_info->desc_set_used_mask; + uint32_t mask = info->desc_set_used_mask; while (mask) { int i = u_bit_scan(&mask); @@ -265,49 +270,53 @@ declare_global_input_sgprs(struct radv_shader_args *args, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR_PTR, &args->descriptor_sets[0]); } - if (args->shader_info->loads_push_constants) { + if (info->loads_push_constants && !user_sgpr_info->inlined_all_push_consts) { /* 1 for push constants and dynamic descriptors */ ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants); } - for (unsigned i = 0; i < args->shader_info->num_inline_push_consts; i++) { + for (unsigned i = 0; i < user_sgpr_info->num_inline_push_consts; i++) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.inline_push_consts[i]); } - args->ac.num_inline_push_consts = args->shader_info->num_inline_push_consts; - args->ac.base_inline_push_consts = args->shader_info->base_inline_push_consts; + args->ac.base_inline_push_consts = info->min_push_constant_used / 4; - if (args->shader_info->so.num_outputs) { + if (info->so.num_outputs) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->streamout_buffers); } } static void -declare_vs_specific_input_sgprs(struct radv_shader_args *args, gl_shader_stage stage, - bool has_previous_stage, gl_shader_stage previous_stage) +declare_vs_specific_input_sgprs(const struct radv_shader_info *info, struct radv_shader_args *args, + gl_shader_stage stage, bool has_previous_stage, + gl_shader_stage previous_stage) { + if (info->vs.has_prolog) + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_INT, &args->prolog_inputs); + if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == 
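The tail of allocate_user_sgprs compares the leftover budget against the popcount of the descriptor-set mask to decide between one SGPR per set and a single indirect pointer. A sketch of that decision; the true-branch bookkeeping is elided in the hunk above, so the one-SGPR cost charged for the indirect pointer here is an assumption.

#include <stdbool.h>
#include <stdint.h>

/* GFX9+ merged non-compute stages have 32 user SGPRs, everything else 16. */
static bool
sketch_indirect_all_descriptor_sets(bool gfx9_plus, bool is_compute,
                                    uint8_t user_sgpr_count,
                                    uint32_t desc_set_used_mask,
                                    uint8_t *remaining_out)
{
   uint32_t available = (gfx9_plus && !is_compute) ? 32 : 16;
   uint32_t remaining = available - user_sgpr_count;
   uint32_t num_desc_sets = (uint32_t)__builtin_popcount(desc_set_used_mask);

   if (remaining < num_desc_sets) {
      *remaining_out = (uint8_t)(remaining - 1); /* assumed: 1 SGPR for the indirect ptr */
      return true;
   }
   *remaining_out = (uint8_t)(remaining - num_desc_sets);
   return false;
}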
MESA_SHADER_VERTEX))) { - if (args->shader_info->vs.vb_desc_usage_mask) { + if (info->vs.vb_desc_usage_mask) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers); } ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex); - if (args->shader_info->vs.needs_draw_id) { + if (info->vs.needs_draw_id) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id); } - if (args->shader_info->vs.needs_base_instance) { + if (info->vs.needs_base_instance) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance); } } } static void -declare_vs_input_vgprs(struct radv_shader_args *args) +declare_vs_input_vgprs(const struct radv_nir_compiler_options *options, + const struct radv_shader_info *info, struct radv_shader_args *args) { ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id); if (!args->is_gs_copy_shader) { - if (args->options->key.vs_common_out.as_ls) { + if (info->vs.as_ls) { ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id); - if (args->options->chip_class >= GFX10) { + if (options->chip_class >= GFX10) { ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */ ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); } else { @@ -315,8 +324,8 @@ declare_vs_input_vgprs(struct radv_shader_args *args) ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ } } else { - if (args->options->chip_class >= GFX10) { - if (args->options->key.vs_common_out.as_ngg) { + if (options->chip_class >= GFX10) { + if (info->is_ngg) { ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */ ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user vgpr */ ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id); @@ -332,21 +341,27 @@ declare_vs_input_vgprs(struct radv_shader_args *args) } } } + + if (info->vs.dynamic_inputs) { + assert(info->vs.use_per_attribute_vb_descs); + unsigned num_attributes = util_last_bit(info->vs.vb_desc_usage_mask); + for (unsigned i = 0; i < num_attributes; i++) + ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_INT, &args->vs_inputs[i]); + /* Ensure the main shader doesn't use less vgprs than the prolog. The prolog requires one + * VGPR more than the number of shader arguments in the case of non-trivial divisors on GFX8. + */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); + } } static void -declare_streamout_sgprs(struct radv_shader_args *args, gl_shader_stage stage) +declare_streamout_sgprs(const struct radv_shader_info *info, struct radv_shader_args *args, + gl_shader_stage stage) { int i; - if (args->options->use_ngg_streamout) { - if (stage == MESA_SHADER_TESS_EVAL) - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - return; - } - /* Streamout SGPRs. */ - if (args->shader_info->so.num_outputs) { + if (info->so.num_outputs) { assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config); @@ -357,7 +372,7 @@ declare_streamout_sgprs(struct radv_shader_args *args, gl_shader_stage stage) /* A streamout buffer offset is loaded if the stride is non-zero. 
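With dynamic vertex input state, the prolog hands each attribute to the main shader as a vec4 VGPR argument, plus one spare VGPR so the main shader never uses fewer VGPRs than a GFX8 prolog with non-trivial divisors (per the comment in the hunk above). A sketch of the resulting VGPR count, using __builtin_clz as a stand-in for util_last_bit:

#include <stdint.h>

static unsigned
sketch_num_vs_input_vgprs(uint32_t vb_desc_usage_mask)
{
   /* util_last_bit semantics: index of the highest set bit, plus one. */
   unsigned num_attributes =
      vb_desc_usage_mask ? 32 - (unsigned)__builtin_clz(vb_desc_usage_mask) : 0;

   /* 4 VGPRs per attribute, plus the GFX8 prolog padding VGPR. */
   return num_attributes * 4 + 1;
}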
*/ for (i = 0; i < 4; i++) { - if (!args->shader_info->so.strides[i]) + if (!info->so.strides[i]) continue; ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]); @@ -374,13 +389,61 @@ declare_tes_input_vgprs(struct radv_shader_args *args) } static void -declare_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage) +declare_ps_input_vgprs(const struct radv_shader_info *info, struct radv_shader_args *args, + bool remap_spi_ps_input) { - if (stage == MESA_SHADER_GEOMETRY) { + unsigned spi_ps_input = info->ps.spi_ps_input; + + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid); + ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.pull_model); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center); + ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); /* line stipple tex */ + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.sample_coverage); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* fixed pt */ + + if (remap_spi_ps_input) { + /* LLVM optimizes away unused FS inputs and computes spi_ps_input_addr itself and then + * communicates the results back via the ELF binary. Mirror what LLVM does by re-mapping the + * VGPR arguments here. 
+ */ + unsigned arg_count = 0; + for (unsigned i = 0, vgpr_arg = 0, vgpr_reg = 0; i < args->ac.arg_count; i++) { + if (args->ac.args[i].file != AC_ARG_VGPR) { + arg_count++; + continue; + } + + if (!(spi_ps_input & (1 << vgpr_arg))) { + args->ac.args[i].skip = true; + } else { + args->ac.args[i].offset = vgpr_reg; + vgpr_reg += args->ac.args[i].size; + arg_count++; + } + vgpr_arg++; + } + } +} + +static void +declare_ngg_sgprs(const struct radv_shader_info *info, struct radv_shader_args *args, + bool has_api_gs) +{ + if (has_api_gs) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_gs_state); } - if (args->shader_info->has_ngg_culling) { + if (info->has_ngg_culling) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_culling_settings); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_scale[0]); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ngg_viewport_scale[1]); @@ -390,69 +453,55 @@ declare_ngg_sgprs(struct radv_shader_args *args, gl_shader_stage stage) } static void -set_global_input_locs(struct radv_shader_args *args, const struct user_sgpr_info *user_sgpr_info, - uint8_t *user_sgpr_idx) +set_global_input_locs(struct radv_shader_info *info, struct radv_shader_args *args, + const struct user_sgpr_info *user_sgpr_info, uint8_t *user_sgpr_idx) { - uint32_t mask = args->shader_info->desc_set_used_mask; + unsigned num_inline_push_consts = 0; if (!user_sgpr_info->indirect_all_descriptor_sets) { - while (mask) { - int i = u_bit_scan(&mask); - - set_loc_desc(args, i, user_sgpr_idx); + for (unsigned i = 0; i < ARRAY_SIZE(args->descriptor_sets); i++) { + if (args->descriptor_sets[i].used) + set_loc_desc(info, i, user_sgpr_idx); } } else { - set_loc_shader_ptr(args, AC_UD_INDIRECT_DESCRIPTOR_SETS, user_sgpr_idx); - - args->shader_info->need_indirect_descriptor_sets = true; + set_loc_shader_ptr(info, AC_UD_INDIRECT_DESCRIPTOR_SETS, user_sgpr_idx); } - if (args->shader_info->loads_push_constants) { - set_loc_shader_ptr(args, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); + if (args->ac.push_constants.used) { + set_loc_shader_ptr(info, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); } - if (args->shader_info->num_inline_push_consts) { - set_loc_shader(args, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, - args->shader_info->num_inline_push_consts); + for (unsigned i = 0; i < ARRAY_SIZE(args->ac.inline_push_consts); i++) { + if (args->ac.inline_push_consts[i].used) + num_inline_push_consts++; + } + + if (num_inline_push_consts) { + set_loc_shader(info, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, num_inline_push_consts); } if (args->streamout_buffers.used) { - set_loc_shader_ptr(args, AC_UD_STREAMOUT_BUFFERS, user_sgpr_idx); + set_loc_shader_ptr(info, AC_UD_STREAMOUT_BUFFERS, user_sgpr_idx); } } static void -set_vs_specific_input_locs(struct radv_shader_args *args, gl_shader_stage stage, - bool has_previous_stage, gl_shader_stage previous_stage, - uint8_t *user_sgpr_idx) +set_vs_specific_input_locs(struct radv_shader_info *info, struct radv_shader_args *args, + gl_shader_stage stage, bool has_previous_stage, + gl_shader_stage previous_stage, uint8_t *user_sgpr_idx) { + if (args->prolog_inputs.used) + set_loc_shader(info, AC_UD_VS_PROLOG_INPUTS, user_sgpr_idx, 2); + if (!args->is_gs_copy_shader && (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) { - if (args->shader_info->vs.vb_desc_usage_mask) { - set_loc_shader_ptr(args, AC_UD_VS_VERTEX_BUFFERS, user_sgpr_idx); + if (args->ac.vertex_buffers.used) { + 
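The remap loop above is the interesting part of declare_ps_input_vgprs: every VGPR argument (which may span several registers) corresponds to one SPI_PS_INPUT enable bit, disabled inputs are marked skipped, and the survivors are packed densely. A self-contained restatement with invented types:

#include <stdbool.h>
#include <stdint.h>

struct sketch_arg {
   bool is_vgpr;
   bool skip;
   uint8_t size;   /* in registers */
   uint8_t offset; /* assigned register index within the VGPR file */
};

static void
sketch_remap_ps_vgprs(struct sketch_arg *args, unsigned arg_count,
                      uint32_t spi_ps_input)
{
   unsigned vgpr_arg = 0, vgpr_reg = 0;

   for (unsigned i = 0; i < arg_count; i++) {
      if (!args[i].is_vgpr)
         continue;

      if (!(spi_ps_input & (1u << vgpr_arg))) {
         /* Input disabled: LLVM would not allocate it, so skip it here too. */
         args[i].skip = true;
      } else {
         args[i].offset = (uint8_t)vgpr_reg;
         vgpr_reg += args[i].size;
      }
      vgpr_arg++;
   }
}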
set_loc_shader_ptr(info, AC_UD_VS_VERTEX_BUFFERS, user_sgpr_idx); } - unsigned vs_num = - count_vs_user_sgprs(args) - (args->shader_info->vs.vb_desc_usage_mask ? 1 : 0); - set_loc_shader(args, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, vs_num); - } -} - -static void -set_ngg_sgprs_locs(struct radv_shader_args *args, gl_shader_stage stage, uint8_t *user_sgpr_idx) -{ - if (stage == MESA_SHADER_GEOMETRY) { - assert(args->ngg_gs_state.used); - set_loc_shader(args, AC_UD_NGG_GS_STATE, user_sgpr_idx, 1); - } - - if (args->shader_info->has_ngg_culling) { - assert(args->ngg_culling_settings.used && - args->ngg_viewport_scale[0].used && args->ngg_viewport_scale[1].used && - args->ngg_viewport_translate[0].used && args->ngg_viewport_translate[1].used); - - set_loc_shader(args, AC_UD_NGG_CULLING_SETTINGS, user_sgpr_idx, 1); - set_loc_shader(args, AC_UD_NGG_VIEWPORT, user_sgpr_idx, 4); + unsigned vs_num = args->ac.base_vertex.used + args->ac.draw_id.used + + args->ac.start_instance.used; + set_loc_shader(info, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, vs_num); } } @@ -464,14 +513,17 @@ is_pre_gs_stage(gl_shader_stage stage) } void -radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, - bool has_previous_stage, gl_shader_stage previous_stage) +radv_declare_shader_args(const struct radv_nir_compiler_options *options, + struct radv_shader_info *info, gl_shader_stage stage, + bool has_previous_stage, gl_shader_stage previous_stage, + struct radv_shader_args *args) { struct user_sgpr_info user_sgpr_info; - bool needs_view_index = needs_view_index_sgpr(args, stage); + bool needs_view_index = needs_view_index_sgpr(options, info, stage); + bool has_api_gs = stage == MESA_SHADER_GEOMETRY; - if (args->options->chip_class >= GFX10) { - if (is_pre_gs_stage(stage) && args->options->key.vs_common_out.as_ngg) { + if (options->chip_class >= GFX10) { + if (is_pre_gs_stage(stage) && info->is_ngg) { /* On GFX10, VS is merged into GS for NGG. */ previous_stage = stage; stage = MESA_SHADER_GEOMETRY; @@ -480,70 +532,78 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, } for (int i = 0; i < MAX_SETS; i++) - args->shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; + info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; for (int i = 0; i < AC_UD_MAX_UD; i++) - args->shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; + info->user_sgprs_locs.shader_data[i].sgpr_idx = -1; - allocate_user_sgprs(args, stage, has_previous_stage, previous_stage, needs_view_index, - &user_sgpr_info); + allocate_user_sgprs(options, info, stage, has_previous_stage, previous_stage, needs_view_index, + has_api_gs, args->is_gs_copy_shader, &user_sgpr_info); - if (args->options->explicit_scratch_args) { + if (options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ring_offsets); } + /* To ensure prologs match the main VS, VS specific input SGPRs have to be placed before other + * sgprs. 
+ */ + switch (stage) { case MESA_SHADER_COMPUTE: - declare_global_input_sgprs(args, &user_sgpr_info); + declare_global_input_sgprs(info, &user_sgpr_info, args); - if (args->shader_info->cs.uses_sbt) { + if (info->cs.uses_sbt) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.sbt_descriptors); } - if (args->shader_info->cs.uses_grid_size) { + if (info->cs.uses_grid_size) { ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups); } + if (info->cs.uses_ray_launch_size) { + ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.ray_launch_size); + } + for (int i = 0; i < 3; i++) { - if (args->shader_info->cs.uses_block_id[i]) { + if (info->cs.uses_block_id[i]) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]); } } - if (args->shader_info->cs.uses_local_invocation_idx) { + if (info->cs.uses_local_invocation_idx) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size); } - if (args->options->explicit_scratch_args) { + if (options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.local_invocation_ids); break; case MESA_SHADER_VERTEX: - declare_global_input_sgprs(args, &user_sgpr_info); + /* NGG is handled by the GS case */ + assert(!info->is_ngg); - declare_vs_specific_input_sgprs(args, stage, has_previous_stage, previous_stage); + declare_vs_specific_input_sgprs(info, args, stage, has_previous_stage, previous_stage); + + declare_global_input_sgprs(info, &user_sgpr_info, args); if (needs_view_index) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index); } - if (args->options->key.vs_common_out.as_es) { + if (info->vs.as_es) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset); - } else if (args->options->key.vs_common_out.as_ls) { + } else if (info->vs.as_ls) { /* no extra parameters */ } else { - declare_streamout_sgprs(args, stage); + declare_streamout_sgprs(info, args, stage); } - if (args->options->explicit_scratch_args) { + if (options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } - if (args->options->key.vs_common_out.as_ngg) { - declare_ngg_sgprs(args, stage); - } - declare_vs_input_vgprs(args); + declare_vs_input_vgprs(options, info, args); break; case MESA_SHADER_TESS_CTRL: if (has_previous_stage) { @@ -556,9 +616,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown - declare_global_input_sgprs(args, &user_sgpr_info); + declare_vs_specific_input_sgprs(info, args, stage, has_previous_stage, previous_stage); - declare_vs_specific_input_sgprs(args, stage, has_previous_stage, previous_stage); + declare_global_input_sgprs(info, &user_sgpr_info, args); if (needs_view_index) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index); @@ -567,9 +627,9 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id); ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids); - declare_vs_input_vgprs(args); + declare_vs_input_vgprs(options, info, args); } else { - declare_global_input_sgprs(args, &user_sgpr_info); + declare_global_input_sgprs(info, &user_sgpr_info, args); if (needs_view_index) { 
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index); @@ -577,7 +637,7 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset); - if (args->options->explicit_scratch_args) { + if (options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id); @@ -585,31 +645,31 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, } break; case MESA_SHADER_TESS_EVAL: - declare_global_input_sgprs(args, &user_sgpr_info); + /* NGG is handled by the GS case */ + assert(!info->is_ngg); + + declare_global_input_sgprs(info, &user_sgpr_info, args); if (needs_view_index) ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index); - if (args->options->key.vs_common_out.as_es) { + if (info->tes.as_es) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset); } else { - declare_streamout_sgprs(args, stage); + declare_streamout_sgprs(info, args, stage); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset); } - if (args->options->explicit_scratch_args) { + if (options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } - if (args->options->key.vs_common_out.as_ngg) { - declare_ngg_sgprs(args, stage); - } declare_tes_input_vgprs(args); break; case MESA_SHADER_GEOMETRY: if (has_previous_stage) { // First 6 system regs - if (args->options->key.vs_common_out.as_ngg) { + if (info->is_ngg) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info); } else { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset); @@ -622,33 +682,33 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); // unknown - declare_global_input_sgprs(args, &user_sgpr_info); - if (previous_stage != MESA_SHADER_TESS_EVAL) { - declare_vs_specific_input_sgprs(args, stage, has_previous_stage, previous_stage); + declare_vs_specific_input_sgprs(info, args, stage, has_previous_stage, previous_stage); } + declare_global_input_sgprs(info, &user_sgpr_info, args); + if (needs_view_index) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index); } - if (args->options->key.vs_common_out.as_ngg) { - declare_ngg_sgprs(args, stage); + if (info->is_ngg) { + declare_ngg_sgprs(info, args, has_api_gs); } ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]); ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id); ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]); + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]); if (previous_stage == MESA_SHADER_VERTEX) { - declare_vs_input_vgprs(args); + declare_vs_input_vgprs(options, info, args); } 
else { declare_tes_input_vgprs(args); } } else { - declare_global_input_sgprs(args, &user_sgpr_info); + declare_global_input_sgprs(info, &user_sgpr_info, args); if (needs_view_index) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.view_index); @@ -656,7 +716,7 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id); - if (args->options->explicit_scratch_args) { + if (options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]); @@ -670,87 +730,80 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, } break; case MESA_SHADER_FRAGMENT: - declare_global_input_sgprs(args, &user_sgpr_info); + declare_global_input_sgprs(info, &user_sgpr_info, args); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask); - if (args->options->explicit_scratch_args) { + if (options->explicit_scratch_args) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset); } - ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample); - ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center); - ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid); - ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.pull_model); - ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample); - ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center); - ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); /* line stipple tex */ - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0]); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1]); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2]); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3]); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.sample_coverage); - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* fixed pt */ + + declare_ps_input_vgprs(info, args, options->remap_spi_ps_input); break; default: unreachable("Shader stage not implemented"); } - args->shader_info->num_input_vgprs = 0; - args->shader_info->num_input_sgprs = 2; - args->shader_info->num_input_sgprs += args->ac.num_sgprs_used; - args->shader_info->num_input_vgprs = args->ac.num_vgprs_used; + info->num_input_vgprs = 0; + info->num_input_sgprs = 2; + info->num_input_sgprs += args->ac.num_sgprs_used; + info->num_input_vgprs = args->ac.num_vgprs_used; uint8_t user_sgpr_idx = 0; - set_loc_shader_ptr(args, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx); + set_loc_shader_ptr(info, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_idx); /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including * the rw_buffers at s0/s1. 
With user SGPR0 = s8, lets restart the count from 0 */ if (has_previous_stage) user_sgpr_idx = 0; - set_global_input_locs(args, &user_sgpr_info, &user_sgpr_idx); + if (stage == MESA_SHADER_VERTEX || (has_previous_stage && previous_stage == MESA_SHADER_VERTEX)) + set_vs_specific_input_locs(info, args, stage, has_previous_stage, previous_stage, &user_sgpr_idx); + + set_global_input_locs(info, args, &user_sgpr_info, &user_sgpr_idx); switch (stage) { case MESA_SHADER_COMPUTE: - if (args->shader_info->cs.uses_sbt) { - set_loc_shader_ptr(args, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx); + if (args->ac.sbt_descriptors.used) { + set_loc_shader_ptr(info, AC_UD_CS_SBT_DESCRIPTORS, &user_sgpr_idx); } - if (args->shader_info->cs.uses_grid_size) { - set_loc_shader(args, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, 3); + if (args->ac.num_work_groups.used) { + set_loc_shader(info, AC_UD_CS_GRID_SIZE, &user_sgpr_idx, 3); + } + if (args->ac.ray_launch_size.used) { + set_loc_shader(info, AC_UD_CS_RAY_LAUNCH_SIZE, &user_sgpr_idx, 3); } break; case MESA_SHADER_VERTEX: - set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx); if (args->ac.view_index.used) - set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - if (args->options->key.vs_common_out.as_ngg) - set_ngg_sgprs_locs(args, stage, &user_sgpr_idx); + set_loc_shader(info, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); break; case MESA_SHADER_TESS_CTRL: - set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, &user_sgpr_idx); if (args->ac.view_index.used) - set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + set_loc_shader(info, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); break; case MESA_SHADER_TESS_EVAL: if (args->ac.view_index.used) - set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - if (args->options->key.vs_common_out.as_ngg) - set_ngg_sgprs_locs(args, stage, &user_sgpr_idx); + set_loc_shader(info, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); break; case MESA_SHADER_GEOMETRY: - if (has_previous_stage) { - if (previous_stage == MESA_SHADER_VERTEX) - set_vs_specific_input_locs(args, stage, has_previous_stage, previous_stage, - &user_sgpr_idx); - } if (args->ac.view_index.used) - set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); + set_loc_shader(info, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1); - if (args->options->key.vs_common_out.as_ngg) - set_ngg_sgprs_locs(args, stage, &user_sgpr_idx); + if (args->ngg_gs_state.used) { + set_loc_shader(info, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1); + } + + if (args->ngg_culling_settings.used) { + set_loc_shader(info, AC_UD_NGG_CULLING_SETTINGS, &user_sgpr_idx, 1); + } + + if (args->ngg_viewport_scale[0].used) { + assert(args->ngg_viewport_scale[1].used && + args->ngg_viewport_translate[0].used && + args->ngg_viewport_translate[1].used); + set_loc_shader(info, AC_UD_NGG_VIEWPORT, &user_sgpr_idx, 4); + } break; case MESA_SHADER_FRAGMENT: break; @@ -758,5 +811,5 @@ radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, unreachable("Shader stage not implemented"); } - args->shader_info->num_user_sgprs = user_sgpr_idx; + info->num_user_sgprs = user_sgpr_idx; } diff --git a/mesa 3D driver/src/amd/vulkan/radv_shader_args.h b/mesa 3D driver/src/amd/vulkan/radv_shader_args.h index a7c13152fc..6aa98a61d8 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_shader_args.h +++ b/mesa 3D driver/src/amd/vulkan/radv_shader_args.h @@ -30,8 +30,6 @@ struct radv_shader_args { struct ac_shader_args ac; - struct radv_shader_info *shader_info; - const struct 
radv_nir_compiler_options *options; struct ac_arg descriptor_sets[MAX_SETS]; struct ac_arg ring_offsets; @@ -45,6 +43,9 @@ struct radv_shader_args { struct ac_arg ngg_viewport_scale[2]; struct ac_arg ngg_viewport_translate[2]; + struct ac_arg prolog_inputs; + struct ac_arg vs_inputs[MAX_VERTEX_ATTRIBS]; + bool is_gs_copy_shader; bool is_trap_handler_shader; }; @@ -55,5 +56,10 @@ radv_shader_args_from_ac(struct ac_shader_args *args) return container_of(args, struct radv_shader_args, ac); } -void radv_declare_shader_args(struct radv_shader_args *args, gl_shader_stage stage, - bool has_previous_stage, gl_shader_stage previous_stage); +struct radv_nir_compiler_options; +struct radv_shader_info; + +void radv_declare_shader_args(const struct radv_nir_compiler_options *options, + struct radv_shader_info *info, gl_shader_stage stage, + bool has_previous_stage, gl_shader_stage previous_stage, + struct radv_shader_args *args); diff --git a/mesa 3D driver/src/amd/vulkan/radv_shader_info.c b/mesa 3D driver/src/amd/vulkan/radv_shader_info.c index 428f51823b..6fe4083839 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_shader_info.c +++ b/mesa 3D driver/src/amd/vulkan/radv_shader_info.c @@ -25,6 +25,8 @@ #include "radv_private.h" #include "radv_shader.h" +#include "ac_exp_param.h" + static void mark_sampler_desc(const nir_variable *var, struct radv_shader_info *info) { @@ -128,27 +130,42 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, switch (instr->intrinsic) { case nir_intrinsic_load_barycentric_sample: case nir_intrinsic_load_barycentric_pixel: - case nir_intrinsic_load_barycentric_centroid: { + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { enum glsl_interp_mode mode = nir_intrinsic_interp_mode(instr); switch (mode) { - case INTERP_MODE_NONE: case INTERP_MODE_SMOOTH: + case INTERP_MODE_NONE: + if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel || + instr->intrinsic == nir_intrinsic_load_barycentric_at_sample || + instr->intrinsic == nir_intrinsic_load_barycentric_at_offset) + info->ps.reads_persp_center = true; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) + info->ps.reads_persp_centroid = true; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) + info->ps.reads_persp_sample = true; + break; case INTERP_MODE_NOPERSPECTIVE: - info->ps.uses_persp_or_linear_interp = true; + if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel || + instr->intrinsic == nir_intrinsic_load_barycentric_at_sample || + instr->intrinsic == nir_intrinsic_load_barycentric_at_offset) + info->ps.reads_linear_center = true; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid) + info->ps.reads_linear_centroid = true; + else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample) + info->ps.reads_linear_sample = true; break; default: break; } - break; - } - case nir_intrinsic_load_barycentric_at_offset: - case nir_intrinsic_load_barycentric_at_sample: - if (nir_intrinsic_interp_mode(instr) != INTERP_MODE_FLAT) - info->ps.uses_persp_or_linear_interp = true; - if (instr->intrinsic == nir_intrinsic_load_barycentric_at_sample) info->ps.needs_sample_positions = true; break; + } + case nir_intrinsic_load_barycentric_model: + info->ps.reads_barycentric_model = true; + break; case nir_intrinsic_load_draw_id: info->vs.needs_draw_id = true; break; @@ -161,6 +178,9 @@ gather_intrinsic_info(const nir_shader *nir, const 
nir_intrinsic_instr *instr, case nir_intrinsic_load_num_workgroups: info->cs.uses_grid_size = true; break; + case nir_intrinsic_load_ray_launch_size: + info->cs.uses_ray_launch_size = true; + break; case nir_intrinsic_load_local_invocation_id: case nir_intrinsic_load_workgroup_id: { unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); @@ -182,14 +202,23 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, case nir_intrinsic_load_sample_mask_in: info->ps.reads_sample_mask_in = true; break; - case nir_intrinsic_load_view_index: - info->needs_multiview_view_index = true; - if (nir->info.stage == MESA_SHADER_FRAGMENT) - info->ps.layer_input = true; + case nir_intrinsic_load_sample_id: + info->ps.reads_sample_id = true; break; - case nir_intrinsic_load_layer_id: - if (nir->info.stage == MESA_SHADER_FRAGMENT) - info->ps.layer_input = true; + case nir_intrinsic_load_frag_shading_rate: + info->ps.reads_frag_shading_rate = true; + break; + case nir_intrinsic_load_front_face: + info->ps.reads_front_face = true; + break; + case nir_intrinsic_load_frag_coord: + info->ps.reads_frag_coord_mask = nir_ssa_def_components_read(&instr->dest.ssa); + break; + case nir_intrinsic_load_sample_pos: + info->ps.reads_sample_pos_mask = nir_ssa_def_components_read(&instr->dest.ssa); + break; + case nir_intrinsic_load_view_index: + info->uses_view_index = true; break; case nir_intrinsic_load_invocation_id: info->uses_invocation_id = true; @@ -218,7 +247,8 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr, case nir_intrinsic_image_deref_atomic_comp_swap: case nir_intrinsic_image_deref_atomic_fmin: case nir_intrinsic_image_deref_atomic_fmax: - case nir_intrinsic_image_deref_size: { + case nir_intrinsic_image_deref_size: + case nir_intrinsic_image_deref_samples: { nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); mark_sampler_desc(var, info); @@ -318,7 +348,7 @@ gather_info_block(const nir_shader *nir, const nir_block *block, struct radv_sha static void gather_info_input_decl_vs(const nir_shader *nir, const nir_variable *var, - struct radv_shader_info *info, const struct radv_shader_variant_key *key) + const struct radv_pipeline_key *key, struct radv_shader_info *info) { unsigned attrib_count = glsl_count_attribute_slots(var->type, true); @@ -406,11 +436,11 @@ gather_info_input_decl_ps(const nir_shader *nir, const nir_variable *var, static void gather_info_input_decl(const nir_shader *nir, const nir_variable *var, - struct radv_shader_info *info, const struct radv_shader_variant_key *key) + const struct radv_pipeline_key *key, struct radv_shader_info *info) { switch (nir->info.stage) { case MESA_SHADER_VERTEX: - gather_info_input_decl_vs(nir, var, info, key); + gather_info_input_decl_vs(nir, var, key, info); break; case MESA_SHADER_FRAGMENT: gather_info_input_decl_ps(nir, var, info); @@ -456,31 +486,45 @@ gather_info_output_decl_gs(const nir_shader *nir, const nir_variable *var, info->gs.output_streams[idx] = stream; } +static struct radv_vs_output_info * +get_vs_output_info(const nir_shader *nir, struct radv_shader_info *info) +{ + + switch (nir->info.stage) { + case MESA_SHADER_VERTEX: + if (!info->vs.as_ls && !info->vs.as_es) + return &info->vs.outinfo; + break; + case MESA_SHADER_GEOMETRY: + return &info->vs.outinfo; + break; + case MESA_SHADER_TESS_EVAL: + if (!info->tes.as_es) + return &info->tes.outinfo; + break; + default: + break; + } + + return NULL; +} + static void 
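The expanded barycentric handling above is a two-axis classification: interpolation mode picks perspective versus linear, and the intrinsic picks the center/centroid/sample slot, with at_sample/at_offset folded into the center slot. A compact sketch of the same mapping (enum ordering is my own; the real code switches on NIR intrinsics directly):

#include <stdbool.h>

enum sketch_bary { SKETCH_BARY_PIXEL, SKETCH_BARY_CENTROID, SKETCH_BARY_SAMPLE };

struct sketch_ps_reads {
   bool persp_center, persp_centroid, persp_sample;
   bool linear_center, linear_centroid, linear_sample;
};

/* SMOOTH/NONE count as perspective, NOPERSPECTIVE as linear. */
static void
sketch_classify_barycentric(struct sketch_ps_reads *reads,
                            enum sketch_bary kind, bool noperspective)
{
   bool *slot[2][3] = {
      {&reads->persp_center, &reads->persp_centroid, &reads->persp_sample},
      {&reads->linear_center, &reads->linear_centroid, &reads->linear_sample},
   };
   *slot[noperspective ? 1 : 0][kind] = true;
}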
gather_info_output_decl(const nir_shader *nir, const nir_variable *var, - struct radv_shader_info *info, const struct radv_shader_variant_key *key) + struct radv_shader_info *info) { - struct radv_vs_output_info *vs_info = NULL; + struct radv_vs_output_info *vs_info = get_vs_output_info(nir, info); switch (nir->info.stage) { case MESA_SHADER_FRAGMENT: gather_info_output_decl_ps(nir, var, info); break; case MESA_SHADER_VERTEX: - if (!key->vs_common_out.as_ls && !key->vs_common_out.as_es) - vs_info = &info->vs.outinfo; - - /* TODO: Adjust as_ls/as_nng. */ - if (!key->vs_common_out.as_ls && key->vs_common_out.as_ngg) - gather_info_output_decl_gs(nir, var, info); break; case MESA_SHADER_GEOMETRY: - vs_info = &info->vs.outinfo; gather_info_output_decl_gs(nir, var, info); break; case MESA_SHADER_TESS_EVAL: - if (!key->vs_common_out.as_es) - vs_info = &info->tes.outinfo; break; default: break; @@ -553,7 +597,8 @@ radv_nir_shader_info_init(struct radv_shader_info *info) void radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *nir, const struct radv_pipeline_layout *layout, - const struct radv_shader_variant_key *key, struct radv_shader_info *info) + const struct radv_pipeline_key *pipeline_key, + struct radv_shader_info *info) { struct nir_function *func = (struct nir_function *)exec_list_get_head_const(&nir->functions); @@ -564,44 +609,38 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n } if (nir->info.stage == MESA_SHADER_VERTEX) { + if (pipeline_key->vs.dynamic_input_state && nir->info.inputs_read) { + info->vs.has_prolog = true; + info->vs.dynamic_inputs = true; + } + /* Use per-attribute vertex descriptors to prevent faults and * for correct bounds checking. */ - info->vs.use_per_attribute_vb_descs = device->robust_buffer_access; + info->vs.use_per_attribute_vb_descs = device->robust_buffer_access || info->vs.dynamic_inputs; } + /* We have to ensure consistent input register assignments between the main shader and the + * prolog. */ + info->vs.needs_instance_id |= info->vs.has_prolog; + info->vs.needs_base_instance |= info->vs.has_prolog; + info->vs.needs_draw_id |= info->vs.has_prolog; + nir_foreach_shader_in_variable (variable, nir) - gather_info_input_decl(nir, variable, info, key); + gather_info_input_decl(nir, variable, pipeline_key, info); nir_foreach_block (block, func->impl) { gather_info_block(nir, block, info); } - nir_foreach_shader_out_variable(variable, nir) gather_info_output_decl(nir, variable, info, key); + nir_foreach_shader_out_variable(variable, nir) gather_info_output_decl(nir, variable, info); if (nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL || nir->info.stage == MESA_SHADER_GEOMETRY) gather_xfb_info(nir, info); - /* Make sure to export the LayerID if the fragment shader needs it. */ - if (key->vs_common_out.export_layer_id) { - switch (nir->info.stage) { - case MESA_SHADER_VERTEX: - info->vs.output_usage_mask[VARYING_SLOT_LAYER] |= 0x1; - break; - case MESA_SHADER_TESS_EVAL: - info->tes.output_usage_mask[VARYING_SLOT_LAYER] |= 0x1; - break; - case MESA_SHADER_GEOMETRY: - info->gs.output_usage_mask[VARYING_SLOT_LAYER] |= 0x1; - break; - default: - break; - } - } - /* Make sure to export the LayerID if the subpass has multiviews. 
*/ - if (key->has_multiview_view_index) { + if (pipeline_key->has_multiview_view_index) { switch (nir->info.stage) { case MESA_SHADER_VERTEX: info->vs.outinfo.writes_layer = true; @@ -617,37 +656,51 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n } } - /* Make sure to export the PrimitiveID if the fragment shader needs it. */ - if (key->vs_common_out.export_prim_id) { - switch (nir->info.stage) { - case MESA_SHADER_VERTEX: - info->vs.outinfo.export_prim_id = true; - break; - case MESA_SHADER_TESS_EVAL: - info->tes.outinfo.export_prim_id = true; - break; - case MESA_SHADER_GEOMETRY: - info->vs.outinfo.export_prim_id = true; - break; - default: - break; - } - } + struct radv_vs_output_info *outinfo = get_vs_output_info(nir, info); + if (outinfo) { + bool writes_primitive_shading_rate = + outinfo->writes_primitive_shading_rate || device->force_vrs != RADV_FORCE_VRS_NONE; + int pos_written = 0x1; - /* Make sure to export the ViewportIndex if the fragment shader needs it. */ - if (key->vs_common_out.export_viewport_index) { - switch (nir->info.stage) { - case MESA_SHADER_VERTEX: - info->vs.output_usage_mask[VARYING_SLOT_VIEWPORT] |= 0x1; - break; - case MESA_SHADER_TESS_EVAL: - info->tes.output_usage_mask[VARYING_SLOT_VIEWPORT] |= 0x1; - break; - case MESA_SHADER_GEOMETRY: - info->gs.output_usage_mask[VARYING_SLOT_VIEWPORT] |= 0x1; - break; - default: - break; + if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer || + writes_primitive_shading_rate) + pos_written |= 1 << 1; + + unsigned num_clip_distances = util_bitcount(outinfo->clip_dist_mask); + unsigned num_cull_distances = util_bitcount(outinfo->cull_dist_mask); + + if (num_clip_distances + num_cull_distances > 0) + pos_written |= 1 << 2; + if (num_clip_distances + num_cull_distances > 4) + pos_written |= 1 << 3; + + outinfo->pos_exports = util_bitcount(pos_written); + + memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, + sizeof(outinfo->vs_output_param_offset)); + outinfo->param_exports = 0; + + uint64_t mask = nir->info.outputs_written; + while (mask) { + int idx = u_bit_scan64(&mask); + if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || + idx == VARYING_SLOT_PRIMITIVE_ID || idx == VARYING_SLOT_VIEWPORT || + ((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && + outinfo->export_clip_dists)) { + if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED) + outinfo->vs_output_param_offset[idx] = outinfo->param_exports++; + } + } + if (outinfo->writes_layer && + outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) { + /* when ctx->options->key.has_multiview_view_index = true, the layer + * variable isn't declared in NIR and it's isel's job to get the layer */ + outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++; + } + + if (outinfo->export_prim_id) { + assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED); + outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++; } } @@ -677,20 +730,11 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n info->tes.spacing = nir->info.tess.spacing; info->tes.ccw = nir->info.tess.ccw; info->tes.point_mode = nir->info.tess.point_mode; - info->tes.as_es = key->vs_common_out.as_es; - info->tes.export_prim_id = key->vs_common_out.export_prim_id; - info->is_ngg = key->vs_common_out.as_ngg; - info->is_ngg_passthrough = 
key->vs_common_out.as_ngg_passthrough; break; case MESA_SHADER_TESS_CTRL: info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out; break; case MESA_SHADER_VERTEX: - info->vs.as_es = key->vs_common_out.as_es; - info->vs.as_ls = key->vs_common_out.as_ls; - info->vs.export_prim_id = key->vs_common_out.export_prim_id; - info->is_ngg = key->vs_common_out.as_ngg; - info->is_ngg_passthrough = key->vs_common_out.as_ngg_passthrough; break; default: break; @@ -704,8 +748,8 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n } /* Compute the ESGS item size for VS or TES as ES. */ - if ((nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL) && - key->vs_common_out.as_es) { + if ((nir->info.stage == MESA_SHADER_VERTEX && info->vs.as_es) || + (nir->info.stage == MESA_SHADER_TESS_EVAL && info->tes.as_es)) { struct radv_es_output_info *es_info = nir->info.stage == MESA_SHADER_VERTEX ? &info->vs.es_info : &info->tes.es_info; uint32_t num_outputs_written = nir->info.stage == MESA_SHADER_VERTEX @@ -714,11 +758,16 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n es_info->esgs_itemsize = num_outputs_written * 16; } - info->float_controls_mode = nir->info.float_controls_execution_mode; - if (nir->info.stage == MESA_SHADER_FRAGMENT) { + bool uses_persp_or_linear_interp = info->ps.reads_persp_center || + info->ps.reads_persp_centroid || + info->ps.reads_persp_sample || + info->ps.reads_linear_center || + info->ps.reads_linear_centroid || + info->ps.reads_linear_sample; + info->ps.allow_flat_shading = - !(info->ps.uses_persp_or_linear_interp || info->ps.needs_sample_positions || + !(uses_persp_or_linear_interp || info->ps.needs_sample_positions || info->ps.writes_memory || nir->info.fs.needs_quad_helper_invocations || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_POINT_COORD) || @@ -726,5 +775,7 @@ radv_nir_shader_info_pass(struct radv_device *device, const struct nir_shader *n BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION)); + + info->ps.spi_ps_input = radv_compute_spi_ps_input(device, info); } } diff --git a/mesa 3D driver/src/amd/vulkan/radv_sqtt.c b/mesa 3D driver/src/amd/vulkan/radv_sqtt.c index bd874cb9d2..71c489910a 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_sqtt.c +++ b/mesa 3D driver/src/amd/vulkan/radv_sqtt.c @@ -29,6 +29,12 @@ #define SQTT_BUFFER_ALIGN_SHIFT 12 +bool +radv_is_instruction_timing_enabled(void) +{ + return getenv("RADV_THREAD_TRACE_PIPELINE"); +} + static bool radv_se_is_disabled(struct radv_device *device, unsigned se) { @@ -36,6 +42,21 @@ radv_se_is_disabled(struct radv_device *device, unsigned se) return device->physical_device->rad_info.cu_mask[se][0] == 0; } +static uint32_t +gfx10_get_thread_trace_ctrl(struct radv_device *device, bool enable) +{ + uint32_t thread_trace_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | + S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | /* 4096 clk */ + S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) | + S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) | + S_008D1C_REG_DROP_ON_STALL(0); + + if (device->physical_device->rad_info.chip_class == GFX10_3) + thread_trace_ctrl |= S_008D1C_LOWATER_OFFSET(4); + + return thread_trace_ctrl; +} + static void radv_emit_thread_trace_start(struct radv_device 
*device, struct radeon_cmdbuf *cs, uint32_t queue_family_index) @@ -44,8 +65,6 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c struct radeon_info *rad_info = &device->physical_device->rad_info; unsigned max_se = rad_info->max_se; - assert(device->physical_device->rad_info.chip_class >= GFX8); - for (unsigned se = 0; se < max_se; se++) { uint64_t va = radv_buffer_get_va(device->thread_trace.bo); uint64_t data_va = ac_thread_trace_get_data_va(rad_info, &device->thread_trace, va, se); @@ -77,25 +96,25 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG); - /* Performance counters with SQTT are considered - * deprecated. - */ - thread_trace_token_mask |= S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF); + /* Performance counters with SQTT are considered deprecated. */ + uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF; + + if (!radv_is_instruction_timing_enabled()) { + /* Reduce SQTT traffic when instruction timing isn't enabled. */ + token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | + V_008D18_TOKEN_EXCLUDE_ALUEXEC | + V_008D18_TOKEN_EXCLUDE_VALUINST | + V_008D18_TOKEN_EXCLUDE_IMMEDIATE | + V_008D18_TOKEN_EXCLUDE_INST; + } + thread_trace_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude); radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, thread_trace_token_mask); - uint32_t thread_trace_ctrl = S_008D1C_MODE(1) | S_008D1C_HIWATER(5) | - S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | /* 4096 clk */ - S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) | - S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) | - S_008D1C_REG_DROP_ON_STALL(0); - - if (device->physical_device->rad_info.chip_class == GFX10_3) - thread_trace_ctrl |= S_008D1C_LOWATER_OFFSET(4); - /* Should be emitted last (it enables thread traces). */ - radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, thread_trace_ctrl); + radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, + gfx10_get_thread_trace_ctrl(device, true)); } else { /* Order seems important for the following 4 registers. */ radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2, @@ -222,8 +241,6 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs { unsigned max_se = device->physical_device->rad_info.max_se; - assert(device->physical_device->rad_info.chip_class >= GFX8); - /* Stop the thread trace with a different event based on the queue. */ if (queue_family_index == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= GFX7) { @@ -258,7 +275,8 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs radeon_emit(cs, 4); /* poll interval */ /* Disable the thread trace mode. */ - radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0)); + radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, + gfx10_get_thread_trace_ctrl(device, false)); /* Wait for thread trace completion. */ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); @@ -369,6 +387,7 @@ radv_thread_trace_init_bo(struct radv_device *device) { unsigned max_se = device->physical_device->rad_info.max_se; struct radeon_winsys *ws = device->ws; + VkResult result; uint64_t size; /* The buffer size and address need to be aligned in HW regs. 
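 *
 * As a rough sketch of the sizing done below (align64 and the exact first
 * expression are assumptions, only the second line is visible in this hunk):
 *
 *    size = align64(sizeof(struct ac_thread_trace_info) * max_se,
 *                   1ull << SQTT_BUFFER_ALIGN_SHIFT);
 *    size += device->thread_trace.buffer_size * (uint64_t)max_se;
 *
 * so the per-SE info headers come first, then one data buffer per SE.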
Align the @@ -382,7 +401,7 @@ radv_thread_trace_init_bo(struct radv_device *device) size += device->thread_trace.buffer_size * (uint64_t)max_se; struct radeon_winsys_bo *bo = NULL; - VkResult result = ws->buffer_create( + result = ws->buffer_create( ws, size, 4096, RADEON_DOMAIN_VRAM, RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM, RADV_BO_PRIORITY_SCRATCH, 0, &bo); @@ -390,6 +409,10 @@ radv_thread_trace_init_bo(struct radv_device *device) if (result != VK_SUCCESS) return false; + result = ws->buffer_make_resident(ws, device->thread_trace.bo, true); + if (result != VK_SUCCESS) + return false; + device->thread_trace.ptr = ws->buffer_map(device->thread_trace.bo); if (!device->thread_trace.ptr) return false; @@ -397,6 +420,17 @@ radv_thread_trace_init_bo(struct radv_device *device) return true; } +static void +radv_thread_trace_finish_bo(struct radv_device *device) +{ + struct radeon_winsys *ws = device->ws; + + if (unlikely(device->thread_trace.bo)) { + ws->buffer_make_resident(ws, device->thread_trace.bo, false); + ws->buffer_destroy(ws, device->thread_trace.bo); + } +} + bool radv_thread_trace_init(struct radv_device *device) { @@ -432,8 +466,7 @@ radv_thread_trace_finish(struct radv_device *device) struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; struct radeon_winsys *ws = device->ws; - if (unlikely(device->thread_trace.bo)) - ws->buffer_destroy(ws, device->thread_trace.bo); + radv_thread_trace_finish_bo(device); for (unsigned i = 0; i < 2; i++) { if (device->thread_trace.start_cs[i]) @@ -455,10 +488,8 @@ radv_thread_trace_finish(struct radv_device *device) static bool radv_thread_trace_resize_bo(struct radv_device *device) { - struct radeon_winsys *ws = device->ws; - /* Destroy the previous thread trace BO. */ - ws->buffer_destroy(ws, device->thread_trace.bo); + radv_thread_trace_finish_bo(device); /* Double the size of the thread trace buffer per SE. */ device->thread_trace.buffer_size *= 2; @@ -476,7 +507,7 @@ bool radv_begin_thread_trace(struct radv_queue *queue) { struct radv_device *device = queue->device; - int family = queue->queue_family_index; + int family = queue->vk.queue_family_index; struct radeon_winsys *ws = device->ws; struct radeon_cmdbuf *cs; VkResult result; @@ -532,7 +563,7 @@ bool radv_end_thread_trace(struct radv_queue *queue) { struct radv_device *device = queue->device; - int family = queue->queue_family_index; + int family = queue->vk.queue_family_index; struct radeon_winsys *ws = device->ws; struct radeon_cmdbuf *cs; VkResult result; diff --git a/mesa 3D driver/src/amd/vulkan/radv_util.c b/mesa 3D driver/src/amd/vulkan/radv_util.c index 153f65dd90..b8d3a22493 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_util.c +++ b/mesa 3D driver/src/amd/vulkan/radv_util.c @@ -71,60 +71,3 @@ radv_logi_v(const char *format, va_list va) vfprintf(stderr, format, va); fprintf(stderr, "\n"); } - -void radv_printflike(3, 4) __radv_finishme(const char *file, int line, const char *format, ...) 
-{ - va_list ap; - char buffer[256]; - - va_start(ap, format); - vsnprintf(buffer, sizeof(buffer), format, ap); - va_end(ap); - - fprintf(stderr, "%s:%d: FINISHME: %s\n", file, line, buffer); -} - -VkResult -__vk_errorv(struct radv_instance *instance, const void *object, VkDebugReportObjectTypeEXT type, - VkResult error, const char *file, int line, const char *format, va_list ap) -{ - char buffer[256]; - char report[512]; - - const char *error_str = vk_Result_to_str(error); - -#ifndef DEBUG - if (instance && !(instance->debug_flags & RADV_DEBUG_ERRORS)) - return error; -#endif - - if (format) { - vsnprintf(buffer, sizeof(buffer), format, ap); - - snprintf(report, sizeof(report), "%s:%d: %s (%s)", file, line, buffer, error_str); - } else { - snprintf(report, sizeof(report), "%s:%d: %s", file, line, error_str); - } - - if (instance) { - vk_debug_report(&instance->vk, VK_DEBUG_REPORT_ERROR_BIT_EXT, object, line, 0, "radv", - report); - } - - fprintf(stderr, "%s\n", report); - - return error; -} - -VkResult -__vk_errorf(struct radv_instance *instance, const void *object, VkDebugReportObjectTypeEXT type, - VkResult error, const char *file, int line, const char *format, ...) -{ - va_list ap; - - va_start(ap, format); - __vk_errorv(instance, object, type, error, file, line, format, ap); - va_end(ap); - - return error; -} diff --git a/mesa 3D driver/src/amd/vulkan/radv_wsi.c b/mesa 3D driver/src/amd/vulkan/radv_wsi.c index 8e9d70b56c..a90bceeffe 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_wsi.c +++ b/mesa 3D driver/src/amd/vulkan/radv_wsi.c @@ -59,153 +59,19 @@ radv_init_wsi(struct radv_physical_device *physical_device) physical_device->wsi_device.supports_modifiers = physical_device->rad_info.chip_class >= GFX9; physical_device->wsi_device.set_memory_ownership = radv_wsi_set_memory_ownership; + + physical_device->vk.wsi_device = &physical_device->wsi_device; + return VK_SUCCESS; } void radv_finish_wsi(struct radv_physical_device *physical_device) { + physical_device->vk.wsi_device = NULL; wsi_device_finish(&physical_device->wsi_device, &physical_device->instance->vk.alloc); } -void -radv_DestroySurfaceKHR(VkInstance _instance, VkSurfaceKHR _surface, - const VkAllocationCallbacks *pAllocator) -{ - RADV_FROM_HANDLE(radv_instance, instance, _instance); - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface); - - vk_free2(&instance->vk.alloc, pAllocator, surface); -} - -VkResult -radv_GetPhysicalDeviceSurfaceSupportKHR(VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, - VkSurfaceKHR surface, VkBool32 *pSupported) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_support(&device->wsi_device, queueFamilyIndex, surface, - pSupported); -} - -VkResult -radv_GetPhysicalDeviceSurfaceCapabilitiesKHR(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, - VkSurfaceCapabilitiesKHR *pSurfaceCapabilities) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_capabilities(&device->wsi_device, surface, pSurfaceCapabilities); -} - -VkResult -radv_GetPhysicalDeviceSurfaceCapabilities2KHR(VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, - VkSurfaceCapabilities2KHR *pSurfaceCapabilities) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_capabilities2(&device->wsi_device, pSurfaceInfo, - pSurfaceCapabilities); -} - -VkResult -radv_GetPhysicalDeviceSurfaceCapabilities2EXT(VkPhysicalDevice physicalDevice, 
VkSurfaceKHR surface, - VkSurfaceCapabilities2EXT *pSurfaceCapabilities) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_capabilities2ext(&device->wsi_device, surface, - pSurfaceCapabilities); -} - -VkResult -radv_GetPhysicalDeviceSurfaceFormatsKHR(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, - uint32_t *pSurfaceFormatCount, - VkSurfaceFormatKHR *pSurfaceFormats) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats(&device->wsi_device, surface, pSurfaceFormatCount, - pSurfaceFormats); -} - -VkResult -radv_GetPhysicalDeviceSurfaceFormats2KHR(VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, - uint32_t *pSurfaceFormatCount, - VkSurfaceFormat2KHR *pSurfaceFormats) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo, pSurfaceFormatCount, - pSurfaceFormats); -} - -VkResult -radv_GetPhysicalDeviceSurfacePresentModesKHR(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, - uint32_t *pPresentModeCount, - VkPresentModeKHR *pPresentModes) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_present_modes(&device->wsi_device, surface, pPresentModeCount, - pPresentModes); -} - -VkResult -radv_CreateSwapchainKHR(VkDevice _device, const VkSwapchainCreateInfoKHR *pCreateInfo, - const VkAllocationCallbacks *pAllocator, VkSwapchainKHR *pSwapchain) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - const VkAllocationCallbacks *alloc; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - return wsi_common_create_swapchain(&device->physical_device->wsi_device, - radv_device_to_handle(device), pCreateInfo, alloc, - pSwapchain); -} - -void -radv_DestroySwapchainKHR(VkDevice _device, VkSwapchainKHR swapchain, - const VkAllocationCallbacks *pAllocator) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - const VkAllocationCallbacks *alloc; - - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - wsi_common_destroy_swapchain(_device, swapchain, alloc); -} - -VkResult -radv_GetSwapchainImagesKHR(VkDevice device, VkSwapchainKHR swapchain, - uint32_t *pSwapchainImageCount, VkImage *pSwapchainImages) -{ - return wsi_common_get_images(swapchain, pSwapchainImageCount, pSwapchainImages); -} - -VkResult -radv_AcquireNextImageKHR(VkDevice device, VkSwapchainKHR swapchain, uint64_t timeout, - VkSemaphore semaphore, VkFence fence, uint32_t *pImageIndex) -{ - VkAcquireNextImageInfoKHR acquire_info = { - .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR, - .swapchain = swapchain, - .timeout = timeout, - .semaphore = semaphore, - .fence = fence, - .deviceMask = 0, - }; - - return radv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex); -} - VkResult radv_AcquireNextImage2KHR(VkDevice _device, const VkAcquireNextImageInfoKHR *pAcquireInfo, uint32_t *pImageIndex) @@ -252,34 +118,5 @@ radv_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo) RADV_FROM_HANDLE(radv_queue, queue, _queue); return wsi_common_queue_present(&queue->device->physical_device->wsi_device, radv_device_to_handle(queue->device), _queue, - queue->queue_family_index, pPresentInfo); -} - -VkResult -radv_GetDeviceGroupPresentCapabilitiesKHR(VkDevice device, - VkDeviceGroupPresentCapabilitiesKHR *pCapabilities) -{ - memset(pCapabilities->presentMask, 0, 
sizeof(pCapabilities->presentMask)); - pCapabilities->presentMask[0] = 0x1; - pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VkResult -radv_GetDeviceGroupSurfacePresentModesKHR(VkDevice device, VkSurfaceKHR surface, - VkDeviceGroupPresentModeFlagsKHR *pModes) -{ - *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VkResult -radv_GetPhysicalDevicePresentRectanglesKHR(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, - uint32_t *pRectCount, VkRect2D *pRects) -{ - RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice); - - return wsi_common_get_present_rectangles(&device->wsi_device, surface, pRectCount, pRects); + queue->vk.queue_family_index, pPresentInfo); } diff --git a/mesa 3D driver/src/amd/vulkan/radv_wsi_display.c b/mesa 3D driver/src/amd/vulkan/radv_wsi_display.c index 3f1a41aafd..a84d82a8cf 100644 --- a/mesa 3D driver/src/amd/vulkan/radv_wsi_display.c +++ b/mesa 3D driver/src/amd/vulkan/radv_wsi_display.c @@ -41,168 +41,8 @@ #define MM_PER_PIXEL (1.0 / 96.0 * 25.4) -VkResult -radv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPropertiesKHR *properties) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties(physical_device, &pdevice->wsi_device, - property_count, properties); -} - -VkResult -radv_GetPhysicalDeviceDisplayProperties2KHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayProperties2KHR *properties) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties2(physical_device, &pdevice->wsi_device, - property_count, properties); -} - -VkResult -radv_GetPhysicalDeviceDisplayPlanePropertiesKHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPlanePropertiesKHR *properties) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties( - physical_device, &pdevice->wsi_device, property_count, properties); -} - -VkResult -radv_GetPhysicalDeviceDisplayPlaneProperties2KHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPlaneProperties2KHR *properties) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties2( - physical_device, &pdevice->wsi_device, property_count, properties); -} - -VkResult -radv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device, uint32_t plane_index, - uint32_t *display_count, VkDisplayKHR *displays) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_plane_supported_displays(physical_device, &pdevice->wsi_device, - plane_index, display_count, displays); -} - -VkResult -radv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device, VkDisplayKHR display, - uint32_t *property_count, VkDisplayModePropertiesKHR *properties) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_display_get_display_mode_properties(physical_device, &pdevice->wsi_device, display, - property_count, properties); -} - -VkResult -radv_GetDisplayModeProperties2KHR(VkPhysicalDevice physical_device, VkDisplayKHR display, - uint32_t *property_count, VkDisplayModeProperties2KHR *properties) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, 
physical_device); - - return wsi_display_get_display_mode_properties2(physical_device, &pdevice->wsi_device, display, - property_count, properties); -} - -VkResult -radv_CreateDisplayModeKHR(VkPhysicalDevice physical_device, VkDisplayKHR display, - const VkDisplayModeCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, VkDisplayModeKHR *mode) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_display_create_display_mode(physical_device, &pdevice->wsi_device, display, - create_info, allocator, mode); -} - -VkResult -radv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device, VkDisplayModeKHR mode_khr, - uint32_t plane_index, - VkDisplayPlaneCapabilitiesKHR *capabilities) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_get_display_plane_capabilities(physical_device, &pdevice->wsi_device, mode_khr, - plane_index, capabilities); -} - -VkResult -radv_GetDisplayPlaneCapabilities2KHR(VkPhysicalDevice physical_device, - const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo, - VkDisplayPlaneCapabilities2KHR *capabilities) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_get_display_plane_capabilities2(physical_device, &pdevice->wsi_device, - pDisplayPlaneInfo, capabilities); -} - -VkResult -radv_CreateDisplayPlaneSurfaceKHR(VkInstance _instance, - const VkDisplaySurfaceCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, VkSurfaceKHR *surface) -{ - RADV_FROM_HANDLE(radv_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - - if (allocator) - alloc = allocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_display_surface(_instance, alloc, create_info, surface); -} - -VkResult -radv_ReleaseDisplayEXT(VkPhysicalDevice physical_device, VkDisplayKHR display) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_release_display(physical_device, &pdevice->wsi_device, display); -} - -#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT -VkResult -radv_AcquireXlibDisplayEXT(VkPhysicalDevice physical_device, Display *dpy, VkDisplayKHR display) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_acquire_xlib_display(physical_device, &pdevice->wsi_device, dpy, display); -} - -VkResult -radv_GetRandROutputDisplayEXT(VkPhysicalDevice physical_device, Display *dpy, RROutput output, - VkDisplayKHR *display) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device); - - return wsi_get_randr_output_display(physical_device, &pdevice->wsi_device, dpy, output, display); -} -#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */ - /* VK_EXT_display_control */ -VkResult -radv_DisplayPowerControlEXT(VkDevice _device, VkDisplayKHR display, - const VkDisplayPowerInfoEXT *display_power_info) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - - return wsi_display_power_control(_device, &device->physical_device->wsi_device, display, - display_power_info); -} - VkResult radv_RegisterDeviceEventEXT(VkDevice _device, const VkDeviceEventInfoEXT *device_event_info, const VkAllocationCallbacks *allocator, VkFence *_fence) @@ -281,30 +121,3 @@ radv_RegisterDisplayEventEXT(VkDevice _device, VkDisplayKHR display, return ret; } - -VkResult -radv_GetSwapchainCounterEXT(VkDevice _device, VkSwapchainKHR swapchain, - VkSurfaceCounterFlagBitsEXT flag_bits, uint64_t *value) -{ - RADV_FROM_HANDLE(radv_device, device, _device); - - return wsi_get_swapchain_counter(_device, &device->physical_device->wsi_device, 
swapchain, - flag_bits, value); -} - -VkResult -radv_AcquireDrmDisplayEXT(VkPhysicalDevice physicalDevice, int32_t drmFd, VkDisplayKHR display) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); - - return wsi_acquire_drm_display(physicalDevice, &pdevice->wsi_device, drmFd, display); -} - -VkResult radv_GetDrmDisplayEXT(VkPhysicalDevice physicalDevice, int32_t drmFd, uint32_t connectorId, - VkDisplayKHR* display) -{ - RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); - - return wsi_get_drm_display(physicalDevice, &pdevice->wsi_device, drmFd, connectorId, display); -} - diff --git a/mesa 3D driver/src/amd/vulkan/si_cmd_buffer.c b/mesa 3D driver/src/amd/vulkan/si_cmd_buffer.c index a4471d8791..8826682087 100644 --- a/mesa 3D driver/src/amd/vulkan/si_cmd_buffer.c +++ b/mesa 3D driver/src/amd/vulkan/si_cmd_buffer.c @@ -79,6 +79,9 @@ si_emit_compute(struct radv_device *device, struct radeon_cmdbuf *cs) radeon_emit(cs, 0); radeon_emit(cs, 0); + radeon_set_sh_reg(cs, R_00B834_COMPUTE_PGM_HI, + S_00B834_DATA(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ @@ -106,12 +109,12 @@ si_emit_compute(struct radv_device *device, struct radeon_cmdbuf *cs) } if (device->physical_device->rad_info.chip_class >= GFX10) { - radeon_set_sh_reg(cs, R_00B890_COMPUTE_USER_ACCUM_0, 0); - radeon_set_sh_reg(cs, R_00B894_COMPUTE_USER_ACCUM_1, 0); - radeon_set_sh_reg(cs, R_00B898_COMPUTE_USER_ACCUM_2, 0); - radeon_set_sh_reg(cs, R_00B89C_COMPUTE_USER_ACCUM_3, 0); - radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); - radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); + radeon_set_sh_reg_seq(cs, R_00B890_COMPUTE_USER_ACCUM_0, 5); + radeon_emit(cs, 0); /* R_00B890_COMPUTE_USER_ACCUM_0 */ + radeon_emit(cs, 0); /* R_00B894_COMPUTE_USER_ACCUM_1 */ + radeon_emit(cs, 0); /* R_00B898_COMPUTE_USER_ACCUM_2 */ + radeon_emit(cs, 0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */ + radeon_emit(cs, 0); /* R_00B8A0_COMPUTE_PGM_RSRC3 */ } /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID @@ -136,8 +139,7 @@ si_emit_compute(struct radv_device *device, struct radeon_cmdbuf *cs) assert(device->physical_device->rad_info.chip_class == GFX8); - tba_va = radv_buffer_get_va(device->trap_handler_shader->bo) + - device->trap_handler_shader->bo_offset; + tba_va = radv_shader_variant_get_va(device->trap_handler_shader); tma_va = radv_buffer_get_va(device->tma_bo); radeon_set_sh_reg_seq(cs, R_00B838_COMPUTE_TBA_LO, 4); @@ -291,6 +293,23 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) radeon_set_context_reg(cs, R_028408_VGT_INDX_OFFSET, 0); } + if (device->physical_device->rad_info.chip_class >= GFX10) { + radeon_set_sh_reg(cs, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg(cs, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + } else if (device->physical_device->rad_info.chip_class == GFX9) { + radeon_set_sh_reg(cs, R_00B414_SPI_SHADER_PGM_HI_LS, + S_00B414_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg(cs, R_00B214_SPI_SHADER_PGM_HI_ES, + S_00B214_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + } else { + radeon_set_sh_reg(cs, R_00B524_SPI_SHADER_PGM_HI_LS, + 
S_00B524_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + radeon_set_sh_reg(cs, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(device->physical_device->rad_info.address32_hi >> 8)); + } + unsigned cu_mask_ps = 0xffffffff; /* It's wasteful to enable all CUs for PS if shader arrays have a @@ -349,6 +368,15 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) radeon_set_context_reg(cs, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512)); radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14); + /* Vulkan doesn't support user edge flags and it also doesn't + * need to prevent drawing lines on internal edges of + * decomposed primitives (such as quads) with polygon mode = lines. + */ + unsigned vertex_reuse_depth = physical_device->rad_info.chip_class >= GFX10_3 ? 30 : 0; + radeon_set_context_reg(cs, R_028838_PA_CL_NGG_CNTL, + S_028838_INDEX_BUF_EDGE_FLAG_ENA(0) | + S_028838_VERTEX_REUSE_DEPTH(vertex_reuse_depth)); + /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */ unsigned meta_write_policy, meta_read_policy; @@ -379,22 +407,26 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA)); radeon_set_context_reg(cs, R_028428_CB_COVERAGE_OUT_CONTROL, 0); - radeon_set_sh_reg(cs, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 0); - radeon_set_sh_reg(cs, R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1, 0); - radeon_set_sh_reg(cs, R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2, 0); - radeon_set_sh_reg(cs, R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3, 0); - radeon_set_sh_reg(cs, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 0); - radeon_set_sh_reg(cs, R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1, 0); - radeon_set_sh_reg(cs, R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2, 0); - radeon_set_sh_reg(cs, R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3, 0); - radeon_set_sh_reg(cs, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 0); - radeon_set_sh_reg(cs, R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1, 0); - radeon_set_sh_reg(cs, R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2, 0); - radeon_set_sh_reg(cs, R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3, 0); - radeon_set_sh_reg(cs, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 0); - radeon_set_sh_reg(cs, R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1, 0); - radeon_set_sh_reg(cs, R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2, 0); - radeon_set_sh_reg(cs, R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3, 0); + radeon_set_sh_reg_seq(cs, R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0, 4); + radeon_emit(cs, 0); /* R_00B0C8_SPI_SHADER_USER_ACCUM_PS_0 */ + radeon_emit(cs, 0); /* R_00B0CC_SPI_SHADER_USER_ACCUM_PS_1 */ + radeon_emit(cs, 0); /* R_00B0D0_SPI_SHADER_USER_ACCUM_PS_2 */ + radeon_emit(cs, 0); /* R_00B0D4_SPI_SHADER_USER_ACCUM_PS_3 */ + radeon_set_sh_reg_seq(cs, R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0, 4); + radeon_emit(cs, 0); /* R_00B1C8_SPI_SHADER_USER_ACCUM_VS_0 */ + radeon_emit(cs, 0); /* R_00B1CC_SPI_SHADER_USER_ACCUM_VS_1 */ + radeon_emit(cs, 0); /* R_00B1D0_SPI_SHADER_USER_ACCUM_VS_2 */ + radeon_emit(cs, 0); /* R_00B1D4_SPI_SHADER_USER_ACCUM_VS_3 */ + radeon_set_sh_reg_seq(cs, R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0, 4); + radeon_emit(cs, 0); /* R_00B2C8_SPI_SHADER_USER_ACCUM_ESGS_0 */ + radeon_emit(cs, 0); /* R_00B2CC_SPI_SHADER_USER_ACCUM_ESGS_1 */ + radeon_emit(cs, 0); /* R_00B2D0_SPI_SHADER_USER_ACCUM_ESGS_2 */ + radeon_emit(cs, 0); /* R_00B2D4_SPI_SHADER_USER_ACCUM_ESGS_3 */ + radeon_set_sh_reg_seq(cs, R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0, 4); + radeon_emit(cs, 0); /* R_00B4C8_SPI_SHADER_USER_ACCUM_LSHS_0 */ + radeon_emit(cs, 0); /* R_00B4CC_SPI_SHADER_USER_ACCUM_LSHS_1 
*/ + radeon_emit(cs, 0); /* R_00B4D0_SPI_SHADER_USER_ACCUM_LSHS_2 */ + radeon_emit(cs, 0); /* R_00B4D4_SPI_SHADER_USER_ACCUM_LSHS_3 */ radeon_set_sh_reg(cs, R_00B0C0_SPI_SHADER_REQ_CTRL_PS, S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1)); @@ -499,8 +531,7 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) assert(device->physical_device->rad_info.chip_class == GFX8); - tba_va = radv_buffer_get_va(device->trap_handler_shader->bo) + - device->trap_handler_shader->bo_offset; + tba_va = radv_shader_variant_get_va(device->trap_handler_shader); tma_va = radv_buffer_get_va(device->tma_bo); uint32_t regs[] = {R_00B000_SPI_SHADER_TBA_LO_PS, R_00B100_SPI_SHADER_TBA_LO_VS, @@ -516,6 +547,11 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs) } } + /* The DX10 diamond test is unnecessary with Vulkan and it decreases line rasterization + * performance. + */ + radeon_set_context_reg(cs, R_028BDC_PA_SC_LINE_CNTL, 0); + si_emit_compute(device, cs); } @@ -1321,6 +1357,9 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) if (unlikely(cmd_buffer->device->trace_bo)) radv_cmd_buffer_trace_emit(cmd_buffer); + if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2) + cmd_buffer->state.rb_noncoherent_dirty = false; + /* Clear the caches that have been flushed to avoid syncing too much * when there is some pending active queries. */ diff --git a/mesa 3D driver/src/android_stub/cutils_stub.cpp b/mesa 3D driver/src/android_stub/cutils_stub.cpp index b660ec9d6d..449ae26732 100644 --- a/mesa 3D driver/src/android_stub/cutils_stub.cpp +++ b/mesa 3D driver/src/android_stub/cutils_stub.cpp @@ -1,10 +1,32 @@ #include +#include extern "C" { -int property_get(const char* key, char* value, const char* default_value) +int +property_get(const char *key, char *value, const char *default_value) { return 0; } +void +atrace_begin_body(const char * /*name*/) +{ +} + +void +atrace_end_body() +{ +} + +void +atrace_init() +{ +} + +uint64_t +atrace_get_enabled_tags() +{ + return ATRACE_TAG_NOT_READY; +} } diff --git a/mesa 3D driver/src/broadcom/ci/deqp-v3d-rpi4-gles.toml b/mesa 3D driver/src/broadcom/ci/deqp-v3d-rpi4-gles.toml index 32a569344d..659a4ca9c7 100644 --- a/mesa 3D driver/src/broadcom/ci/deqp-v3d-rpi4-gles.toml +++ b/mesa 3D driver/src/broadcom/ci/deqp-v3d-rpi4-gles.toml @@ -8,6 +8,8 @@ deqp_args = [ "--deqp-surface-width=256", "--deqp-visibility=hidden", ] +version_check = "GL ES 3.1.*git" +renderer_check = "V3D" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt b/mesa 3D driver/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt index 91c1d598c6..0d22f002db 100644 --- a/mesa 3D driver/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt +++ b/mesa 3D driver/src/broadcom/ci/deqp-v3dv-rpi4-flakes.txt @@ -1,28 +1,5 @@ dEQP-VK.api.external.fence.opaque_fd.reset_permanent dEQP-VK.api.external.fence.opaque_fd.reset_temporary -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.1B_out_of_memory_with_scalar_u32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.32B_out_of_memory_with_vec4_s32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.32B_out_of_memory_with_vec4_u32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.3B_out_of_memory_with_scalar_u32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.3B_out_of_memory_with_vec4_f32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.3B_out_of_memory_with_vec4_u32 
-dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.4B_out_of_memory_with_scalar_f32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.4B_out_of_memory_with_scalar_s32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.4B_out_of_memory_with_scalar_u32 -dEQP-VK.robustness.buffer_access.through_pointers.compute.reads.4B_out_of_memory_with_vec4_s32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.fragment.16B_out_of_memory_with_scalar_u32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.fragment.1B_out_of_memory_with_scalar_f32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.fragment.4B_out_of_memory_with_scalar_f32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.fragment.4B_out_of_memory_with_scalar_u32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.fragment.4B_out_of_memory_with_vec4_s32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.16B_out_of_memory_with_vec4_f32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.1B_out_of_memory_with_vec4_s32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.32B_out_of_memory_with_scalar_s32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.3B_out_of_memory_with_scalar_f32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.3B_out_of_memory_with_vec4_u32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.4B_out_of_memory_with_scalar_f32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.4B_out_of_memory_with_scalar_s32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.4B_out_of_memory_with_scalar_u32 -dEQP-VK.robustness.buffer_access.through_pointers.graphics.reads.vertex.4B_out_of_memory_with_vec4_s32 +dEQP-VK.api.external.fence.opaque_fd.signal_export_import_wait_permanent dEQP-VK.ssbo.layout.instance_array_basic_type.std430.uvec4 dEQP-VK.wsi.display.get_display_plane_capabilities diff --git a/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-fails.txt b/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-fails.txt index d0722563e6..4bc5a8fe3b 100644 --- a/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-fails.txt +++ b/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-fails.txt @@ -1,15 +1,11 @@ -KHR-GLES2.core.internalformat.copy_tex_image.alpha8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance4_alpha4_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance8_alpha8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.luminance8_oes,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgb565,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgb5_a1,Fail -KHR-GLES2.core.internalformat.copy_tex_image.rgba4,Fail KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail + +# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3133 KHR-GLES2.texture_3d.copy_sub_image.negative,Fail KHR-GLES2.texture_3d.copy_sub_image.rgba,Fail + KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_clamp,Fail KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_mirror,Fail KHR-GLES2.texture_3d.filtering.combinations.linear_linear_clamp_clamp_repeat,Fail @@ -334,7 +330,10 @@ 
KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_mirror_repeat KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_clamp,Fail KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_mirror,Fail KHR-GLES2.texture_3d.filtering.combinations.nearest_nearest_repeat_repeat_repeat,Fail + +# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3134 KHR-GLES2.texture_3d.filtering.combinations.negative,Fail + KHR-GLES2.texture_3d.filtering.formats.rgba8_linear,Fail KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_linear,Fail KHR-GLES2.texture_3d.filtering.formats.rgba8_linear_mipmap_nearest,Fail @@ -403,8 +402,6 @@ dEQP-GLES2.functional.draw.draw_arrays.line_loop.multiple_attributes,Fail dEQP-GLES2.functional.draw.draw_arrays.line_loop.single_attribute,Fail dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail -dEQP-GLES2.functional.negative_api.shader.uniform_matrixfv_invalid_transpose,Fail -dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level_array_compressed,Fail dEQP-GLES2.functional.negative_api.vertex_array.vertex_attrib,Fail dEQP-GLES2.functional.negative_api.vertex_array.vertex_attribv,Fail dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail diff --git a/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-gles.toml b/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-gles.toml index 4ca3ab0323..218cb1835b 100644 --- a/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-gles.toml +++ b/mesa 3D driver/src/broadcom/ci/deqp-vc4-rpi3-gles.toml @@ -9,6 +9,8 @@ deqp_args = [ "--deqp-surface-width=256", "--deqp-visibility=hidden", ] +version_check = "GL ES 2.0.*git" +renderer_check = "VC4" [[deqp]] deqp = "/deqp/external/openglcts/modules/glcts" diff --git a/mesa 3D driver/src/broadcom/ci/gitlab-ci.yml b/mesa 3D driver/src/broadcom/ci/gitlab-ci.yml index 165f995993..4f70ef1e1d 100644 --- a/mesa 3D driver/src/broadcom/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/broadcom/ci/gitlab-ci.yml @@ -6,7 +6,6 @@ variables: BM_BOOTFS: /boot/raspberrypi_armhf BM_ROOTFS: /rootfs-armhf - DEQP_EXPECTED_RENDERER: VC4 GPU_VERSION: vc4-rpi3 HWCI_KERNEL_MODULES: vc4 FLAKES_CHANNEL: "#videocore-ci" @@ -26,7 +25,6 @@ vc4-rpi3-gles:armhf: variables: HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" DEQP_SUITE: vc4-rpi3-gles - DEQP_VER: gles2 vc4-rpi3-egl:armhf: extends: @@ -74,7 +72,6 @@ vc4-rpi3-piglit-quick_shader:armhf: BM_BOOTFS: /boot/raspberrypi_armhf BM_POE_TIMEOUT: 300 BM_ROOTFS: /rootfs-armhf - DEQP_EXPECTED_RENDERER: V3D FLAKES_CHANNEL: "#videocore-ci" GPU_VERSION: v3d-rpi4 HWCI_KERNEL_MODULES: v3d,vc4 @@ -92,7 +89,6 @@ v3d-rpi4-gles:armhf: parallel: 8 variables: DEQP_SUITE: v3d-rpi4-gles - DEQP_VER: gles31 v3d-rpi4-egl:armhf: extends: @@ -123,7 +119,7 @@ v3dv-rpi4-vk:arm64: BM_BOOTFS: /boot/raspberrypi_arm64 BM_POE_TIMEOUT: 300 BM_ROOTFS: /rootfs-arm64 - DEQP_EXPECTED_RENDERER: "V3D 4.2" + DEQP_EXPECTED_RENDERER: "V3D.4.2" DEQP_FRACTION: 5 DEQP_VER: vk FLAKES_CHANNEL: "#videocore-ci" diff --git a/mesa 3D driver/src/broadcom/ci/piglit-v3d-rpi4-fails.txt b/mesa 3D driver/src/broadcom/ci/piglit-v3d-rpi4-fails.txt index 4557a55562..b52c0a5431 100644 --- a/mesa 3D driver/src/broadcom/ci/piglit-v3d-rpi4-fails.txt +++ b/mesa 3D driver/src/broadcom/ci/piglit-v3d-rpi4-fails.txt @@ -205,8 +205,6 @@ spec@ext_framebuffer_multisample@blit-mismatched-formats,Fail spec@ext_framebuffer_multisample@interpolation 2 centroid-edges,Fail 
spec@ext_framebuffer_multisample@interpolation 4 centroid-edges,Fail spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail -spec@ext_framebuffer_object@fbo-blending-formats,Fail -spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB10,Fail spec@ext_framebuffer_object@getteximage-formats init-by-clear-and-render,Fail spec@ext_framebuffer_object@getteximage-formats init-by-rendering,Fail spec@ext_gpu_shader4@execution@texelfetch@fs-texelfetch-isampler1darray,Fail diff --git a/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-fails.txt b/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-fails.txt index edd69894fc..9ba16c458e 100644 --- a/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-fails.txt +++ b/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-fails.txt @@ -22,6 +22,7 @@ glx@glx_ext_import_context@query context info,Fail shaders@glsl-arb-fragment-coord-conventions,Fail shaders@glsl-bug-110796,Fail shaders@glsl-max-vertex-attrib,Fail +shaders@glsl-predication-on-large-array,Fail spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail spec@!opengl 1.0@gl-1.0-dlist-bitmap,Crash spec@!opengl 1.0@gl-1.0-drawbuffer-modes,Fail @@ -795,7 +796,6 @@ spec@!opengl 1.2@lodclamp-between-max,Fail spec@!opengl 1.2@mipmap-setup,Fail spec@!opengl 1.2@tex3d,Fail spec@!opengl 1.2@tex3d-maxsize,Fail -spec@!opengl 1.2@teximage-errors,Fail spec@!opengl 1.2@texwrap 3d proj,Fail spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- projected,Fail @@ -819,7 +819,6 @@ spec@!opengl 2.1@minmax,Fail spec@!opengl 2.1@pbo,Fail spec@!opengl 2.1@pbo@test_polygon_stip,Fail spec@!opengl 2.1@polygon-stipple-fs,Fail -spec@!opengl es 2.0@draw_buffers_gles2,Fail spec@arb_arrays_of_arrays@execution@glsl-arrays-copy-size-mismatch,Fail spec@arb_depth_texture@depth-level-clamp,Fail spec@arb_depth_texture@texwrap formats,Fail @@ -834,7 +833,6 @@ spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- NPOT,Fail spec@arb_depth_texture@texwrap formats@GL_DEPTH_COMPONENT32- swizzled,Fail spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index,Crash spec@arb_draw_elements_base_vertex@arb_draw_elements_base_vertex-negative-index-user_varrays,Crash -spec@arb_es2_compatibility@arb_es2_compatibility-drawbuffers,Fail spec@arb_es2_compatibility@texwrap formats,Fail spec@arb_es2_compatibility@texwrap formats@GL_RGB565,Fail spec@arb_es2_compatibility@texwrap formats@GL_RGB565- NPOT,Fail @@ -930,8 +928,6 @@ spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_ spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail spec@egl_khr_surfaceless_context@viewport,Fail spec@egl_mesa_configless_context@basic,Fail -spec@ext_direct_state_access@indexed-state-queries 12,Fail -spec@ext_direct_state_access@indexed-state-queries 12@GetIntegerIndexedvEXT,Fail spec@ext_direct_state_access@multi-texture,Crash spec@ext_direct_state_access@multi-texture@MultiTexImage3DEXT,Fail spec@ext_direct_state_access@multi-texture@MultiTexSubImage1DEXT,Fail @@ -945,21 +941,9 @@ spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_1D + glTex*,Fail spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE,Fail spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex* + display list GL_COMPILE_AND_EXECUTE,Fail spec@ext_direct_state_access@textures@GL_PROXY_TEXTURE_3D + glTex*,Fail -spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE,Fail 
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail -spec@ext_direct_state_access@textures@TextureImage2DEXT,Fail spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE,Fail spec@ext_direct_state_access@textures@TextureImage3DEXT + display list GL_COMPILE_AND_EXECUTE,Fail spec@ext_direct_state_access@textures@TextureImage3DEXT,Fail -spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE,Fail -spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE_AND_EXECUTE,Fail -spec@ext_direct_state_access@textures@TextureParameterfEXT,Fail -spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE,Fail -spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE_AND_EXECUTE,Fail -spec@ext_direct_state_access@textures@TextureParameteriEXT,Fail -spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE,Fail -spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE_AND_EXECUTE,Fail -spec@ext_direct_state_access@textures@TextureParameterivEXT,Fail spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE,Fail spec@ext_direct_state_access@textures@TextureSubImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail spec@ext_direct_state_access@textures@TextureSubImage2DEXT,Fail diff --git a/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-skips.txt b/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-skips.txt index ae25a28bb9..d1a0554e69 100644 --- a/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-skips.txt +++ b/mesa 3D driver/src/broadcom/ci/piglit-vc4-rpi3-skips.txt @@ -4,6 +4,7 @@ glx@glx-multithread-texture spec@arb_internalformat_query2@all internalformat__type pname checks spec@!opengl 1.1@streaming-texture-leak spec@!opengl 1.0@gl-1.0-blend-func +shaders@glsl-predication-on-large-array # Extensions not supported spec@arb_gpu_shader_fp64.* diff --git a/mesa 3D driver/src/broadcom/clif/clif_dump.c b/mesa 3D driver/src/broadcom/clif/clif_dump.c index bf84c0b962..ede6f42eed 100644 --- a/mesa 3D driver/src/broadcom/clif/clif_dump.c +++ b/mesa 3D driver/src/broadcom/clif/clif_dump.c @@ -52,7 +52,7 @@ clif_dump_add_address_to_worklist(struct clif_dump *clif, struct clif_dump * clif_dump_init(const struct v3d_device_info *devinfo, - FILE *out, bool pretty) + FILE *out, bool pretty, bool nobin) { struct clif_dump *clif = rzalloc(NULL, struct clif_dump); @@ -60,6 +60,7 @@ clif_dump_init(const struct v3d_device_info *devinfo, clif->out = out; clif->spec = v3d_spec_load(devinfo); clif->pretty = pretty; + clif->nobin = nobin; list_inithead(&clif->worklist); @@ -159,7 +160,8 @@ clif_dump_cl(struct clif_dump *clif, uint32_t start, uint32_t end, static uint32_t clif_dump_gl_shader_state_record(struct clif_dump *clif, struct reloc_worklist_entry *reloc, - void *vaddr) + void *vaddr, + bool including_gs) { struct v3d_group *state = v3d_spec_find_struct(clif->spec, "GL Shader State Record"); @@ -169,6 +171,16 @@ clif_dump_gl_shader_state_record(struct clif_dump *clif, assert(attr); uint32_t offset = 0; + if (including_gs) { + struct v3d_group *gs_state = v3d_spec_find_struct(clif->spec, + "Geometry Shader State Record"); + assert(gs_state); + out(clif, "@format shadrec_gl_geom\n"); + v3d_print_group(clif, gs_state, 0, vaddr + offset); + offset += v3d_group_get_length(gs_state); + /* Extra pad when geometry/tessellation shader is present */ + offset += 20; + } 
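+   /* Resulting layout when a geometry/tessellation stage is bound, as
+    * computed above (offsets illustrative, not taken from the packet spec):
+    *
+    *    0 ............................... Geometry Shader State Record
+    *    v3d_group_get_length(gs_state) .. 20 bytes of padding
+    *    + 20 ............................ main GL Shader State Record
+    *    then one record per attribute array, handled below.
+    */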
out(clif, "@format shadrec_gl_main\n"); v3d_print_group(clif, state, 0, vaddr + offset); offset += v3d_group_get_length(state); @@ -200,6 +212,7 @@ clif_process_worklist(struct clif_dump *clif) break; case reloc_gl_shader_state: + case reloc_gl_including_gs_shader_state: break; case reloc_generic_tile_list: clif_dump_cl(clif, reloc->addr, @@ -238,6 +251,9 @@ static void clif_dump_binary(struct clif_dump *clif, struct clif_bo *bo, uint32_t start, uint32_t end) { + if (clif->pretty && clif->nobin) + return; + if (start == end) return; @@ -332,10 +348,12 @@ clif_dump_buffers(struct clif_dump *clif) break; case reloc_gl_shader_state: + case reloc_gl_including_gs_shader_state: offset += clif_dump_gl_shader_state_record(clif, reloc, bo->vaddr + - offset); + offset, + reloc->type == reloc_gl_including_gs_shader_state); break; case reloc_generic_tile_list: offset = clif_dump_cl(clif, reloc->addr, diff --git a/mesa 3D driver/src/broadcom/clif/clif_dump.h b/mesa 3D driver/src/broadcom/clif/clif_dump.h index 8de3a2cbea..63f3ae77d5 100644 --- a/mesa 3D driver/src/broadcom/clif/clif_dump.h +++ b/mesa 3D driver/src/broadcom/clif/clif_dump.h @@ -32,7 +32,7 @@ struct clif_dump; struct drm_v3d_submit_cl; struct clif_dump *clif_dump_init(const struct v3d_device_info *devinfo, - FILE *output, bool pretty); + FILE *output, bool pretty, bool nobin); void clif_dump(struct clif_dump *clif, const struct drm_v3d_submit_cl *submit); void clif_dump_destroy(struct clif_dump *clif); diff --git a/mesa 3D driver/src/broadcom/clif/clif_private.h b/mesa 3D driver/src/broadcom/clif/clif_private.h index 597d0b5067..6ace62b031 100644 --- a/mesa 3D driver/src/broadcom/clif/clif_private.h +++ b/mesa 3D driver/src/broadcom/clif/clif_private.h @@ -54,11 +54,17 @@ struct clif_dump { * output. */ bool pretty; + + /** + * Flag to no dump the binary resources. 
+ */ + bool nobin; }; enum reloc_worklist_type { reloc_cl, reloc_gl_shader_state, + reloc_gl_including_gs_shader_state, reloc_generic_tile_list, }; diff --git a/mesa 3D driver/src/broadcom/clif/v3dx_dump.c b/mesa 3D driver/src/broadcom/clif/v3dx_dump.c index 9cf59f8892..454478531f 100644 --- a/mesa 3D driver/src/broadcom/clif/v3dx_dump.c +++ b/mesa 3D driver/src/broadcom/clif/v3dx_dump.c @@ -94,6 +94,25 @@ v3dX(clif_dump_packet)(struct clif_dump *clif, uint32_t offset, return true; } +#if V3D_VERSION >= 41 + case V3DX(GL_SHADER_STATE_INCLUDING_GS_opcode): { + struct V3DX(GL_SHADER_STATE_INCLUDING_GS) values; + V3DX(GL_SHADER_STATE_INCLUDING_GS_unpack)(cl, &values); + + if (reloc_mode) { + struct reloc_worklist_entry *reloc = + clif_dump_add_address_to_worklist(clif, + reloc_gl_including_gs_shader_state, + values.address); + if (reloc) { + reloc->shader_state.num_attrs = + values.number_of_attribute_arrays; + } + } + return true; + } +#endif /* V3D_VERSION >= 41 */ + #if V3D_VERSION < 40 case V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED_opcode): { struct V3DX(STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED) values; diff --git a/mesa 3D driver/src/broadcom/common/v3d_debug.c b/mesa 3D driver/src/broadcom/common/v3d_debug.c index 5352e3d2f9..508a2b7c74 100644 --- a/mesa 3D driver/src/broadcom/common/v3d_debug.c +++ b/mesa 3D driver/src/broadcom/common/v3d_debug.c @@ -42,6 +42,8 @@ uint32_t V3D_DEBUG = 0; static const struct debug_named_value debug_control[] = { { "cl", V3D_DEBUG_CL, "Dump command list during creation" }, + { "cl_nobin", V3D_DEBUG_CL_NO_BIN, + "Dump command list during creation, excluding binary resources" }, { "clif", V3D_DEBUG_CLIF, "Dump command list (CLIF format) during creation", }, { "qpu", V3D_DEBUG_QPU, @@ -84,6 +86,8 @@ static const struct debug_named_value debug_control[] = { */ { "tmu16", V3D_DEBUG_TMU_16BIT, "Force 16-bit precision on all TMU operations" }, + { "noloopunroll", V3D_DEBUG_NO_LOOP_UNROLL, + "Disable loop unrolling" }, { NULL } }; diff --git a/mesa 3D driver/src/broadcom/common/v3d_debug.h b/mesa 3D driver/src/broadcom/common/v3d_debug.h index 21ae4ce782..72d632568d 100644 --- a/mesa 3D driver/src/broadcom/common/v3d_debug.h +++ b/mesa 3D driver/src/broadcom/common/v3d_debug.h @@ -61,6 +61,8 @@ extern uint32_t V3D_DEBUG; #define V3D_DEBUG_DUMP_SPIRV (1 << 17) #define V3D_DEBUG_TMU_32BIT (1 << 18) #define V3D_DEBUG_TMU_16BIT (1 << 19) +#define V3D_DEBUG_NO_LOOP_UNROLL (1 << 20) +#define V3D_DEBUG_CL_NO_BIN (1 << 21) #define V3D_DEBUG_SHADERS (V3D_DEBUG_TGSI | V3D_DEBUG_NIR | \ V3D_DEBUG_VIR | V3D_DEBUG_QPU | \ @@ -83,11 +85,6 @@ extern uint32_t V3D_DEBUG; #define dbg_printf(...) fprintf(stderr, __VA_ARGS__) #endif /* HAVE_ANDROID_PLATFORM */ -#define DBG(flag, ...) do { \ - if (unlikely(V3D_DEBUG & (flag))) \ - dbg_printf(__VA_ARGS__); \ -} while(0) - extern uint32_t v3d_debug_flag_for_shader_stage(gl_shader_stage stage); extern void v3d_process_debug_variable(void); diff --git a/mesa 3D driver/src/broadcom/compiler/nir_to_vir.c b/mesa 3D driver/src/broadcom/compiler/nir_to_vir.c index b0cc4977c0..d0a89f1a7d 100644 --- a/mesa 3D driver/src/broadcom/compiler/nir_to_vir.c +++ b/mesa 3D driver/src/broadcom/compiler/nir_to_vir.c @@ -1670,6 +1670,15 @@ vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt) static void emit_frag_end(struct v3d_compile *c) { + /* If the shader has no non-TLB side effects and doesn't write Z + * we can promote it to enabling early_fragment_tests even + * if the user didn't.
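+    * Concretely, per the check below: promotion requires that the shader
+    * never writes Z (c->output_position_index == -1) and declares no
+    * images or SSBOs, since such side effects must not land for fragments
+    * that would later fail the depth test.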
+ */ + if (c->output_position_index == -1 && + !(c->s->info.num_images || c->s->info.num_ssbos)) { + c->s->info.fs.early_fragment_tests = true; + } + if (c->output_sample_mask_index != -1) { vir_SETMSF_dest(c, vir_nop_reg(), vir_AND(c, @@ -1694,7 +1703,8 @@ emit_frag_end(struct v3d_compile *c) } struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); - if (c->output_position_index != -1) { + if (c->output_position_index != -1 && + !c->s->info.fs.early_fragment_tests) { struct qinst *inst = vir_MOV_dest(c, tlbu_reg, c->outputs[c->output_position_index]); uint8_t tlb_specifier = TLB_TYPE_DEPTH; @@ -3903,9 +3913,25 @@ vir_remove_thrsw(struct v3d_compile *c) c->last_thrsw = NULL; } -void -vir_emit_last_thrsw(struct v3d_compile *c) +/** + * This makes sure we have a top-level last thread switch which signals the + * start of the last thread section, which may include adding a new thrsw + * instruction if needed. We don't allow spilling in the last thread section, so + * if we need to do any spills that inject additional thread switches later on, + * we ensure this thread switch will still be the last thread switch in the + * program, which makes last thread switch signalling a lot easier when we have + * spilling. If in the end we don't need to spill to compile the program and we + * injected a new thread switch instruction here only for that, we will + * eventually restore the previous last thread switch and remove the one we + * added here. + */ +static void +vir_emit_last_thrsw(struct v3d_compile *c, + struct qinst **restore_last_thrsw, + bool *restore_scoreboard_lock) { + *restore_last_thrsw = c->last_thrsw; + /* On V3D before 4.1, we need a TMU op to be outstanding when thread * switching, so disable threads if we didn't do any TMU ops (each of * which would have emitted a THRSW). @@ -3914,7 +3940,7 @@ vir_emit_last_thrsw(struct v3d_compile *c) c->threads = 1; if (c->last_thrsw) vir_remove_thrsw(c); - return; + *restore_last_thrsw = NULL; } /* If we're threaded and the last THRSW was in conditional code, then @@ -3937,8 +3963,34 @@ vir_emit_last_thrsw(struct v3d_compile *c) vir_emit_thrsw(c); } + /* If we have not inserted a last thread switch yet, do it now to ensure + * any potential spilling we do happens before this. If we don't spill + * in the end, we will restore the previous one. + */ + if (*restore_last_thrsw == c->last_thrsw) { + if (*restore_last_thrsw) + (*restore_last_thrsw)->is_last_thrsw = false; + *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw; + vir_emit_thrsw(c); + } else { + *restore_last_thrsw = c->last_thrsw; + } + + assert(c->last_thrsw); + c->last_thrsw->is_last_thrsw = true; +} + +static void +vir_restore_last_thrsw(struct v3d_compile *c, + struct qinst *thrsw, + bool scoreboard_lock) +{ + assert(c->last_thrsw); + vir_remove_instruction(c, c->last_thrsw); + c->last_thrsw = thrsw; if (c->last_thrsw) c->last_thrsw->is_last_thrsw = true; + c->lock_scoreboard_on_first_thrsw = scoreboard_lock; } /* There's a flag in the shader for "center W is needed for reasons other than @@ -3976,8 +4028,14 @@ v3d_nir_to_vir(struct v3d_compile *c) nir_to_vir(c); + bool restore_scoreboard_lock = false; + struct qinst *restore_last_thrsw; + /* Emit the last THRSW before STVPM and TLB writes. 
*/ - vir_emit_last_thrsw(c); + vir_emit_last_thrsw(c, + &restore_last_thrsw, + &restore_scoreboard_lock); + switch (c->s->info.stage) { case MESA_SHADER_FRAGMENT: @@ -4076,6 +4134,12 @@ v3d_nir_to_vir(struct v3d_compile *c) vir_remove_thrsw(c); } + /* If we didn't spill, then remove the last thread switch we injected + * artificially (if any) and restore the previous one. + */ + if (!c->spills && c->last_thrsw != restore_last_thrsw) + vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); + if (c->spills && (V3D_DEBUG & (V3D_DEBUG_VIR | v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { diff --git a/mesa 3D driver/src/broadcom/compiler/qpu_schedule.c b/mesa 3D driver/src/broadcom/compiler/qpu_schedule.c index c559814b9e..63436735f5 100644 --- a/mesa 3D driver/src/broadcom/compiler/qpu_schedule.c +++ b/mesa 3D driver/src/broadcom/compiler/qpu_schedule.c @@ -1648,6 +1648,14 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c, if (v3d_qpu_writes_flags(&qinst->qpu)) return false; + /* TSY sync ops materialize at the point of the next thread switch, + * therefore, if we have a TSY sync right after a thread switch, we + * cannot place it in its delay slots, or we would be moving the sync + * to the thrsw before it instead. + */ + if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID) + return false; + return true; } diff --git a/mesa 3D driver/src/broadcom/compiler/v3d_compiler.h b/mesa 3D driver/src/broadcom/compiler/v3d_compiler.h index 4b66e8621a..427b704071 100644 --- a/mesa 3D driver/src/broadcom/compiler/v3d_compiler.h +++ b/mesa 3D driver/src/broadcom/compiler/v3d_compiler.h @@ -1057,7 +1057,6 @@ void vir_set_unpack(struct qinst *inst, int src, void vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack); struct qreg vir_get_temp(struct v3d_compile *c); -void vir_emit_last_thrsw(struct v3d_compile *c); void vir_calculate_live_intervals(struct v3d_compile *c); int vir_get_nsrc(struct qinst *inst); bool vir_has_side_effects(struct v3d_compile *c, struct qinst *inst); @@ -1104,7 +1103,6 @@ void v3d_nir_lower_robust_buffer_access(nir_shader *shader, struct v3d_compile * void v3d_nir_lower_scratch(nir_shader *s); void v3d_nir_lower_txf_ms(nir_shader *s, struct v3d_compile *c); void v3d_nir_lower_image_load_store(nir_shader *s); -void vir_lower_uniforms(struct v3d_compile *c); void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components); void v3d33_vir_vpm_write_setup(struct v3d_compile *c); @@ -1416,30 +1414,6 @@ vir_TLB_COLOR_READ(struct v3d_compile *c) return vir_emit_def(c, ldtlb); } -/* -static inline struct qreg -vir_LOAD_IMM(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM, c->undef, - vir_reg(QFILE_LOAD_IMM, val), c->undef)); -} - -static inline struct qreg -vir_LOAD_IMM_U2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_U2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -static inline struct qreg -vir_LOAD_IMM_I2(struct v3d_compile *c, uint32_t val) -{ - return vir_emit_def(c, vir_inst(QOP_LOAD_IMM_I2, c->undef, - vir_reg(QFILE_LOAD_IMM, val), - c->undef)); -} -*/ - static inline struct qinst * vir_BRANCH(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) { diff --git a/mesa 3D driver/src/broadcom/compiler/v3d_nir_lower_io.c b/mesa 3D driver/src/broadcom/compiler/v3d_nir_lower_io.c index 655f74fd42..895b1a3916 100644 --- a/mesa 3D driver/src/broadcom/compiler/v3d_nir_lower_io.c +++ b/mesa 3D driver/src/broadcom/compiler/v3d_nir_lower_io.c @@ 
-24,6 +24,8 @@ #include "compiler/v3d_compiler.h" #include "compiler/nir/nir_builder.h" +#include "util/u_helpers.h" + /** * Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io * intrinsics into something amenable to the V3D architecture. @@ -325,6 +327,59 @@ v3d_nir_lower_vertex_input(struct v3d_compile *c, nir_builder *b, nir_intrinsic_set_component(instr, (comp + 2) % 4); } +/* Sometimes the origin of gl_PointCoord is in the upper left rather than the + * lower left, so we need to flip it. + * + * This is needed for Vulkan; Gallium uses lower_wpos_pntc instead. + */ +static void +v3d_nir_lower_fragment_input(struct v3d_compile *c, nir_builder *b, + nir_intrinsic_instr *intr) +{ + assert(c->s->info.stage == MESA_SHADER_FRAGMENT); + + /* Gallium uses lower_wpos_pntc */ + if (c->key->environment == V3D_ENVIRONMENT_OPENGL) + return; + + b->cursor = nir_after_instr(&intr->instr); + + int comp = nir_intrinsic_component(intr); + + nir_variable *input_var = + nir_find_variable_with_driver_location(c->s, + nir_var_shader_in, + nir_intrinsic_base(intr)); + + if (input_var && util_varying_is_point_coord(input_var->data.location, + c->fs_key->point_sprite_mask)) { + assert(intr->num_components == 1); + + nir_ssa_def *result = &intr->dest.ssa; + + switch (comp) { + case 0: + case 1: + if (!c->fs_key->is_points) + result = nir_imm_float(b, 0.0); + break; + case 2: + result = nir_imm_float(b, 0.0); + break; + case 3: + result = nir_imm_float(b, 1.0); + break; + } + if (c->fs_key->point_coord_upper_left && comp == 1) + result = nir_fsub(b, nir_imm_float(b, 1.0), result); + if (result != &intr->dest.ssa) { + nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, + result, + result->parent_instr); + } + } +} + static void v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, struct nir_instr *instr, @@ -338,6 +393,8 @@ v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b, case nir_intrinsic_load_input: if (c->s->info.stage == MESA_SHADER_VERTEX) v3d_nir_lower_vertex_input(c, b, intr); + else if (c->s->info.stage == MESA_SHADER_FRAGMENT) + v3d_nir_lower_fragment_input(c, b, intr); break; case nir_intrinsic_load_uniform: diff --git a/mesa 3D driver/src/broadcom/compiler/vir.c b/mesa 3D driver/src/broadcom/compiler/vir.c index 7f01f5d233..bf75a4da17 100644 --- a/mesa 3D driver/src/broadcom/compiler/vir.c +++ b/mesa 3D driver/src/broadcom/compiler/vir.c @@ -563,7 +563,8 @@ vir_compile_init(const struct v3d_compiler *compiler, c->fallback_scheduler = fallback_scheduler; c->disable_tmu_pipelining = disable_tmu_pipelining; c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; - c->disable_loop_unrolling = disable_loop_unrolling; + c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL + ? true : disable_loop_unrolling; s = nir_shader_clone(c, s); c->s = s; @@ -984,14 +985,6 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) /* The lowering pass can introduce new sysval reads */ nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s)); } - - /* If the shader has no non-TLB side effects, we can promote it to - * enabling early_fragment_tests even if the user didn't.
- */ - if (!(c->s->info.num_images || - c->s->info.num_ssbos)) { - c->s->info.fs.early_fragment_tests = true; - } } static void @@ -1877,6 +1870,24 @@ try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif) { uint32_t count = 20; struct qinst *prev_inst = NULL; + assert(c->cur_block); + +#ifdef DEBUG + /* We can only reuse a uniform if it was emitted in the same block, + * so callers must make sure the current instruction is being emitted + * in the current block. + */ + bool found = false; + vir_for_each_inst(inst, c->cur_block) { + if (&inst->link == c->cursor.link) { + found = true; + break; + } + } + + assert(found || &c->cur_block->instructions == c->cursor.link); +#endif + list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev, &c->cur_block->instructions, link) { if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) && diff --git a/mesa 3D driver/src/broadcom/compiler/vir_live_variables.c b/mesa 3D driver/src/broadcom/compiler/vir_live_variables.c index 6a849dc7fc..2fd6430a0f 100644 --- a/mesa 3D driver/src/broadcom/compiler/vir_live_variables.c +++ b/mesa 3D driver/src/broadcom/compiler/vir_live_variables.c @@ -175,7 +175,7 @@ vir_setup_def_use(struct v3d_compile *c) } if (inst->qpu.flags.auf != V3D_QPU_UF_NONE || - inst->qpu.flags.auf != V3D_QPU_UF_NONE) { + inst->qpu.flags.muf != V3D_QPU_UF_NONE) { flags_inst = NULL; } diff --git a/mesa 3D driver/src/broadcom/compiler/vir_register_allocate.c b/mesa 3D driver/src/broadcom/compiler/vir_register_allocate.c index ddb9957b8e..e26b790c94 100644 --- a/mesa 3D driver/src/broadcom/compiler/vir_register_allocate.c +++ b/mesa 3D driver/src/broadcom/compiler/vir_register_allocate.c @@ -26,8 +26,6 @@ #include "common/v3d_device_info.h" #include "v3d_compiler.h" -#define QPU_R(i) { .magic = false, .index = i } - #define ACC_INDEX 0 #define ACC_COUNT 6 #define PHYS_INDEX (ACC_INDEX + ACC_COUNT) @@ -261,7 +259,7 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) } struct qinst *last_thrsw = c->last_thrsw; - assert(!last_thrsw || last_thrsw->is_last_thrsw); + assert(last_thrsw && last_thrsw->is_last_thrsw); int start_num_temps = c->num_temps; @@ -347,29 +345,13 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp) spill_offset); } } - - /* If we didn't have a last-thrsw inserted by nir_to_vir and - * we've been inserting thrsws, then insert a new last_thrsw - * right before we start the vpm/tlb sequence for the last - * thread segment. - */ - if (!is_uniform && !last_thrsw && c->last_thrsw && - (v3d_qpu_writes_vpm(&inst->qpu) || - v3d_qpu_uses_tlb(&inst->qpu))) { - c->cursor = vir_before_inst(inst); - vir_emit_thrsw(c); - - last_thrsw = c->last_thrsw; - last_thrsw->is_last_thrsw = true; - } } } /* Make sure c->last_thrsw is the actual last thrsw, not just one we * inserted in our most recent unspill. */ - if (last_thrsw) - c->last_thrsw = last_thrsw; + c->last_thrsw = last_thrsw; /* Don't allow spilling of our spilling instructions. There's no way * they can help get things colored. 
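The two-step protocol documented above (vir_emit_last_thrsw / vir_restore_last_thrsw) is easiest to see end to end. A condensed sketch of the v3d_nir_to_vir flow introduced by this patch, paraphrased from the hunks above with driver details elided (not a literal copy of the code):

    bool restore_scoreboard_lock = false;
    struct qinst *restore_last_thrsw;

    /* Inject a last thrsw up front so that any thread switches added by
     * spill code during register allocation land before it. */
    vir_emit_last_thrsw(c, &restore_last_thrsw, &restore_scoreboard_lock);

    /* ... register allocation runs here and may emit spills ... */

    /* No spills in the end: the injected thrsw was unnecessary, so drop
     * it and reinstate the previous last thrsw (if there was one). */
    if (!c->spills && c->last_thrsw != restore_last_thrsw)
            vir_restore_last_thrsw(c, restore_last_thrsw,
                                   restore_scoreboard_lock);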
diff --git a/mesa 3D driver/src/broadcom/compiler/vir_to_qpu.c b/mesa 3D driver/src/broadcom/compiler/vir_to_qpu.c index aa33545420..634b8961ba 100644 --- a/mesa 3D driver/src/broadcom/compiler/vir_to_qpu.c +++ b/mesa 3D driver/src/broadcom/compiler/vir_to_qpu.c @@ -45,12 +45,6 @@ qpu_magic(enum v3d_qpu_waddr waddr) return reg; } -static inline struct qpu_reg -qpu_acc(int acc) -{ - return qpu_magic(V3D_QPU_WADDR_R0 + acc); -} - struct v3d_qpu_instr v3d_qpu_nop(void) { @@ -219,8 +213,13 @@ v3d_generate_code_block(struct v3d_compile *c, src[i] = qpu_magic(qinst->src[i].index); break; case QFILE_NULL: + /* QFILE_NULL is an undef, so we can load + * anything. Using reg 0 + */ + src[i] = qpu_reg(0); + break; case QFILE_LOAD_IMM: - src[i] = qpu_acc(0); + assert(!"not reached"); break; case QFILE_TEMP: src[i] = temp_registers[index]; @@ -238,7 +237,7 @@ v3d_generate_code_block(struct v3d_compile *c, temp = new_qpu_nop_before(qinst); temp->qpu.sig.ldvpm = true; - src[i] = qpu_acc(3); + src[i] = qpu_magic(V3D_QPU_WADDR_R3); break; } } diff --git a/mesa 3D driver/src/broadcom/qpu/qpu_pack.c b/mesa 3D driver/src/broadcom/qpu/qpu_pack.c index d70e9b77e1..eee1e9f95a 100644 --- a/mesa 3D driver/src/broadcom/qpu/qpu_pack.c +++ b/mesa 3D driver/src/broadcom/qpu/qpu_pack.c @@ -1001,7 +1001,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo, if (!desc) return false; - uint32_t opcode = opcode = desc->opcode_first; + uint32_t opcode = desc->opcode_first; /* If an operation doesn't use an arg, its mux values may be used to * identify the operation type. diff --git a/mesa 3D driver/src/broadcom/vulkan/meson.build b/mesa 3D driver/src/broadcom/vulkan/meson.build index 9d2593cf6d..9e0326b613 100644 --- a/mesa 3D driver/src/broadcom/vulkan/meson.build +++ b/mesa 3D driver/src/broadcom/vulkan/meson.build @@ -50,7 +50,6 @@ libv3dv_files = files( 'v3dv_query.c', 'v3dv_queue.c', 'v3dv_uniforms.c', - 'v3dv_util.c', 'v3dv_wsi.c', ) @@ -86,29 +85,18 @@ v3dv_deps = [ idep_nir, idep_nir_headers, idep_vulkan_util, + idep_vulkan_wsi, ] if with_platform_x11 v3dv_deps += dep_xcb_dri3 - v3dv_flags += [ - '-DVK_USE_PLATFORM_XCB_KHR', - '-DVK_USE_PLATFORM_XLIB_KHR', - ] - libv3dv_files += files('v3dv_wsi_x11.c') endif if with_platform_wayland v3dv_deps += [dep_wayland_client, dep_wl_protocols] - v3dv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR' - libv3dv_files += files('v3dv_wsi_wayland.c') libv3dv_files += [wayland_drm_client_protocol_h, wayland_drm_protocol_c] endif -if system_has_kms_drm and not with_platform_android - v3dv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR' - libv3dv_files += files('v3dv_wsi_display.c') -endif - per_version_libs = [] foreach ver : v3d_versions per_version_libs += static_library( @@ -116,7 +104,7 @@ foreach ver : v3d_versions [files_per_version, v3d_xml_pack, v3dv_entrypoints[0]], include_directories : [ inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom, - inc_compiler, inc_util, inc_vulkan_wsi, + inc_compiler, inc_util, ], c_args : [v3dv_flags, '-DV3D_VERSION=' + ver], gnu_symbol_visibility : 'hidden', @@ -128,12 +116,11 @@ libvulkan_broadcom = shared_library( 'vulkan_broadcom', [libv3dv_files, v3dv_entrypoints, sha1_h], include_directories : [ - inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util, inc_vulkan_wsi, + inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom, inc_compiler, inc_util, ], link_with : [ libbroadcom_cle, libbroadcom_v3d, - libvulkan_wsi, per_version_libs, ], dependencies : 
v3dv_deps, @@ -162,7 +149,7 @@ broadcom_icd = custom_target( output : 'broadcom_icd.@0@.json'.format(host_machine.cpu()), command : [ prog_python, '@INPUT0@', - '--api-version', '1.0', '--xml', '@INPUT1@', + '--api-version', '1.1', '--xml', '@INPUT1@', '--lib-path', join_paths(get_option('prefix'), get_option('libdir'), 'libvulkan_broadcom.so'), '--out', '@OUTPUT@', diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_cmd_buffer.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_cmd_buffer.c index 7d19091c0d..ff914e0489 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -98,7 +98,7 @@ v3dv_CreateCommandPool(VkDevice _device, pool = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), VK_OBJECT_TYPE_COMMAND_POOL); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (pAllocator) pool->alloc = *pAllocator; @@ -122,7 +122,7 @@ cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer, * buffer reset that would reset the loader's dispatch table for the * command buffer, and any other relevant info from vk_object_base */ - const uint32_t base_size = sizeof(struct vk_object_base); + const uint32_t base_size = sizeof(struct vk_command_buffer); uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size; memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size); @@ -150,12 +150,20 @@ cmd_buffer_create(struct v3dv_device *device, VkCommandBuffer *pCommandBuffer) { struct v3dv_cmd_buffer *cmd_buffer; - cmd_buffer = vk_object_zalloc(&device->vk, - &pool->alloc, - sizeof(*cmd_buffer), - VK_OBJECT_TYPE_COMMAND_BUFFER); + cmd_buffer = vk_zalloc2(&device->vk.alloc, + &pool->alloc, + sizeof(*cmd_buffer), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cmd_buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result; + result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer); + return result; + } cmd_buffer_init(cmd_buffer, device, pool, level); @@ -340,7 +348,9 @@ cmd_buffer_destroy(struct v3dv_cmd_buffer *cmd_buffer) { list_del(&cmd_buffer->pool_link); cmd_buffer_free_resources(cmd_buffer); - vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer); + vk_command_buffer_finish(&cmd_buffer->vk); + vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc, + cmd_buffer); } static bool @@ -787,7 +797,7 @@ v3dv_job_init(struct v3dv_job *job, v3dv_cl_init(job, &job->indirect); - if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH) + if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)) job->always_flush = true; } @@ -864,6 +874,7 @@ static VkResult cmd_buffer_reset(struct v3dv_cmd_buffer *cmd_buffer, VkCommandBufferResetFlags flags) { + vk_command_buffer_reset(&cmd_buffer->vk); if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) { struct v3dv_device *device = cmd_buffer->device; struct v3dv_cmd_pool *pool = cmd_buffer->pool; @@ -1015,26 +1026,28 @@ cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer) .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2_KHR, .srcSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, - src_iview->base_level, - src_iview->first_layer, - src_iview->last_layer - src_iview->first_layer + 1, + src_iview->vk.base_mip_level, + src_iview->vk.base_array_layer, + src_iview->vk.layer_count, }, .srcOffset 
= { 0, 0, 0 }, .dstSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, - dst_iview->base_level, - dst_iview->first_layer, - dst_iview->last_layer - dst_iview->first_layer + 1, + dst_iview->vk.base_mip_level, + dst_iview->vk.base_array_layer, + dst_iview->vk.layer_count, }, .dstOffset = { 0, 0, 0 }, - .extent = src_iview->image->extent, + .extent = src_iview->vk.image->extent, }; + struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image; + struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image; VkResolveImageInfo2KHR resolve_info = { .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2_KHR, - .srcImage = v3dv_image_to_handle(src_iview->image), + .srcImage = v3dv_image_to_handle(src_image), .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL, - .dstImage = v3dv_image_to_handle(dst_iview->image), + .dstImage = v3dv_image_to_handle(dst_image), .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL, .regionCount = 1, .pRegions = &region, diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_descriptor_set.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_descriptor_set.c index c1f9a7a815..14a93cea45 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_descriptor_set.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_descriptor_set.c @@ -240,7 +240,7 @@ v3dv_descriptor_map_get_texture_format(struct v3dv_descriptor_state *descriptor_ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: assert(descriptor->image_view); - *out_vk_format = descriptor->image_view->vk_format; + *out_vk_format = descriptor->image_view->vk.format; return descriptor->image_view->format; default: unreachable("descriptor type doesn't has a texture format"); @@ -266,9 +266,12 @@ v3dv_descriptor_map_get_texture_bo(struct v3dv_descriptor_state *descriptor_stat case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { assert(descriptor->image_view); - return descriptor->image_view->image->mem->bo; + struct v3dv_image *image = + (struct v3dv_image *) descriptor->image_view->vk.image; + return image->mem->bo; + } default: unreachable("descriptor type doesn't has a texture bo"); } @@ -323,7 +326,7 @@ v3dv_CreatePipelineLayout(VkDevice _device, layout = vk_object_zalloc(&device->vk, pAllocator, sizeof(*layout), VK_OBJECT_TYPE_PIPELINE_LAYOUT); if (layout == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; @@ -430,7 +433,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, VK_OBJECT_TYPE_DESCRIPTOR_POOL); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { pool->host_memory_base = (uint8_t*)pool + sizeof(struct v3dv_descriptor_pool); @@ -460,7 +463,7 @@ v3dv_CreateDescriptorPool(VkDevice _device, out_of_device_memory: vk_object_free(&device->vk, pAllocator, pool); - return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); } static void @@ -580,7 +583,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT); if (!set_layout) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* We just allocate all the immutable
samplers at the end of the struct */ struct v3dv_sampler *samplers = (void*) &set_layout->binding[num_bindings]; @@ -592,7 +595,7 @@ v3dv_CreateDescriptorSetLayout(VkDevice _device, pCreateInfo->bindingCount, &bindings); if (result != VK_SUCCESS) { vk_object_free(&device->vk, pAllocator, set_layout); - return vk_error(device->instance, result); + return vk_error(device, result); } memset(set_layout->binding, 0, @@ -694,7 +697,7 @@ out_of_pool_memory(const struct v3dv_device *device, * by allocating a new pool, so they don't point to real issues. */ if (!pool->is_driver_internal) - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); else return VK_ERROR_OUT_OF_POOL_MEMORY; } @@ -723,7 +726,7 @@ descriptor_set_create(struct v3dv_device *device, VK_OBJECT_TYPE_DESCRIPTOR_SET); if (!set) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } set->pool = pool; @@ -918,7 +921,7 @@ write_image_descriptor(struct v3dv_device *device, if (iview) { const uint32_t tex_state_index = - iview->type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY || + iview->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY || desc_type != VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ? 0 : 1; memcpy(desc_map, iview->texture_shader_state[tex_state_index], @@ -1139,7 +1142,7 @@ v3dv_CreateDescriptorUpdateTemplate( template = vk_object_alloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); if (template == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); template->bind_point = pCreateInfo->pipelineBindPoint; diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_device.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_device.c index bcd5d23920..5c36b50ff6 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_device.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_device.c @@ -66,7 +66,7 @@ #include "drm-uapi/i915_drm.h" #endif -#define V3DV_API_VERSION VK_MAKE_VERSION(1, 0, VK_HEADER_VERSION) +#define V3DV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION) VKAPI_ATTR VkResult VKAPI_CALL v3dv_EnumerateInstanceVersion(uint32_t *pApiVersion) @@ -94,6 +94,7 @@ static const struct vk_instance_extension_table instance_extensions = { #ifdef V3DV_HAS_SURFACE .KHR_get_surface_capabilities2 = true, .KHR_surface = true, + .KHR_surface_protected_capabilities = true, #endif #ifdef VK_USE_PLATFORM_WAYLAND_KHR .KHR_wayland_surface = true, @@ -136,17 +137,21 @@ get_device_extensions(const struct v3dv_physical_device *device, .KHR_uniform_buffer_standard_layout = true, #ifdef V3DV_HAS_SURFACE .KHR_swapchain = true, + .KHR_swapchain_mutable_format = true, .KHR_incremental_present = true, #endif .KHR_variable_pointers = true, .EXT_color_write_enable = true, .EXT_custom_border_color = true, .EXT_external_memory_dma_buf = true, + .EXT_host_query_reset = true, .EXT_index_type_uint8 = true, .EXT_physical_device_drm = true, .EXT_pipeline_creation_cache_control = true, + .EXT_pipeline_creation_feedback = true, .EXT_private_data = true, .EXT_provoking_vertex = true, + .EXT_vertex_attribute_divisor = true, }; } @@ -184,6 +189,8 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, struct vk_instance_dispatch_table dispatch_table; vk_instance_dispatch_table_from_entrypoints( &dispatch_table, &v3dv_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); result =
vk_instance_init(&instance->vk, &instance_extensions, @@ -192,7 +199,7 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, if (result != VK_SUCCESS) { vk_free(pAllocator, instance); - return vk_error(instance, result); + return vk_error(NULL, result); } v3d_process_debug_variable(); @@ -612,14 +619,14 @@ init_uuids(struct v3dv_physical_device *device) const struct build_id_note *note = build_id_find_nhdr_for_addr(init_uuids); if (!note) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "Failed to find build-id"); } unsigned build_id_len = build_id_length(note); if (build_id_len < 20) { - return vk_errorf((struct v3dv_instance*) device->vk.instance, + return vk_errorf(device->vk.instance, VK_ERROR_INITIALIZATION_FAILED, "build-id too short. It needs to be a SHA"); } @@ -689,6 +696,8 @@ physical_device_init(struct v3dv_physical_device *device, struct vk_physical_device_dispatch_table dispatch_table; vk_physical_device_dispatch_table_from_entrypoints (&dispatch_table, &v3dv_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); result = vk_physical_device_init(&device->vk, &instance->vk, NULL, &dispatch_table); @@ -725,8 +734,7 @@ physical_device_init(struct v3dv_physical_device *device, device->has_primary = primary_path; if (device->has_primary) { if (stat(primary_path, &primary_stat) != 0) { - result = vk_errorf(instance, - VK_ERROR_INITIALIZATION_FAILED, + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "failed to stat DRM primary node %s", primary_path); goto fail; @@ -736,8 +744,7 @@ physical_device_init(struct v3dv_physical_device *device, } if (fstat(render_fd, &render_stat) != 0) { - result = vk_errorf(instance, - VK_ERROR_INITIALIZATION_FAILED, + result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "failed to stat DRM render node %s", path); goto fail; @@ -1124,6 +1131,22 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features = + (void *) ext; + features->vertexAttributeInstanceRateDivisor = true; + features->vertexAttributeInstanceRateZeroDivisor = false; + break; + } + + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES: { + VkPhysicalDeviceHostQueryResetFeatures *features = + (void *) ext; + + features->hostQueryReset = true; + break; + } + /* Vulkan 1.1 */ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: { VkPhysicalDeviceVulkan11Features *features = @@ -1229,7 +1252,12 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev) return devid; #else - return dev->devinfo.ver; + switch (dev->devinfo.ver) { + case 42: + return 0xBE485FD3; /* Broadcom deviceID for 2711 */ + default: + unreachable("Unsupported V3D version"); + } #endif } @@ -1251,7 +1279,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice, const uint32_t v3d_coord_shift = 6; - const uint32_t v3d_point_line_granularity = 2.0f / (1 << v3d_coord_shift); + const float v3d_point_line_granularity = 2.0f / (1 << v3d_coord_shift); const uint32_t max_fb_size = 4096; const VkSampleCountFlags supported_sample_counts = @@ -1432,6 +1460,12 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, props->transformFeedbackPreservesTriangleFanProvokingVertex = false; break; } + case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: { + VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props = + (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext; + props->maxVertexAttribDivisor = 0xffff; + break; + } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: { VkPhysicalDeviceIDProperties *id_props = (VkPhysicalDeviceIDProperties *)ext; @@ -1650,16 +1684,19 @@ v3dv_EnumerateDeviceLayerProperties(VkPhysicalDevice physicalDevice, return VK_SUCCESS; } - return vk_error((struct v3dv_instance*) physical_device->vk.instance, - VK_ERROR_LAYER_NOT_PRESENT); + return vk_error(physical_device, VK_ERROR_LAYER_NOT_PRESENT); } static VkResult -queue_init(struct v3dv_device *device, struct v3dv_queue *queue) +queue_init(struct v3dv_device *device, struct v3dv_queue *queue, + const VkDeviceQueueCreateInfo *create_info, + uint32_t index_in_family) { - vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, + index_in_family); + if (result != VK_SUCCESS) + return result; queue->device = device; - queue->flags = 0; queue->noop_job = NULL; list_inithead(&queue->submit_wait_list); pthread_mutex_init(&queue->mutex, NULL); @@ -1669,7 +1706,7 @@ queue_init(struct v3dv_device *device, struct v3dv_queue *queue) static void queue_finish(struct v3dv_queue *queue) { - vk_object_base_finish(&queue->base); + vk_queue_finish(&queue->vk); assert(list_is_empty(&queue->submit_wait_list)); if (queue->noop_job) v3dv_job_destroy(queue->noop_job); @@ -1707,19 +1744,6 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO); - /* Check enabled features */ - if (pCreateInfo->pEnabledFeatures) { - VkPhysicalDeviceFeatures supported_features; - v3dv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features); - VkBool32 *supported_feature = (VkBool32 *)&supported_features; - VkBool32 *enabled_feature = (VkBool32 *)pCreateInfo->pEnabledFeatures; - unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32); - for (uint32_t i = 0; i < num_features; i++) { - if (enabled_feature[i] && !supported_feature[i]) - return vk_error(instance, VK_ERROR_FEATURE_NOT_PRESENT); - } - } - /* Check requested queues (we only expose one queue ) */ assert(pCreateInfo->queueCreateInfoCount == 1); for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { @@ -1738,11 +1762,13 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, struct vk_device_dispatch_table dispatch_table; vk_device_dispatch_table_from_entrypoints(&dispatch_table, &v3dv_device_entrypoints, true); + vk_device_dispatch_table_from_entrypoints(&dispatch_table, + &wsi_device_entrypoints, false); result = vk_device_init(&device->vk, &physical_device->vk, &dispatch_table, pCreateInfo, pAllocator); if (result != VK_SUCCESS) { vk_free(&device->vk.alloc, device); - return vk_error(instance, result); + return vk_error(NULL, result); } device->instance = instance; @@ -1755,20 +1781,31 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, pthread_mutex_init(&device->mutex, NULL); - result = queue_init(device, &device->queue); + result = queue_init(device, &device->queue, + pCreateInfo->pQueueCreateInfos, 0); if (result != VK_SUCCESS) goto fail; device->devinfo = physical_device->devinfo; - if (pCreateInfo->pEnabledFeatures) { + /* Vulkan 1.1 and VK_KHR_get_physical_device_properties2 added + * VkPhysicalDeviceFeatures2 which can be used in the pNext chain of + * 
VkDeviceCreateInfo, in which case it should be used instead of + * pEnabledFeatures. + */ + const VkPhysicalDeviceFeatures2 *features2 = + vk_find_struct_const(pCreateInfo->pNext, PHYSICAL_DEVICE_FEATURES_2); + if (features2) { + memcpy(&device->features, &features2->features, + sizeof(device->features)); + } else if (pCreateInfo->pEnabledFeatures) { memcpy(&device->features, pCreateInfo->pEnabledFeatures, sizeof(device->features)); - - if (device->features.robustBufferAccess) - perf_debug("Device created with Robust Buffer Access enabled.\n"); } + if (device->features.robustBufferAccess) + perf_debug("Device created with Robust Buffer Access enabled.\n"); + int ret = drmSyncobjCreate(physical_device->render_fd, DRM_SYNCOBJ_CREATE_SIGNALED, &device->last_job_sync); @@ -1825,20 +1862,6 @@ v3dv_DestroyDevice(VkDevice _device, vk_free2(&device->vk.alloc, pAllocator, device); } -VKAPI_ATTR void VKAPI_CALL -v3dv_GetDeviceQueue(VkDevice _device, - uint32_t queueFamilyIndex, - uint32_t queueIndex, - VkQueue *pQueue) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - - assert(queueIndex == 0); - assert(queueFamilyIndex == 0); - - *pQueue = v3dv_queue_to_handle(&device->queue); -} - VKAPI_ATTR VkResult VKAPI_CALL v3dv_DeviceWaitIdle(VkDevice _device) { @@ -2135,7 +2158,7 @@ v3dv_AllocateMemory(VkDevice _device, if (result != VK_SUCCESS) { vk_object_free(&device->vk, pAllocator, mem); - return vk_error(device->instance, result); + return vk_error(device, result); } *pMem = v3dv_device_memory_to_handle(mem); @@ -2186,7 +2209,7 @@ v3dv_MapMemory(VkDevice _device, */ VkResult result = device_map(device, mem); if (result != VK_SUCCESS) - return vk_error(device->instance, result); + return vk_error(device, result); *ppData = ((uint8_t *) mem->bo->map) + offset; return VK_SUCCESS; @@ -2239,8 +2262,8 @@ v3dv_GetImageMemoryRequirements2(VkDevice device, case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: { VkMemoryDedicatedRequirements *req = (VkMemoryDedicatedRequirements *) ext; - req->requiresDedicatedAllocation = image->external; - req->prefersDedicatedAllocation = image->external; + req->requiresDedicatedAllocation = image->vk.external_handle_types != 0; + req->prefersDedicatedAllocation = image->vk.external_handle_types != 0; break; } default: @@ -2375,7 +2398,7 @@ v3dv_CreateBuffer(VkDevice _device, buffer = vk_object_zalloc(&device->vk, pAllocator, sizeof(*buffer), VK_OBJECT_TYPE_BUFFER); if (buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); buffer->size = pCreateInfo->size; buffer->usage = pCreateInfo->usage; @@ -2421,7 +2444,7 @@ v3dv_CreateFramebuffer(VkDevice _device, framebuffer = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_FRAMEBUFFER); if (framebuffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); framebuffer->width = pCreateInfo->width; framebuffer->height = pCreateInfo->height; @@ -2433,7 +2456,7 @@ for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) { framebuffer->attachments[i] = v3dv_image_view_from_handle(pCreateInfo->pAttachments[i]); - if (framebuffer->attachments[i]->aspects & VK_IMAGE_ASPECT_COLOR_BIT) + if (framebuffer->attachments[i]->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) framebuffer->color_attachment_count++; } @@ -2471,7 +2494,7 @@ v3dv_GetMemoryFdPropertiesKHR(VkDevice _device, (1 << pdevice->memory.memoryTypeCount) - 1; return
VK_SUCCESS; default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } @@ -2492,7 +2515,7 @@ v3dv_GetMemoryFdKHR(VkDevice _device, mem->bo->handle, DRM_CLOEXEC, &fd); if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); *pFd = fd; @@ -2510,7 +2533,7 @@ v3dv_CreateEvent(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(*event), VK_OBJECT_TYPE_EVENT); if (!event) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* Events are created in the unsignaled state */ event->state = false; @@ -2570,7 +2593,7 @@ v3dv_CreateSampler(VkDevice _device, sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler), VK_OBJECT_TYPE_SAMPLER); if (!sampler) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); sampler->compare_enable = pCreateInfo->compareEnable; sampler->unnormalized_coordinates = pCreateInfo->unnormalizedCoordinates; diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_image.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_image.c index a7662fc74c..5f5ef742a5 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_image.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_image.c @@ -76,9 +76,9 @@ v3d_setup_slices(struct v3dv_image *image) { assert(image->cpp > 0); - uint32_t width = image->extent.width; - uint32_t height = image->extent.height; - uint32_t depth = image->extent.depth; + uint32_t width = image->vk.extent.width; + uint32_t height = image->vk.extent.height; + uint32_t depth = image->vk.extent.depth; /* Note that power-of-two padding is based on level 1. These are not * equivalent to just util_next_power_of_two(dimension), because at a @@ -94,21 +94,21 @@ v3d_setup_slices(struct v3dv_image *image) uint32_t uif_block_w = utile_w * 2; uint32_t uif_block_h = utile_h * 2; - uint32_t block_width = vk_format_get_blockwidth(image->vk_format); - uint32_t block_height = vk_format_get_blockheight(image->vk_format); + uint32_t block_width = vk_format_get_blockwidth(image->vk.format); + uint32_t block_height = vk_format_get_blockheight(image->vk.format); - assert(image->samples == VK_SAMPLE_COUNT_1_BIT || - image->samples == VK_SAMPLE_COUNT_4_BIT); - bool msaa = image->samples != VK_SAMPLE_COUNT_1_BIT; + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || + image->vk.samples == VK_SAMPLE_COUNT_4_BIT); + bool msaa = image->vk.samples != VK_SAMPLE_COUNT_1_BIT; bool uif_top = msaa; - assert(image->array_size > 0); + assert(image->vk.array_layers > 0); assert(depth > 0); - assert(image->levels >= 1); + assert(image->vk.mip_levels >= 1); uint32_t offset = 0; - for (int32_t i = image->levels - 1; i >= 0; i--) { + for (int32_t i = image->vk.mip_levels - 1; i >= 0; i--) { struct v3d_resource_slice *slice = &image->slices[i]; uint32_t level_width, level_height, level_depth; @@ -135,7 +135,7 @@ v3d_setup_slices(struct v3dv_image *image) if (!image->tiled) { slice->tiling = V3D_TILING_RASTER; - if (image->type == VK_IMAGE_TYPE_1D) + if (image->vk.image_type == VK_IMAGE_TYPE_1D) level_width = align(level_width, 64 / image->cpp); } else { if ((i != 0 || !uif_top) && @@ -210,13 +210,12 @@ v3d_setup_slices(struct v3dv_image *image) * * We additionally align to 4k, which improves UIF XOR performance. */ - image->alignment = - image->tiling == VK_IMAGE_TILING_LINEAR ? 
image->cpp : 4096; + image->alignment = image->tiled ? 4096 : image->cpp; uint32_t align_offset = align(image->slices[0].offset, image->alignment) - image->slices[0].offset; if (align_offset) { image->size += align_offset; - for (int i = 0; i < image->levels; i++) + for (int i = 0; i < image->vk.mip_levels; i++) image->slices[i].offset += align_offset; } @@ -224,10 +223,10 @@ v3d_setup_slices(struct v3dv_image *image) * one full mipmap tree to the next (64b aligned). For 3D textures, * we need to program the stride between slices of miplevel 0. */ - if (image->type != VK_IMAGE_TYPE_3D) { + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { image->cube_map_stride = align(image->slices[0].offset + image->slices[0].size, 64); - image->size += image->cube_map_stride * (image->array_size - 1); + image->size += image->cube_map_stride * (image->vk.array_layers - 1); } else { image->cube_map_stride = image->slices[0].size; } @@ -238,7 +237,7 @@ v3dv_layer_offset(const struct v3dv_image *image, uint32_t level, uint32_t layer { const struct v3d_resource_slice *slice = &image->slices[level]; - if (image->type == VK_IMAGE_TYPE_3D) + if (image->vk.image_type == VK_IMAGE_TYPE_3D) return image->mem_offset + slice->offset + layer * slice->size; else return image->mem_offset + slice->offset + layer * image->cube_map_stride; @@ -252,14 +251,9 @@ create_image(struct v3dv_device *device, { struct v3dv_image *image = NULL; - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); - - v3dv_assert(pCreateInfo->mipLevels > 0); - v3dv_assert(pCreateInfo->arrayLayers > 0); - v3dv_assert(pCreateInfo->samples > 0); - v3dv_assert(pCreateInfo->extent.width > 0); - v3dv_assert(pCreateInfo->extent.height > 0); - v3dv_assert(pCreateInfo->extent.depth > 0); + image = vk_image_create(&device->vk, pCreateInfo, pAllocator, sizeof(*image)); + if (image == NULL) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); /* When using the simulator the WSI common code will see that our * driver wsi device doesn't match the display device and because of that @@ -270,8 +264,9 @@ create_image(struct v3dv_device *device, * As a result, on that path, swapchain images do not have any special * requirements and are not created with the pNext structs below. 
*/ + VkImageTiling tiling = pCreateInfo->tiling; uint64_t modifier = DRM_FORMAT_MOD_INVALID; - if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { + if (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { const VkImageDrmFormatModifierListCreateInfoEXT *mod_info = vk_find_struct_const(pCreateInfo->pNext, IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT); @@ -297,55 +292,32 @@ create_image(struct v3dv_device *device, } assert(modifier == DRM_FORMAT_MOD_LINEAR || modifier == DRM_FORMAT_MOD_BROADCOM_UIF); - } else { - const struct wsi_image_create_info *wsi_info = - vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA); - if (wsi_info && wsi_info->scanout) - modifier = DRM_FORMAT_MOD_LINEAR; + } else if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D || + image->vk.wsi_legacy_scanout) { + tiling = VK_IMAGE_TILING_LINEAR; } - const VkExternalMemoryImageCreateInfo *external_info = - vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); - - /* 1D and 1D_ARRAY textures are always raster-order */ - VkImageTiling tiling; - if (pCreateInfo->imageType == VK_IMAGE_TYPE_1D) - tiling = VK_IMAGE_TILING_LINEAR; - else if (modifier == DRM_FORMAT_MOD_INVALID) - tiling = pCreateInfo->tiling; - else if (modifier == DRM_FORMAT_MOD_BROADCOM_UIF) - tiling = VK_IMAGE_TILING_OPTIMAL; - else - tiling = VK_IMAGE_TILING_LINEAR; - - const struct v3dv_format *format = v3dv_X(device, get_format)(pCreateInfo->format); + const struct v3dv_format *format = + v3dv_X(device, get_format)(pCreateInfo->format); v3dv_assert(format != NULL && format->supported); - image = vk_object_zalloc(&device->vk, pAllocator, sizeof(*image), - VK_OBJECT_TYPE_IMAGE); - if (!image) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); - assert(pCreateInfo->samples == VK_SAMPLE_COUNT_1_BIT || pCreateInfo->samples == VK_SAMPLE_COUNT_4_BIT); - image->type = pCreateInfo->imageType; - image->extent = pCreateInfo->extent; - image->vk_format = pCreateInfo->format; image->format = format; - image->aspects = vk_format_aspects(image->vk_format); - image->levels = pCreateInfo->mipLevels; - image->array_size = pCreateInfo->arrayLayers; - image->samples = pCreateInfo->samples; - image->usage = pCreateInfo->usage; - image->flags = pCreateInfo->flags; + image->cpp = vk_format_get_blocksize(image->vk.format); + image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL || + (tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT && + modifier != DRM_FORMAT_MOD_LINEAR); - image->drm_format_mod = modifier; - image->tiling = tiling; - image->tiled = tiling == VK_IMAGE_TILING_OPTIMAL; - image->external = external_info != NULL; + image->vk.tiling = tiling; + image->vk.drm_format_mod = modifier; - image->cpp = vk_format_get_blocksize(image->vk_format); + /* Our meta paths can create image views with compatible formats for any + * image, so always set this flag to keep the common Vulkan image code + * happy. + */ + image->vk.create_flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; v3d_setup_slices(image); @@ -376,26 +348,26 @@ create_image_from_swapchain(struct v3dv_device *device, * #swapchain-wsi-image-create-info . 
*/ assert(local_create_info.tiling == VK_IMAGE_TILING_OPTIMAL); - local_create_info.tiling = swapchain_image->tiling; + local_create_info.tiling = swapchain_image->vk.tiling; VkImageDrmFormatModifierListCreateInfoEXT local_modifier_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT, .drmFormatModifierCount = 1, - .pDrmFormatModifiers = &swapchain_image->drm_format_mod, + .pDrmFormatModifiers = &swapchain_image->vk.drm_format_mod, }; - if (swapchain_image->drm_format_mod != DRM_FORMAT_MOD_INVALID) + if (swapchain_image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID) __vk_append_struct(&local_create_info, &local_modifier_info); - assert(swapchain_image->type == local_create_info.imageType); - assert(swapchain_image->vk_format == local_create_info.format); - assert(swapchain_image->extent.width == local_create_info.extent.width); - assert(swapchain_image->extent.height == local_create_info.extent.height); - assert(swapchain_image->extent.depth == local_create_info.extent.depth); - assert(swapchain_image->array_size == local_create_info.arrayLayers); - assert(swapchain_image->samples == local_create_info.samples); - assert(swapchain_image->tiling == local_create_info.tiling); - assert((swapchain_image->usage & local_create_info.usage) == + assert(swapchain_image->vk.image_type == local_create_info.imageType); + assert(swapchain_image->vk.format == local_create_info.format); + assert(swapchain_image->vk.extent.width == local_create_info.extent.width); + assert(swapchain_image->vk.extent.height == local_create_info.extent.height); + assert(swapchain_image->vk.extent.depth == local_create_info.extent.depth); + assert(swapchain_image->vk.array_layers == local_create_info.arrayLayers); + assert(swapchain_image->vk.samples == local_create_info.samples); + assert(swapchain_image->vk.tiling == local_create_info.tiling); + assert((swapchain_image->vk.usage & local_create_info.usage) == local_create_info.usage); return create_image(device, &local_create_info, pAllocator, pImage); @@ -434,7 +406,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device, layout->depthPitch = image->cube_map_stride; layout->arrayPitch = image->cube_map_stride; - if (image->type != VK_IMAGE_TYPE_3D) { + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { layout->size = slice->size; } else { /* For 3D images, the size of the slice represents the size of a 2D slice @@ -444,7 +416,7 @@ v3dv_GetImageSubresourceLayout(VkDevice device, * arranged in memory from last to first). 
*/ if (subresource->mipLevel == 0) { - layout->size = slice->size * image->extent.depth; + layout->size = slice->size * image->vk.extent.depth; } else { const struct v3d_resource_slice *prev_slice = &image->slices[subresource->mipLevel - 1]; @@ -453,22 +425,6 @@ v3dv_GetImageSubresourceLayout(VkDevice device, } } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetImageDrmFormatModifierPropertiesEXT( - VkDevice device, - VkImage _image, - VkImageDrmFormatModifierPropertiesEXT *pProperties) -{ - V3DV_FROM_HANDLE(v3dv_image, image, _image); - - assert(pProperties->sType == - VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT); - - pProperties->drmFormatModifier = image->drm_format_mod; - - return VK_SUCCESS; -} - VKAPI_ATTR void VKAPI_CALL v3dv_DestroyImage(VkDevice _device, VkImage _image, @@ -480,7 +436,7 @@ v3dv_DestroyImage(VkDevice _device, if (image == NULL) return; - vk_object_free(&device->vk, pAllocator, image); + vk_image_destroy(&device->vk, pAllocator, &image->vk); } VkImageViewType @@ -495,31 +451,6 @@ v3dv_image_type_to_view_type(VkImageType type) } } -static enum pipe_swizzle -vk_component_mapping_to_pipe_swizzle(VkComponentSwizzle comp, - VkComponentSwizzle swz) -{ - if (swz == VK_COMPONENT_SWIZZLE_IDENTITY) - swz = comp; - - switch (swz) { - case VK_COMPONENT_SWIZZLE_ZERO: - return PIPE_SWIZZLE_0; - case VK_COMPONENT_SWIZZLE_ONE: - return PIPE_SWIZZLE_1; - case VK_COMPONENT_SWIZZLE_R: - return PIPE_SWIZZLE_X; - case VK_COMPONENT_SWIZZLE_G: - return PIPE_SWIZZLE_Y; - case VK_COMPONENT_SWIZZLE_B: - return PIPE_SWIZZLE_Z; - case VK_COMPONENT_SWIZZLE_A: - return PIPE_SWIZZLE_W; - default: - unreachable("Unknown VkComponentSwizzle"); - }; -} - VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo, @@ -530,56 +461,15 @@ v3dv_CreateImageView(VkDevice _device, V3DV_FROM_HANDLE(v3dv_image, image, pCreateInfo->image); struct v3dv_image_view *iview; - iview = vk_object_zalloc(&device->vk, pAllocator, sizeof(*iview), - VK_OBJECT_TYPE_IMAGE_VIEW); + iview = vk_image_view_create(&device->vk, pCreateInfo, pAllocator, + sizeof(*iview)); if (iview == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; - assert(range->layerCount > 0); - assert(range->baseMipLevel < image->levels); - -#ifdef DEBUG - switch (image->type) { - case VK_IMAGE_TYPE_1D: - case VK_IMAGE_TYPE_2D: - assert(range->baseArrayLayer + v3dv_layer_count(image, range) - 1 <= - image->array_size); - break; - case VK_IMAGE_TYPE_3D: - assert(range->baseArrayLayer + v3dv_layer_count(image, range) - 1 - <= u_minify(image->extent.depth, range->baseMipLevel)); - /* VK_KHR_maintenance1 */ - assert(pCreateInfo->viewType != VK_IMAGE_VIEW_TYPE_2D || - ((image->flags & VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT) && - range->levelCount == 1 && range->layerCount == 1)); - assert(pCreateInfo->viewType != VK_IMAGE_VIEW_TYPE_2D_ARRAY || - ((image->flags & VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT) && - range->levelCount == 1)); - break; - default: - unreachable("bad VkImageType"); - } -#endif - - iview->image = image; - iview->aspects = range->aspectMask; - iview->type = pCreateInfo->viewType; - - iview->base_level = range->baseMipLevel; - iview->max_level = iview->base_level + v3dv_level_count(image, range) - 1; - iview->extent = (VkExtent3D) { - .width = u_minify(image->extent.width , iview->base_level), - .height = u_minify(image->extent.height, 
iview->base_level), - .depth = u_minify(image->extent.depth , iview->base_level), - }; - - iview->first_layer = range->baseArrayLayer; - iview->last_layer = range->baseArrayLayer + - v3dv_layer_count(image, range) - 1; - iview->offset = - v3dv_layer_offset(image, iview->base_level, iview->first_layer); + iview->offset = v3dv_layer_offset(image, iview->vk.base_mip_level, + iview->vk.base_array_layer); /* If we have D24S8 format but the view only selects the stencil aspect * we want to re-interpret the format as RGBA8_UINT, then map our stencil @@ -602,26 +492,17 @@ v3dv_CreateImageView(VkDevice _device, * util_format_compose_swizzles. Would be good to check if it would be * better to reimplement the latter using vk component */ - image_view_swizzle[0] = - vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_R, - pCreateInfo->components.r); - image_view_swizzle[1] = - vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_G, - pCreateInfo->components.g); - image_view_swizzle[2] = - vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_B, - pCreateInfo->components.b); - image_view_swizzle[3] = - vk_component_mapping_to_pipe_swizzle(VK_COMPONENT_SWIZZLE_A, - pCreateInfo->components.a); + vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, + image_view_swizzle); } - iview->vk_format = format; + iview->vk.format = format; iview->format = v3dv_X(device, get_format)(format); assert(iview->format && iview->format->supported); - if (vk_format_is_depth_or_stencil(iview->vk_format)) { - iview->internal_type = v3dv_X(device, get_internal_depth_type)(iview->vk_format); + if (vk_format_is_depth_or_stencil(iview->vk.format)) { + iview->internal_type = + v3dv_X(device, get_internal_depth_type)(iview->vk.format); } else { v3dv_X(device, get_internal_type_bpp_for_output_format) (iview->format->rt_type, &iview->internal_type, &iview->internal_bpp); @@ -650,7 +531,7 @@ v3dv_DestroyImageView(VkDevice _device, if (image_view == NULL) return; - vk_object_free(&device->vk, pAllocator, image_view); + vk_image_view_destroy(&device->vk, pAllocator, &image_view->vk); } VKAPI_ATTR VkResult VKAPI_CALL @@ -668,7 +549,7 @@ v3dv_CreateBufferView(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(*view), VK_OBJECT_TYPE_BUFFER_VIEW); if (!view) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); uint32_t range; if (pCreateInfo->range == VK_WHOLE_SIZE) diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_clear.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_clear.c index f6d24d519f..5555c690bb 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_clear.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_clear.c @@ -79,7 +79,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, union v3dv_clear_value hw_clear_value = { 0 }; if (range->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { get_hw_clear_color(cmd_buffer->device, &clear_value->color, fb_format, - image->vk_format, internal_type, internal_bpp, + image->vk.format, internal_type, internal_bpp, &hw_clear_value.color[0]); } else { assert((range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) || @@ -88,9 +88,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, hw_clear_value.s = clear_value->depthStencil.stencil; } - uint32_t level_count = range->levelCount == VK_REMAINING_MIP_LEVELS ? 
- image->levels - range->baseMipLevel : - range->levelCount; + uint32_t level_count = vk_image_subresource_level_count(&image->vk, range); uint32_t min_level = range->baseMipLevel; uint32_t max_level = range->baseMipLevel + level_count; @@ -100,23 +98,21 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, */ uint32_t min_layer; uint32_t max_layer; - if (image->type != VK_IMAGE_TYPE_3D) { - uint32_t layer_count = range->layerCount == VK_REMAINING_ARRAY_LAYERS ? - image->array_size - range->baseArrayLayer : - range->layerCount; + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { min_layer = range->baseArrayLayer; - max_layer = range->baseArrayLayer + layer_count; + max_layer = range->baseArrayLayer + + vk_image_subresource_layer_count(&image->vk, range); } else { min_layer = 0; max_layer = 0; } for (uint32_t level = min_level; level < max_level; level++) { - if (image->type == VK_IMAGE_TYPE_3D) - max_layer = u_minify(image->extent.depth, level); + if (image->vk.image_type == VK_IMAGE_TYPE_3D) + max_layer = u_minify(image->vk.extent.depth, level); - uint32_t width = u_minify(image->extent.width, level); - uint32_t height = u_minify(image->extent.height, level); + uint32_t width = u_minify(image->vk.extent.width, level); + uint32_t height = u_minify(image->vk.extent.height, level); struct v3dv_job *job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL); @@ -126,7 +122,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, v3dv_job_start_frame(job, width, height, max_layer, false, 1, internal_bpp, - image->samples > VK_SAMPLE_COUNT_1_BIT); + image->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -138,7 +134,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, /* If this triggers it is an application bug: the spec requires * that any aspects to clear are present in the image. */ - assert(range->aspectMask & image->aspects); + assert(range->aspectMask & image->vk.aspects); v3dv_X(job->device, meta_emit_clear_image_rcl) (job, image, &framebuffer, &hw_clear_value, diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_copy.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_copy.c index a2dfbb18ff..b7c29d0822 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_copy.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_meta_copy.c @@ -349,7 +349,7 @@ v3dv_meta_can_use_tlb(struct v3dv_image *image, if (image->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) { if (compat_format) - *compat_format = image->vk_format; + *compat_format = image->vk.format; return true; } @@ -357,7 +357,7 @@ v3dv_meta_can_use_tlb(struct v3dv_image *image, * a compatible format instead. 
*/ if (compat_format) { - *compat_format = get_compatible_tlb_format(image->vk_format); + *compat_format = get_compatible_tlb_format(image->vk.format); if (*compat_format != VK_FORMAT_UNDEFINED) return true; } @@ -391,7 +391,7 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, &internal_type, &internal_bpp); uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -403,8 +403,8 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy from compressed format using a compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk_format); - const uint32_t block_h = vk_format_get_blockheight(image->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); + const uint32_t block_h = vk_format_get_blockheight(image->vk.format); const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); @@ -493,10 +493,10 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, dst_format = VK_FORMAT_R8G8B8A8_UINT; break; case VK_IMAGE_ASPECT_DEPTH_BIT: - assert(image->vk_format == VK_FORMAT_D32_SFLOAT || - image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32); - if (image->vk_format == VK_FORMAT_D32_SFLOAT) { + assert(image->vk.format == VK_FORMAT_D32_SFLOAT || + image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || + image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32); + if (image->vk.format == VK_FORMAT_D32_SFLOAT) { src_format = VK_FORMAT_R32_UINT; dst_format = VK_FORMAT_R32_UINT; } else { @@ -518,7 +518,7 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, break; case VK_IMAGE_ASPECT_STENCIL_BIT: assert(copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT); - assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT); + assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT); /* Copying from S8D24. We want to write 8-bit stencil values only, * so adjust the buffer bpp for that. Since the hardware stores stencil * in the LSB, we can just do a RGBA8UI to R8UI blit. 
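A brief aside to make the trick above concrete (a sketch, not part of the patch; the helper name is hypothetical): because the hardware keeps stencil in the least significant byte of an S8D24 texel, reinterpreting the texel as four 8-bit channels leaves stencil in component R, which is exactly what the RGBA8UI-to-R8UI blit keeps.

/* Extract the stencil byte of an S8D24 texel, mirroring what the
 * RGBA8UI -> R8UI blit does in hardware. */
static inline uint8_t
stencil_from_s8d24(uint32_t texel)
{
   return (uint8_t)(texel & 0xff); /* stencil is stored in the LSB */
}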
@@ -572,14 +572,14 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, buf_height = region->bufferImageHeight; /* If the image is compressed, the bpp refers to blocks, not pixels */ - uint32_t block_width = vk_format_get_blockwidth(image->vk_format); - uint32_t block_height = vk_format_get_blockheight(image->vk_format); + uint32_t block_width = vk_format_get_blockwidth(image->vk.format); + uint32_t block_height = vk_format_get_blockheight(image->vk.format); buf_width = buf_width / block_width; buf_height = buf_height / block_height; /* Compute layers to copy */ uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -596,17 +596,17 @@ copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer, VkResult result; struct v3dv_device *device = cmd_buffer->device; VkDevice _device = v3dv_device_to_handle(device); - if (vk_format_is_compressed(image->vk_format)) { + if (vk_format_is_compressed(image->vk.format)) { VkImage uiview; VkImageCreateInfo uiview_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .imageType = VK_IMAGE_TYPE_3D, .format = dst_format, - .extent = { buf_width, buf_height, image->extent.depth }, - .mipLevels = image->levels, - .arrayLayers = image->array_size, - .samples = image->samples, - .tiling = image->tiling, + .extent = { buf_width, buf_height, image->vk.extent.depth }, + .mipLevels = image->vk.mip_levels, + .arrayLayers = image->vk.array_layers, + .samples = image->vk.samples, + .tiling = image->vk.tiling, .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, @@ -739,7 +739,7 @@ v3dv_CmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage); V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer); - assert(image->samples == VK_SAMPLE_COUNT_1_BIT); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); for (uint32_t i = 0; i < info->regionCount; i++) { if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, &info->pRegions[i])) @@ -761,14 +761,14 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, const VkImageCopy2KHR *region) { /* Destination can't be raster format */ - if (dst->tiling == VK_IMAGE_TILING_LINEAR) + if (dst->vk.tiling == VK_IMAGE_TILING_LINEAR) return false; /* We can only do full copies, so if the format is D24S8 both aspects need * to be copied. We only need to check the dst format because the spec * states that depth/stencil formats must match exactly. */ - if (dst->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) { const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (region->dstSubresource.aspectMask != ds_aspects) @@ -784,8 +784,8 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * checking against the region dimensions, which are in units of the source * image format. 
*/ - if (vk_format_is_compressed(dst->vk_format) != - vk_format_is_compressed(src->vk_format)) { + if (vk_format_is_compressed(dst->vk.format) != + vk_format_is_compressed(src->vk.format)) { return false; } @@ -798,8 +798,8 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, return false; const uint32_t dst_mip_level = region->dstSubresource.mipLevel; - uint32_t dst_width = u_minify(dst->extent.width, dst_mip_level); - uint32_t dst_height = u_minify(dst->extent.height, dst_mip_level); + uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level); + uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level); if (region->extent.width != dst_width || region->extent.height != dst_height) return false; @@ -809,15 +809,15 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * members represent the texel dimensions of the source image and not * the destination." */ - const uint32_t block_w = vk_format_get_blockwidth(src->vk_format); - const uint32_t block_h = vk_format_get_blockheight(src->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(src->vk.format); + const uint32_t block_h = vk_format_get_blockheight(src->vk.format); uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); /* Account for sample count */ - assert(dst->samples == src->samples); - if (dst->samples > VK_SAMPLE_COUNT_1_BIT) { - assert(dst->samples == VK_SAMPLE_COUNT_4_BIT); + assert(dst->vk.samples == src->vk.samples); + if (dst->vk.samples > VK_SAMPLE_COUNT_1_BIT) { + assert(dst->vk.samples == VK_SAMPLE_COUNT_4_BIT); width *= 2; height *= 2; } @@ -840,20 +840,41 @@ copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, dst->cpp, NULL); /* Emit a TFU job for each layer to blit */ - const uint32_t layer_count = dst->type != VK_IMAGE_TYPE_3D ? + const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ? region->dstSubresource.layerCount : region->extent.depth; const uint32_t src_mip_level = region->srcSubresource.mipLevel; - const uint32_t base_src_layer = src->type != VK_IMAGE_TYPE_3D ? + const uint32_t base_src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ? region->srcSubresource.baseArrayLayer : region->srcOffset.z; - const uint32_t base_dst_layer = dst->type != VK_IMAGE_TYPE_3D ? + const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? region->dstSubresource.baseArrayLayer : region->dstOffset.z; for (uint32_t i = 0; i < layer_count; i++) { - v3dv_X(cmd_buffer->device, meta_emit_tfu_job) - (cmd_buffer, dst, dst_mip_level, base_dst_layer + i, - src, src_mip_level, base_src_layer + i, - width, height, format); + const uint32_t dst_offset = + dst->mem->bo->offset + + v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i); + const uint32_t src_offset = + src->mem->bo->offset + + v3dv_layer_offset(src, src_mip_level, base_src_layer + i); + + const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level]; + const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level]; + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst->mem->bo->handle, + dst_offset, + dst_slice->tiling, + dst_slice->tiling == V3D_TILING_RASTER ? + dst_slice->stride : dst_slice->padded_height, + dst->cpp, + src->mem->bo->handle, + src_offset, + src_slice->tiling, + src_slice->tiling == V3D_TILING_RASTER ? 
+ src_slice->stride : src_slice->padded_height, + src->cpp, + width, height, format); } return true; @@ -894,12 +915,12 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, * srcSubresource (for non-3D) must match the number of slices of the * extent (for 3D) or layers of the dstSubresource (for non-3D)." */ - assert((src->type != VK_IMAGE_TYPE_3D ? + assert((src->vk.image_type != VK_IMAGE_TYPE_3D ? region->srcSubresource.layerCount : region->extent.depth) == - (dst->type != VK_IMAGE_TYPE_3D ? + (dst->vk.image_type != VK_IMAGE_TYPE_3D ? region->dstSubresource.layerCount : region->extent.depth)); uint32_t num_layers; - if (dst->type != VK_IMAGE_TYPE_3D) + if (dst->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->dstSubresource.layerCount; else num_layers = region->extent.depth; @@ -911,13 +932,13 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed image using compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); v3dv_job_start_frame(job, width, height, num_layers, false, 1, internal_bpp, - src->samples > VK_SAMPLE_COUNT_1_BIT); + src->vk.samples > VK_SAMPLE_COUNT_1_BIT); struct v3dv_meta_framebuffer framebuffer; v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format, @@ -956,18 +977,18 @@ create_image_alias(struct v3dv_cmd_buffer *cmd_buffer, VkImageCreateInfo info = { .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, - .imageType = src->type, + .imageType = src->vk.image_type, .format = format, .extent = { - .width = src->extent.width * width_scale, - .height = src->extent.height * height_scale, - .depth = src->extent.depth, + .width = src->vk.extent.width * width_scale, + .height = src->vk.extent.height * height_scale, + .depth = src->vk.extent.depth, }, - .mipLevels = src->levels, - .arrayLayers = src->array_size, - .samples = src->samples, - .tiling = src->tiling, - .usage = src->usage, + .mipLevels = src->vk.mip_levels, + .arrayLayers = src->vk.array_layers, + .samples = src->vk.samples, + .tiling = src->vk.tiling, + .usage = src->vk.usage, }; VkImage _image; @@ -994,10 +1015,10 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *src, const VkImageCopy2KHR *region) { - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format); - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format); + const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); + const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); + const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); const float block_scale_w = (float)src_block_w / (float)dst_block_w; const float block_scale_h = (float)src_block_h / (float)dst_block_h; @@ -1011,7 +1032,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, float src_scale_h = 1.0f; float dst_scale_w = block_scale_w; float dst_scale_h = block_scale_h; - if (vk_format_is_compressed(src->vk_format)) { + if 
(vk_format_is_compressed(src->vk.format)) { /* If we are copying from a compressed format we should be aware that we * are going to texture from the source image, and the texture setup * knows the actual size of the image, so we need to choose a format @@ -1062,7 +1083,7 @@ copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer, dst_scale_w, dst_scale_h, format); } else { format = src->format->rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ? - src->vk_format : get_compatible_tlb_format(src->vk_format); + src->vk.format : get_compatible_tlb_format(src->vk.format); if (format == VK_FORMAT_UNDEFINED) return false; @@ -1139,7 +1160,7 @@ v3dv_CmdCopyImage2KHR(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage); V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage); - assert(src->samples == dst->samples); + assert(src->vk.samples == dst->vk.samples); for (uint32_t i = 0; i < info->regionCount; i++) { if (copy_image_tfu(cmd_buffer, dst, src, &info->pRegions[i])) @@ -1260,10 +1281,10 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_buffer *buffer, const VkBufferImageCopy2KHR *region) { - assert(image->samples == VK_SAMPLE_COUNT_1_BIT); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* Destination can't be raster format */ - if (image->tiling == VK_IMAGE_TILING_LINEAR) + if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) return false; /* We can't copy D24S8 because buffer to image copies only copy one aspect @@ -1273,8 +1294,8 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, * is not a straight copy, we would have to swizzle the channels, which the * TFU can't do. */ - if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) { + if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || + image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) { return false; } @@ -1295,12 +1316,12 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, else height = region->bufferImageHeight; - if (width != image->extent.width || height != image->extent.height) + if (width != image->vk.extent.width || height != image->vk.extent.height) return false; /* Handle region semantics for compressed images */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk_format); - const uint32_t block_h = vk_format_get_blockheight(image->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); + const uint32_t block_h = vk_format_get_blockheight(image->vk.format); width = DIV_ROUND_UP(width, block_w); height = DIV_ROUND_UP(height, block_h); @@ -1317,7 +1338,7 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, const struct v3d_resource_slice *slice = &image->slices[mip_level]; uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -1333,51 +1354,33 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer, const uint32_t buffer_stride = width * image->cpp; for (int i = 0; i < num_layers; i++) { uint32_t layer; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) layer = region->imageSubresource.baseArrayLayer + i; else layer = region->imageOffset.z + i; - struct drm_v3d_submit_tfu tfu = { - .ios = (height << 16) | width, - .bo_handles = { - dst_bo->handle, - src_bo->handle != dst_bo->handle ? 
src_bo->handle : 0 - }, - }; - const uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset + height * buffer_stride * i; - const uint32_t src_offset = src_bo->offset + buffer_offset; - tfu.iia |= src_offset; - tfu.icfg |= V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT; - tfu.iis |= width; const uint32_t dst_offset = dst_bo->offset + v3dv_layer_offset(image, mip_level, layer); - tfu.ioa |= dst_offset; - tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + - (slice->tiling - V3D_TILING_LINEARTILE)) << - V3D_TFU_IOA_FORMAT_SHIFT; - tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; - - /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the - * OPAD field for the destination (how many extra UIF blocks beyond - * those necessary to cover the height). - */ - if (slice->tiling == V3D_TILING_UIF_NO_XOR || - slice->tiling == V3D_TILING_UIF_XOR) { - uint32_t uif_block_h = 2 * v3d_utile_height(image->cpp); - uint32_t implicit_padded_height = align(height, uif_block_h); - uint32_t icfg = - (slice->padded_height - implicit_padded_height) / uif_block_h; - tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; - } - - v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu); + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst_bo->handle, + dst_offset, + slice->tiling, + slice->tiling == V3D_TILING_RASTER ? + slice->stride : slice->padded_height, + image->cpp, + src_bo->handle, + src_offset, + V3D_TILING_RASTER, + width, + 1, + width, height, format); } return true; @@ -1403,7 +1406,7 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, &internal_type, &internal_bpp); uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -1415,8 +1418,8 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, return true; /* Handle copy to compressed format using a compatible format */ - const uint32_t block_w = vk_format_get_blockwidth(image->vk_format); - const uint32_t block_h = vk_format_get_blockheight(image->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(image->vk.format); + const uint32_t block_h = vk_format_get_blockheight(image->vk.format); const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h); @@ -1961,7 +1964,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, return handled; /* FIXME: we only handle uncompressed images for now. 
*/ - if (vk_format_is_compressed(image->vk_format)) + if (vk_format_is_compressed(image->vk.format)) return handled; const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT | @@ -1999,7 +2002,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, */ const VkImageSubresourceLayers *resource = &regions[0].imageSubresource; uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) { + if (image->vk.image_type != VK_IMAGE_TYPE_3D) { num_layers = resource->layerCount; } else { assert(region_count == 1); @@ -2011,7 +2014,7 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL; bool ok = get_copy_texel_buffer_pipeline(cmd_buffer->device, dst_format, cmask, cswizzle, - image->type, num_layers > 1, + image->vk.image_type, num_layers > 1, &pipeline); if (!ok) return handled; @@ -2087,12 +2090,12 @@ texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer, * For 3D images, this creates a layered framebuffer with a number of * layers matching the depth extent of the 3D image. */ - uint32_t fb_width = u_minify(image->extent.width, resource->mipLevel); - uint32_t fb_height = u_minify(image->extent.height, resource->mipLevel); + uint32_t fb_width = u_minify(image->vk.extent.width, resource->mipLevel); + uint32_t fb_height = u_minify(image->vk.extent.height, resource->mipLevel); VkImageViewCreateInfo image_view_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = v3dv_image_to_handle(image), - .viewType = v3dv_image_type_to_view_type(image->type), + .viewType = v3dv_image_type_to_view_type(image->vk.image_type), .format = dst_format, .subresourceRange = { .aspectMask = aspect, @@ -2287,7 +2290,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .imageType = VK_IMAGE_TYPE_2D, .format = src_format, - .extent = { image->extent.width, image->extent.height, 1 }, + .extent = { image->vk.extent.width, image->vk.extent.height, 1 }, .mipLevels = 1, .arrayLayers = 1, .samples = VK_SAMPLE_COUNT_1_BIT, @@ -2327,7 +2330,7 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, * image subresource so we can take this from the first region. */ uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = regions[0].imageSubresource.layerCount; else num_layers = regions[0].imageExtent.depth; @@ -2338,8 +2341,8 @@ copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer, */ assert(num_layers == 1 || region_count == 1); - const uint32_t block_width = vk_format_get_blockwidth(image->vk_format); - const uint32_t block_height = vk_format_get_blockheight(image->vk_format); + const uint32_t block_width = vk_format_get_blockwidth(image->vk.format); + const uint32_t block_height = vk_format_get_blockheight(image->vk.format); /* Copy regions by uploading each region to a temporary tiled image using * the memory we have just allocated as storage. 
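As a worked sketch of the DIV_ROUND_UP block math these copy paths keep repeating (the example values are arbitrary; the macro is spelled out so the snippet stands alone):

#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* A 10x6 texel region of a format compressed in 4x4 blocks covers 3x2
 * blocks: partial blocks at the right and bottom edges still occupy a
 * whole block of storage, hence the round-up rather than a plain
 * division. */
static const uint32_t blocks_w = DIV_ROUND_UP(10, 4); /* = 3 */
static const uint32_t blocks_h = DIV_ROUND_UP(6, 4);  /* = 2 */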
@@ -2543,9 +2546,9 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, dst_format = src_format; break; case VK_IMAGE_ASPECT_DEPTH_BIT: - assert(image->vk_format == VK_FORMAT_D32_SFLOAT || - image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32); + assert(image->vk.format == VK_FORMAT_D32_SFLOAT || + image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || + image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32); src_format = VK_FORMAT_R8G8B8A8_UINT; dst_format = src_format; aspect = VK_IMAGE_ASPECT_COLOR_BIT; @@ -2554,8 +2557,8 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, * in the buffer is stored in the 24-LSB, but V3D wants it in the * 24-MSB. */ - if (image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - image->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) { + if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT || + image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) { cmask = VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; @@ -2573,7 +2576,7 @@ copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer, * blit to an RGBA8UI destination masking out writes to components * GBA (which map to the D24 component of a S8D24 image). */ - assert(image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT); + assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT); buf_bpp = 1; src_format = VK_FORMAT_R8_UINT; dst_format = VK_FORMAT_R8G8B8A8_UINT; @@ -2626,13 +2629,13 @@ copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, const VkBufferImageCopy2KHR *region) { /* FIXME */ - if (vk_format_is_depth_or_stencil(image->vk_format)) + if (vk_format_is_depth_or_stencil(image->vk.format)) return false; - if (vk_format_is_compressed(image->vk_format)) + if (vk_format_is_compressed(image->vk.format)) return false; - if (image->tiling == VK_IMAGE_TILING_LINEAR) + if (image->vk.tiling == VK_IMAGE_TILING_LINEAR) return false; uint32_t buffer_width, buffer_height; @@ -2650,7 +2653,7 @@ copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer, uint32_t buffer_layer_stride = buffer_stride * buffer_height; uint32_t num_layers; - if (image->type != VK_IMAGE_TYPE_3D) + if (image->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->imageSubresource.layerCount; else num_layers = region->imageExtent.depth; @@ -2689,7 +2692,7 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer); V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage); - assert(image->samples == VK_SAMPLE_COUNT_1_BIT); + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT); uint32_t r = 0; while (r < info->regionCount) { @@ -2719,7 +2722,7 @@ v3dv_CmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, break; /* For 3D images we also need to check the depth extent */ - if (image->type == VK_IMAGE_TYPE_3D && + if (image->vk.image_type == VK_IMAGE_TYPE_3D && info->pRegions[s].imageExtent.depth != info->pRegions[r].imageExtent.depth) { break; @@ -2775,15 +2778,15 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_image *src, const VkImageBlit2KHR *region) { - assert(dst->samples == VK_SAMPLE_COUNT_1_BIT); - assert(src->samples == VK_SAMPLE_COUNT_1_BIT); + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); + assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* Format must match */ - if (src->vk_format != dst->vk_format) + if (src->vk.format != dst->vk.format) return false; /* Destination can't be raster format */ - if (dst->tiling == VK_IMAGE_TILING_LINEAR) + if (dst->vk.tiling == 
VK_IMAGE_TILING_LINEAR) return false; /* Source region must start at (0,0) */ @@ -2795,8 +2798,8 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, return false; const uint32_t dst_mip_level = region->dstSubresource.mipLevel; - const uint32_t dst_width = u_minify(dst->extent.width, dst_mip_level); - const uint32_t dst_height = u_minify(dst->extent.height, dst_mip_level); + const uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level); + const uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level); if (region->dstOffsets[1].x < dst_width - 1|| region->dstOffsets[1].y < dst_height - 1) { return false; @@ -2811,7 +2814,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, /* If the format is D24S8 both aspects need to be copied, since the TFU * can't be programmed to copy only one aspect of the image. */ - if (dst->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) { const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; if (region->dstSubresource.aspectMask != ds_aspects) @@ -2834,7 +2837,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_dst_layer; uint32_t max_dst_layer; bool dst_mirror_z = false; - if (dst->type == VK_IMAGE_TYPE_3D) { + if (dst->vk.image_type == VK_IMAGE_TYPE_3D) { compute_blit_3d_layers(region->dstOffsets, &min_dst_layer, &max_dst_layer, &dst_mirror_z); @@ -2846,7 +2849,7 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_src_layer; uint32_t max_src_layer; bool src_mirror_z = false; - if (src->type == VK_IMAGE_TYPE_3D) { + if (src->vk.image_type == VK_IMAGE_TYPE_3D) { compute_blit_3d_layers(region->srcOffsets, &min_src_layer, &max_src_layer, &src_mirror_z); @@ -2871,10 +2874,30 @@ blit_tfu(struct v3dv_cmd_buffer *cmd_buffer, dst_mirror_z ? max_dst_layer - i - 1: min_dst_layer + i; const uint32_t src_layer = src_mirror_z ? max_src_layer - i - 1: min_src_layer + i; - v3dv_X(cmd_buffer->device, meta_emit_tfu_job) - (cmd_buffer, dst, dst_mip_level, dst_layer, - src, src_mip_level, src_layer, - dst_width, dst_height, format); + + const uint32_t dst_offset = + dst->mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer); + const uint32_t src_offset = + src->mem->bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer); + + const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level]; + const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level]; + + v3dv_X(cmd_buffer->device, meta_emit_tfu_job)( + cmd_buffer, + dst->mem->bo->handle, + dst_offset, + dst_slice->tiling, + dst_slice->tiling == V3D_TILING_RASTER ? + dst_slice->stride : dst_slice->padded_height, + dst->cpp, + src->mem->bo->handle, + src_offset, + src_slice->tiling, + src_slice->tiling == V3D_TILING_RASTER ? + src_slice->stride : src_slice->padded_height, + src->cpp, + dst_width, dst_height, format); } return true; @@ -3782,11 +3805,11 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, /* We don't support rendering to linear depth/stencil, this should have * been rewritten to a compatible color blit by the caller. 
*/ - assert(dst->tiling != VK_IMAGE_TILING_LINEAR || + assert(dst->vk.tiling != VK_IMAGE_TILING_LINEAR || !vk_format_is_depth_or_stencil(dst_format)); /* Can't sample from linear images */ - if (src->tiling == VK_IMAGE_TILING_LINEAR && src->type != VK_IMAGE_TYPE_1D) + if (src->vk.tiling == VK_IMAGE_TILING_LINEAR && src->vk.image_type != VK_IMAGE_TYPE_1D) return false; VkImageBlit2KHR region = *_region; @@ -3844,23 +3867,23 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, * need to apply those same semantics here when we compute the size of the * destination image level. */ - const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk_format); - const uint32_t src_block_w = vk_format_get_blockwidth(src->vk_format); - const uint32_t src_block_h = vk_format_get_blockheight(src->vk_format); + const uint32_t dst_block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t dst_block_h = vk_format_get_blockheight(dst->vk.format); + const uint32_t src_block_w = vk_format_get_blockwidth(src->vk.format); + const uint32_t src_block_h = vk_format_get_blockheight(src->vk.format); const uint32_t dst_level_w = - u_minify(DIV_ROUND_UP(dst->extent.width * src_block_w, dst_block_w), + u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w), region.dstSubresource.mipLevel); const uint32_t dst_level_h = - u_minify(DIV_ROUND_UP(dst->extent.height * src_block_h, dst_block_h), + u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h), region.dstSubresource.mipLevel); const uint32_t src_level_w = - u_minify(src->extent.width, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.width, region.srcSubresource.mipLevel); const uint32_t src_level_h = - u_minify(src->extent.height, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.height, region.srcSubresource.mipLevel); const uint32_t src_level_d = - u_minify(src->extent.depth, region.srcSubresource.mipLevel); + u_minify(src->vk.extent.depth, region.srcSubresource.mipLevel); uint32_t dst_x, dst_y, dst_w, dst_h; bool dst_mirror_x, dst_mirror_y; @@ -3879,7 +3902,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_dst_layer; uint32_t max_dst_layer; bool dst_mirror_z = false; - if (dst->type != VK_IMAGE_TYPE_3D) { + if (dst->vk.image_type != VK_IMAGE_TYPE_3D) { min_dst_layer = region.dstSubresource.baseArrayLayer; max_dst_layer = min_dst_layer + region.dstSubresource.layerCount; } else { @@ -3891,7 +3914,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, uint32_t min_src_layer; uint32_t max_src_layer; bool src_mirror_z = false; - if (src->type != VK_IMAGE_TYPE_3D) { + if (src->vk.image_type != VK_IMAGE_TYPE_3D) { min_src_layer = region.srcSubresource.baseArrayLayer; max_src_layer = min_src_layer + region.srcSubresource.layerCount; } else { @@ -3913,7 +3936,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, (float)(src_y + src_h), }; - if (src->samples == VK_SAMPLE_COUNT_1_BIT) { + if (src->vk.samples == VK_SAMPLE_COUNT_1_BIT) { coords[0] /= (float)src_level_w; coords[1] /= (float)src_level_h; coords[2] /= (float)src_level_w; @@ -3945,8 +3968,8 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, /* Get the blit pipeline */ struct v3dv_meta_blit_pipeline *pipeline = NULL; bool ok = get_blit_pipeline(cmd_buffer->device, - dst_format, src_format, cmask, src->type, - dst->samples, src->samples, + dst_format, src_format, cmask, src->vk.image_type, + dst->vk.samples, src->vk.samples, &pipeline); if (!ok) return handled; @@ -4016,7 
+4039,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkImageViewCreateInfo dst_image_view_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = v3dv_image_to_handle(dst), - .viewType = v3dv_image_type_to_view_type(dst->type), + .viewType = v3dv_image_type_to_view_type(dst->vk.image_type), .format = dst_format, .subresourceRange = { .aspectMask = aspects, @@ -4074,7 +4097,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, VkImageViewCreateInfo src_image_view_info = { .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, .image = v3dv_image_to_handle(src), - .viewType = v3dv_image_type_to_view_type(src->type), + .viewType = v3dv_image_type_to_view_type(src->vk.image_type), .format = src_format, .components = *cswizzle, .subresourceRange = { @@ -4082,7 +4105,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, .baseMipLevel = region.srcSubresource.mipLevel, .levelCount = 1, .baseArrayLayer = - src->type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i, + src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i, .layerCount = 1 }, }; @@ -4156,7 +4179,7 @@ blit_shader(struct v3dv_cmd_buffer *cmd_buffer, * based on the ratio of the depth of the source and the destination * images, picking the coordinate in the middle of each step. */ - if (src->type == VK_IMAGE_TYPE_3D) { + if (src->vk.image_type == VK_IMAGE_TYPE_3D) { tex_coords[4] = !mirror_z ? (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d : @@ -4193,18 +4216,18 @@ v3dv_CmdBlitImage2KHR(VkCommandBuffer commandBuffer, assert(cmd_buffer->state.job == NULL); /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */ - assert(dst->samples == VK_SAMPLE_COUNT_1_BIT && - src->samples == VK_SAMPLE_COUNT_1_BIT); + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT && + src->vk.samples == VK_SAMPLE_COUNT_1_BIT); /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */ - assert(!vk_format_is_compressed(dst->vk_format)); + assert(!vk_format_is_compressed(dst->vk.format)); for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) { if (blit_tfu(cmd_buffer, dst, src, &pBlitImageInfo->pRegions[i])) continue; if (blit_shader(cmd_buffer, - dst, dst->vk_format, - src, src->vk_format, + dst, dst->vk.format, + src, src->vk.format, 0, NULL, &pBlitImageInfo->pRegions[i], pBlitImageInfo->filter, true)) { @@ -4228,10 +4251,10 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, if (!v3dv_X(cmd_buffer->device, format_supports_tlb_resolve)(src->format)) return false; - const VkFormat fb_format = src->vk_format; + const VkFormat fb_format = src->vk.format; uint32_t num_layers; - if (dst->type != VK_IMAGE_TYPE_3D) + if (dst->vk.image_type != VK_IMAGE_TYPE_3D) num_layers = region->dstSubresource.layerCount; else num_layers = region->extent.depth; @@ -4242,8 +4265,8 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer, if (!job) return true; - const uint32_t block_w = vk_format_get_blockwidth(dst->vk_format); - const uint32_t block_h = vk_format_get_blockheight(dst->vk_format); + const uint32_t block_w = vk_format_get_blockwidth(dst->vk.format); + const uint32_t block_h = vk_format_get_blockheight(dst->vk.format); const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w); const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h); @@ -4293,8 +4316,8 @@ resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer, }, }; return blit_shader(cmd_buffer, - dst, dst->vk_format, - src, src->vk_format, + dst, dst->vk.format, + src, src->vk.format, 0, NULL, &blit_region, VK_FILTER_NEAREST, true); } @@ -4312,8 
+4335,8 @@ v3dv_CmdResolveImage2KHR(VkCommandBuffer commandBuffer, assert(cmd_buffer->state.pass == NULL); assert(cmd_buffer->state.job == NULL); - assert(src->samples == VK_SAMPLE_COUNT_4_BIT); - assert(dst->samples == VK_SAMPLE_COUNT_1_BIT); + assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT); + assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT); for (uint32_t i = 0; i < info->regionCount; i++) { if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i])) diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_pass.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_pass.c index 464703e42a..1b03c0d793 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_pass.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_pass.c @@ -143,7 +143,7 @@ v3dv_CreateRenderPass(VkDevice _device, pass = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_RENDER_PASS); if (pass == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pass->multiview_enabled = multiview_enabled; pass->attachment_count = pCreateInfo->attachmentCount; @@ -168,7 +168,7 @@ v3dv_CreateRenderPass(VkDevice _device, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass->subpass_attachments == NULL) { vk_object_free(&device->vk, pAllocator, pass); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } else { pass->subpass_attachments = NULL; diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline.c index bd2854636f..daa6c75501 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline.c @@ -35,6 +35,7 @@ #include "util/u_atomic.h" #include "util/u_prim.h" +#include "util/os_time.h" #include "vulkan/util/vk_format.h" @@ -189,11 +190,11 @@ static const struct spirv_to_nir_options default_spirv_options = { .phys_ssbo_addr_format = nir_address_format_64bit_global, .push_const_addr_format = nir_address_format_logical, .shared_addr_format = nir_address_format_32bit_offset, - .frag_coord_is_sysval = false, }; const nir_shader_compiler_options v3dv_nir_options = { - .lower_add_sat = true, + .lower_uadd_sat = true, + .lower_iadd_sat = true, .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, @@ -438,7 +439,7 @@ shader_module_compile_to_nir(struct v3dv_device *device, uint32_t *spirv = (uint32_t *) stage->module->data; assert(stage->module->size % 4 == 0); - if (V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV) + if (unlikely(V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV)) v3dv_print_spirv(stage->module->data, stage->module->size, stderr); uint32_t num_spec_entries = 0; @@ -465,9 +466,15 @@ shader_module_compile_to_nir(struct v3dv_device *device, } assert(nir->info.stage == broadcom_shader_stage_to_gl(stage->stage)); - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage( - broadcom_shader_stage_to_gl(stage->stage)))) { + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .frag_coord = true, + .point_coord = true, + }; + NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + + if (unlikely(V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage( + broadcom_shader_stage_to_gl(stage->stage))))) { fprintf(stderr, "Initial form: %s prog %d NIR:\n", broadcom_shader_stage_name(stage->stage), stage->program_id); @@ -1405,8 +1412,13 @@ pipeline_stage_create_binning(const struct v3dv_pipeline_stage *src, p_stage->stage = bin_stage; 
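/* Aside, as an illustrative sketch rather than part of the change itself:
 * every VkPipelineCreationFeedbackEXT hook added below follows the same
 * accumulate-a-delta pattern around each piece of per-stage work,
 *
 *    int64_t stage_start = os_time_get_nano();
 *    ...do the stage's work...
 *    p_stage->feedback.duration += os_time_get_nano() - stage_start;
 *
 * so a stage that is processed in several steps (NIR cache lookup,
 * lowering, backend compile) reports the sum of its pieces.
 */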
p_stage->entrypoint = src->entrypoint; p_stage->module = src->module; - p_stage->nir = src->nir ? nir_shader_clone(NULL, src->nir) : NULL; + /* For binning shaders we will clone the NIR code from the corresponding + * render shader later, when we call pipeline_compile_xxx_shader. This way + * we only have to run the relevant NIR lowerings once for render shaders. + */ + p_stage->nir = NULL; p_stage->spec_info = src->spec_info; + p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; memcpy(p_stage->shader_sha1, src->shader_sha1, 20); return p_stage; @@ -1609,14 +1621,16 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, const VkAllocationCallbacks *pAllocator, VkResult *out_vk_result) { + int64_t stage_start = os_time_get_nano(); + struct v3dv_pipeline *pipeline = p_stage->pipeline; struct v3dv_physical_device *physical_device = &pipeline->device->instance->physicalDevice; const struct v3d_compiler *compiler = physical_device->compiler; - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage - (broadcom_shader_stage_to_gl(p_stage->stage)))) { + if (unlikely(V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage + (broadcom_shader_stage_to_gl(p_stage->stage))))) { fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n", broadcom_shader_stage_name(p_stage->stage), p_stage->program_id); @@ -1657,6 +1671,8 @@ pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage, * we finish it, so let's not worry about freeing the nir here. */ + p_stage->feedback.duration += os_time_get_nano() - stage_start; + return variant; } @@ -1756,6 +1772,8 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_stage *p_stage, struct v3dv_pipeline_layout *layout) { + int64_t stage_start = os_time_get_nano(); + assert(pipeline->shared_data && pipeline->shared_data->maps[p_stage->stage]); @@ -1780,6 +1798,8 @@ pipeline_lower_nir(struct v3dv_pipeline *pipeline, /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ NIR_PASS_V(p_stage->nir, lower_pipeline_layout_info, pipeline, layout); + + p_stage->feedback.duration += os_time_get_nano() - stage_start; } /** @@ -1808,6 +1828,8 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, struct v3dv_pipeline *pipeline, struct v3dv_pipeline_cache *cache) { + int64_t stage_start = os_time_get_nano(); + nir_shader *nir = NULL; nir = v3dv_pipeline_cache_search_for_nir(pipeline, cache, @@ -1816,6 +1838,14 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, if (nir) { assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage)); + + /* A NIR cache hit doesn't avoid the large majority of pipeline stage + * creation, so the cache hit is not recorded in the pipeline feedback + * flags. + */ + + p_stage->feedback.duration += os_time_get_nano() - stage_start; + return nir; } @@ -1835,6 +1865,9 @@ pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage, v3dv_pipeline_cache_upload_nir(pipeline, default_cache, nir, p_stage->shader_sha1); } + + p_stage->feedback.duration += os_time_get_nano() - stage_start; + return nir; } @@ -2119,6 +2152,59 @@ v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20], return NULL; } +static void +write_creation_feedback(struct v3dv_pipeline *pipeline, + const void *next, + const VkPipelineCreationFeedbackEXT *pipeline_feedback, + uint32_t stage_count, + const VkPipelineShaderStageCreateInfo *stages) +{ + const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback = + vk_find_struct_const(next, 
PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT); + + if (create_feedback) { + typed_memcpy(create_feedback->pPipelineCreationFeedback, + pipeline_feedback, + 1); + + assert(stage_count == create_feedback->pipelineStageCreationFeedbackCount); + + for (uint32_t i = 0; i < stage_count; i++) { + gl_shader_stage s = vk_to_mesa_shader_stage(stages[i].stage); + switch (s) { + case MESA_SHADER_VERTEX: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->vs->feedback; + + create_feedback->pPipelineStageCreationFeedbacks[i].duration += + pipeline->vs_bin->feedback.duration; + break; + + case MESA_SHADER_GEOMETRY: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->gs->feedback; + + create_feedback->pPipelineStageCreationFeedbacks[i].duration += + pipeline->gs_bin->feedback.duration; + break; + + case MESA_SHADER_FRAGMENT: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->fs->feedback; + break; + + case MESA_SHADER_COMPUTE: + create_feedback->pPipelineStageCreationFeedbacks[i] = + pipeline->cs->feedback; + break; + + default: + unreachable("not supported shader stage"); + } + } + } +} + static uint32_t multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline) { @@ -2294,6 +2380,11 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, const VkGraphicsPipelineCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator) { + VkPipelineCreationFeedbackEXT pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + }; + int64_t pipeline_start = os_time_get_nano(); + struct v3dv_device *device = pipeline->device; struct v3dv_physical_device *physical_device = &device->instance->physicalDevice; @@ -2408,8 +2499,12 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, unsigned char pipeline_sha1[20]; pipeline_hash_graphics(pipeline, &pipeline_key, pipeline_sha1); + bool cache_hit = false; + pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1); + v3dv_pipeline_cache_search_for_pipeline(cache, + pipeline_sha1, + &cache_hit); if (pipeline->shared_data != NULL) { /* A correct pipeline must have at least a VS and FS */ @@ -2420,6 +2515,11 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]); assert(!pipeline->gs || pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]); + + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; + goto success; } @@ -2432,6 +2532,14 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline_sha1, pipeline, true); + pipeline->vs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + if (pipeline->gs) + pipeline->gs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + pipeline->fs->feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + if (!pipeline->vs->nir) pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache); if (pipeline->gs && !pipeline->gs->nir) @@ -2490,6 +2598,14 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, v3dv_pipeline_cache_upload_pipeline(pipeline, cache); success: + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + write_creation_feedback(pipeline, + pCreateInfo->pNext, + &pipeline_feedback, + pCreateInfo->stageCount, + pCreateInfo->pStages); + /* Since we have the variants in the 
pipeline shared data we can now free * the pipeline stages. */ @@ -2895,8 +3011,14 @@ pipeline_init(struct v3dv_pipeline *pipeline, return result; } - v3dv_X(device, pipeline_pack_compile_state)(pipeline, - pCreateInfo->pVertexInputState); + const VkPipelineVertexInputStateCreateInfo *vi_info = + pCreateInfo->pVertexInputState; + + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info = + vk_find_struct_const(vi_info->pNext, + PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT); + + v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info); if (pipeline_has_integer_vertex_attrib(pipeline)) { pipeline->default_attribute_values = @@ -2931,7 +3053,7 @@ graphics_pipeline_create(VkDevice _device, VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); result = pipeline_init(pipeline, device, cache, pCreateInfo, @@ -3019,6 +3141,11 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, const VkComputePipelineCreateInfo *info, const VkAllocationCallbacks *alloc) { + VkPipelineCreationFeedbackEXT pipeline_feedback = { + .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT, + }; + int64_t pipeline_start = os_time_get_nano(); + struct v3dv_device *device = pipeline->device; struct v3dv_physical_device *physical_device = &device->instance->physicalDevice; @@ -3038,6 +3165,7 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, p_stage->entrypoint = sinfo->pName; p_stage->module = vk_shader_module_from_handle(sinfo->module); p_stage->spec_info = sinfo->pSpecializationInfo; + p_stage->feedback = (VkPipelineCreationFeedbackEXT) { 0 }; pipeline_hash_shader(p_stage->module, p_stage->entrypoint, @@ -3056,11 +3184,16 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, unsigned char pipeline_sha1[20]; pipeline_hash_compute(pipeline, &pipeline_key, pipeline_sha1); + bool cache_hit = false; pipeline->shared_data = - v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1); + v3dv_pipeline_cache_search_for_pipeline(cache, pipeline_sha1, &cache_hit); if (pipeline->shared_data != NULL) { assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]); + if (cache_hit && cache != &pipeline->device->default_pipeline_cache) + pipeline_feedback.flags |= + VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT; + goto success; } @@ -3071,6 +3204,8 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, pipeline, false); + p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT; + /* If not found on cache, compile it */ p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache); assert(p_stage->nir); @@ -3096,12 +3231,21 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, return VK_ERROR_OUT_OF_DEVICE_MEMORY; v3dv_pipeline_cache_upload_pipeline(pipeline, cache); + +success: + + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; + write_creation_feedback(pipeline, + info->pNext, + &pipeline_feedback, + 1, + &info->stage); + /* As we got the variants in pipeline->shared_data, after compiling we * don't need the pipeline_stages */ pipeline_free_stages(device, pipeline, alloc); - success: pipeline_check_spill_size(pipeline); return VK_SUCCESS; @@ -3144,7 +3288,7 @@ compute_pipeline_create(VkDevice _device, pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE); if (pipeline == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return 
vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); result = compute_pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator); diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline_cache.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline_cache.c index 9d6a541c71..c19eecc42c 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -247,7 +247,8 @@ v3dv_pipeline_shared_data_write_to_blob(const struct v3dv_pipeline_shared_data * */ struct v3dv_pipeline_shared_data * v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, - unsigned char sha1_key[20]) + unsigned char sha1_key[20], + bool *cache_hit) { if (!cache || !cache->cache) return NULL; @@ -270,6 +271,7 @@ v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, assert(cache_entry); cache->stats.hit++; + *cache_hit = true; if (debug_cache) { fprintf(stderr, "\tcache hit: %p\n", cache_entry); if (dump_stats) @@ -693,7 +695,7 @@ v3dv_CreatePipelineCache(VkDevice _device, VK_OBJECT_TYPE_PIPELINE_CACHE); if (cache == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); v3dv_pipeline_cache_init(cache, device, pCreateInfo->flags, device->instance->pipeline_cache_enabled); diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_private.h b/mesa 3D driver/src/broadcom/vulkan/v3dv_private.h index b3908afc28..68c90a88a4 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_private.h +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_private.h @@ -38,10 +38,15 @@ #include "vk_device.h" #include "vk_instance.h" +#include "vk_image.h" +#include "vk_log.h" #include "vk_physical_device.h" #include "vk_shader_module.h" #include "vk_util.h" +#include "vk_command_buffer.h" +#include "vk_queue.h" + #include #ifdef HAVE_VALGRIND @@ -215,10 +220,9 @@ struct v3dv_queue_submit_wait_info { }; struct v3dv_queue { - struct vk_object_base base; + struct vk_queue vk; struct v3dv_device *device; - VkDeviceQueueCreateFlags flags; /* A list of active v3dv_queue_submit_wait_info */ struct list_head submit_wait_list; @@ -518,35 +522,19 @@ struct v3d_resource_slice { }; struct v3dv_image { - struct vk_object_base base; + struct vk_image vk; - VkImageType type; - VkImageAspectFlags aspects; - - VkExtent3D extent; - uint32_t levels; - uint32_t array_size; - uint32_t samples; - VkImageUsageFlags usage; - VkImageCreateFlags flags; - VkImageTiling tiling; - - VkFormat vk_format; const struct v3dv_format *format; - uint32_t cpp; - - uint64_t drm_format_mod; bool tiled; - bool external; struct v3d_resource_slice slices[V3D_MAX_MIP_LEVELS]; uint64_t size; /* Total size in bytes */ uint32_t cube_map_stride; - uint32_t alignment; struct v3dv_device_memory *mem; VkDeviceSize mem_offset; + uint32_t alignment; }; VkImageViewType v3dv_image_type_to_view_type(VkImageType type); @@ -565,23 +553,12 @@ VkImageViewType v3dv_image_type_to_view_type(VkImageType type); #define V3DV_STENCIL_CFG_LENGTH 6 struct v3dv_image_view { - struct vk_object_base base; + struct vk_image_view vk; - struct v3dv_image *image; - VkImageAspectFlags aspects; - VkExtent3D extent; - VkImageViewType type; - - VkFormat vk_format; const struct v3dv_format *format; bool swap_rb; uint32_t internal_bpp; uint32_t internal_type; - - uint32_t base_level; - uint32_t max_level; - uint32_t first_layer; - uint32_t last_layer; uint32_t offset; /* Precomputed (composed from createinfo->components and format swizzle) @@ -1293,6 +1270,11 
@@ VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device, VkDeviceSize stride, VkQueryResultFlags flags); +void v3dv_reset_query_pools(struct v3dv_device *device, + struct v3dv_query_pool *query_pool, + uint32_t first, + uint32_t last); + typedef void (*v3dv_cmd_buffer_private_obj_destroy_cb)(VkDevice device, uint64_t pobj, VkAllocationCallbacks *alloc); @@ -1303,7 +1285,7 @@ struct v3dv_cmd_buffer_private_obj { }; struct v3dv_cmd_buffer { - struct vk_object_base base; + struct vk_command_buffer vk; struct v3dv_device *device; @@ -1487,6 +1469,8 @@ struct v3dv_pipeline_stage { /** A name for this program, so you can track it in shader-db output. */ uint32_t program_id; + + VkPipelineCreationFeedbackEXT feedback; }; /* We are using the descriptor pool entry for two things: @@ -1896,13 +1880,6 @@ const nir_shader_compiler_options *v3dv_pipeline_get_nir_options(void); uint32_t v3dv_physical_device_vendor_id(struct v3dv_physical_device *dev); uint32_t v3dv_physical_device_device_id(struct v3dv_physical_device *dev); -VkResult __vk_errorf(struct v3dv_instance *instance, VkResult error, - const char *file, int line, - const char *format, ...); - -#define vk_error(instance, error) __vk_errorf(instance, error, __FILE__, __LINE__, NULL); -#define vk_errorf(instance, error, format, ...) __vk_errorf(instance, error, __FILE__, __LINE__, format, ## __VA_ARGS__); - #ifdef DEBUG #define v3dv_debug_ignored_stype(sType) \ fprintf(stderr, "%s: ignored VkStructureType %u:%s\n\n", __func__, (sType), vk_StructureType_to_str(sType)) @@ -2037,7 +2014,8 @@ nir_shader* v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, struct v3dv_pipeline_shared_data * v3dv_pipeline_cache_search_for_pipeline(struct v3dv_pipeline_cache *cache, - unsigned char sha1_key[20]); + unsigned char sha1_key[20], + bool *cache_hit); void v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline, @@ -2051,74 +2029,58 @@ void v3dv_shader_module_internal_init(struct v3dv_device *device, struct vk_shader_module *module, nir_shader *nir); -#define V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType) _obj; \ - } - -#define V3DV_DEFINE_NONDISP_HANDLE_CASTS(__v3dv_type, __VkType) \ - \ - static inline struct __v3dv_type * \ - __v3dv_type ## _from_handle(__VkType _handle) \ - { \ - return (struct __v3dv_type *)(uintptr_t) _handle; \ - } \ - \ - static inline __VkType \ - __v3dv_type ## _to_handle(struct __v3dv_type *_obj) \ - { \ - return (__VkType)(uintptr_t) _obj; \ - } - #define V3DV_FROM_HANDLE(__v3dv_type, __name, __handle) \ - struct __v3dv_type *__name = __v3dv_type ## _from_handle(__handle) + VK_FROM_HANDLE(__v3dv_type, __name, __handle) -V3DV_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, VkCommandBuffer) -V3DV_DEFINE_HANDLE_CASTS(v3dv_device, VkDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_instance, VkInstance) -V3DV_DEFINE_HANDLE_CASTS(v3dv_physical_device, VkPhysicalDevice) -V3DV_DEFINE_HANDLE_CASTS(v3dv_queue, VkQueue) +VK_DEFINE_HANDLE_CASTS(v3dv_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) +VK_DEFINE_HANDLE_CASTS(v3dv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) +VK_DEFINE_HANDLE_CASTS(v3dv_instance, vk.base, VkInstance, + VK_OBJECT_TYPE_INSTANCE) +VK_DEFINE_HANDLE_CASTS(v3dv_physical_device, vk.base, VkPhysicalDevice, + 
VK_OBJECT_TYPE_PHYSICAL_DEVICE) +VK_DEFINE_HANDLE_CASTS(v3dv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_cmd_pool, VkCommandPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, VkBuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, VkBufferView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, VkDeviceMemory) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, VkDescriptorPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, VkDescriptorSet) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, VkDescriptorSetLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, VkDescriptorUpdateTemplate) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, VkEvent) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, VkFence) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, VkFramebuffer) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, VkImage) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, VkImageView) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, VkPipeline) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, VkPipelineCache) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, VkPipelineLayout) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, VkQueryPool) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, VkRenderPass) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, VkSampler) -V3DV_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, VkSemaphore) - -/* This is defined as a macro so that it works for both - * VkImageSubresourceRange and VkImageSubresourceLayers - */ -#define v3dv_layer_count(_image, _range) \ - ((_range)->layerCount == VK_REMAINING_ARRAY_LAYERS ? \ - (_image)->array_size - (_range)->baseArrayLayer : (_range)->layerCount) - -#define v3dv_level_count(_image, _range) \ - ((_range)->levelCount == VK_REMAINING_MIP_LEVELS ? 
\ - (_image)->levels - (_range)->baseMipLevel : (_range)->levelCount) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_cmd_pool, base, VkCommandPool, + VK_OBJECT_TYPE_COMMAND_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer, base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_buffer_view, base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_device_memory, base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_set_layout, base, + VkDescriptorSetLayout, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_descriptor_update_template, base, + VkDescriptorUpdateTemplate, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_fence, base, VkFence, VK_OBJECT_TYPE_FENCE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_framebuffer, base, VkFramebuffer, + VK_OBJECT_TYPE_FRAMEBUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image, vk.base, VkImage, + VK_OBJECT_TYPE_IMAGE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_image_view, vk.base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline, base, VkPipeline, + VK_OBJECT_TYPE_PIPELINE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_cache, base, VkPipelineCache, + VK_OBJECT_TYPE_PIPELINE_CACHE) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_pipeline_layout, base, VkPipelineLayout, + VK_OBJECT_TYPE_PIPELINE_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_query_pool, base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_render_pass, base, VkRenderPass, + VK_OBJECT_TYPE_RENDER_PASS) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_sampler, base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) +VK_DEFINE_NONDISP_HANDLE_CASTS(v3dv_semaphore, base, VkSemaphore, + VK_OBJECT_TYPE_SEMAPHORE) static inline int v3dv_ioctl(int fd, unsigned long request, void *arg) diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_query.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_query.c index 0deb430fc1..70d6dac182 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_query.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_query.c @@ -39,7 +39,7 @@ v3dv_CreateQueryPool(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool), VK_OBJECT_TYPE_QUERY_POOL); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pool->query_type = pCreateInfo->queryType; pool->query_count = pCreateInfo->queryCount; @@ -50,7 +50,7 @@ v3dv_CreateQueryPool(VkDevice _device, pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pool->queries == NULL) { - result = vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail; } @@ -63,11 +63,11 @@ v3dv_CreateQueryPool(VkDevice _device, const uint32_t bo_size = query_groups * 1024; pool->bo = v3dv_bo_alloc(device, bo_size, "query", true); if (!pool->bo) { - result = vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); goto fail; } if (!v3dv_bo_map(device, pool->bo, bo_size)) { - result = vk_error(device->instance, 
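/* A hedged sketch of how an occlusion counter is addressed inside the
 * query pool BO, following the q->bo->map / q->offset layout used by this
 * file (read_occlusion_counter is an illustrative name, not driver API):
 */
static inline uint32_t
read_occlusion_counter(const struct v3dv_query_pool *pool, uint32_t i)
{
   assert(i < pool->query_count);
   const struct v3dv_query *q = &pool->queries[i];
   const uint8_t *addr = (const uint8_t *)q->bo->map + q->offset;
   return *(const uint32_t *)addr;
}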
VK_ERROR_OUT_OF_DEVICE_MEMORY); + result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY); goto fail; } } @@ -159,10 +159,10 @@ get_occlusion_query_result(struct v3dv_device *device, * error may occur." */ if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); if (!v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull)) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); *available = true; } else { @@ -195,7 +195,7 @@ get_timestamp_query_result(struct v3dv_device *device, * error may occur." */ if (!q->maybe_available) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); *available = true; } else { @@ -351,3 +351,41 @@ v3dv_CmdEndQuery(VkCommandBuffer commandBuffer, v3dv_cmd_buffer_end_query(cmd_buffer, pool, query); } + +void +v3dv_reset_query_pools(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t first, + uint32_t count) +{ + for (uint32_t i = first; i < first + count; i++) { + assert(i < pool->query_count); + struct v3dv_query *q = &pool->queries[i]; + q->maybe_available = false; + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: { + const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset; + uint32_t *counter = (uint32_t *) q_addr; + *counter = 0; + break; + } + case VK_QUERY_TYPE_TIMESTAMP: + q->value = 0; + break; + default: + unreachable("Unsupported query type"); + } + } +} + +VKAPI_ATTR void VKAPI_CALL +v3dv_ResetQueryPool(VkDevice _device, + VkQueryPool queryPool, + uint32_t firstQuery, + uint32_t queryCount) +{ + V3DV_FROM_HANDLE(v3dv_device, device, _device); + V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool); + + v3dv_reset_query_pools(device, pool, firstQuery, queryCount); +} diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_queue.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_queue.c index 91cd6a3f79..baac241e4c 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_queue.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_queue.c @@ -34,12 +34,16 @@ v3dv_clif_dump(struct v3dv_device *device, struct v3dv_job *job, struct drm_v3d_submit_cl *submit) { - if (!(V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CLIF))) + if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL | + V3D_DEBUG_CL_NO_BIN | + V3D_DEBUG_CLIF)))) return; struct clif_dump *clif = clif_dump_init(&device->devinfo, stderr, - V3D_DEBUG & V3D_DEBUG_CL); + V3D_DEBUG & (V3D_DEBUG_CL | + V3D_DEBUG_CL_NO_BIN), + V3D_DEBUG & V3D_DEBUG_CL_NO_BIN); set_foreach(job->bos, entry) { struct v3dv_bo *bo = (void *)entry->key; @@ -172,24 +176,7 @@ handle_reset_query_cpu_job(struct v3dv_job *job) if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE); - for (uint32_t i = info->first; i < info->first + info->count; i++) { - assert(i < info->pool->query_count); - struct v3dv_query *q = &info->pool->queries[i]; - q->maybe_available = false; - switch (info->pool->query_type) { - case VK_QUERY_TYPE_OCCLUSION: { - const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset; - uint32_t *counter = (uint32_t *) q_addr; - *counter = 0; - break; - } - case VK_QUERY_TYPE_TIMESTAMP: - q->value = 0; - break; - default: - unreachable("Unsupported query type"); - } - } + v3dv_reset_query_pools(job->device, info->pool, info->first, info->count); return VK_SUCCESS; } @@ -219,7 +206,7 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job) /* Map the entire dst buffer for the CPU copy if 
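/* Application-side usage sketch for the new host-reset path: with
 * hostQueryReset enabled (VK_EXT_host_query_reset, core in Vulkan 1.2),
 * queries can be reset without recording a command buffer, which is what
 * v3dv_ResetQueryPool above services. Handles and counts illustrative:
 */
vkResetQueryPool(device, query_pool, 0 /* firstQuery */, 4 /* queryCount */);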
needed */ assert(!bo->map || bo->map_size == bo->size); if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a * sync wait on the CPU for the corresponding GPU jobs to finish. We might @@ -356,7 +343,7 @@ spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread) assert(wait_thread != NULL); if (pthread_create(wait_thread, NULL, event_wait_thread_func, job)) - return vk_error(job->device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(job->device, VK_ERROR_DEVICE_LOST); return VK_NOT_READY; } @@ -409,13 +396,13 @@ handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job) struct v3dv_bo *dst_bo = info->image->mem->bo; assert(!dst_bo->map || dst_bo->map_size == dst_bo->size); if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); void *dst_ptr = dst_bo->map; struct v3dv_bo *src_bo = info->buffer->mem->bo; assert(!src_bo->map || src_bo->map_size == src_bo->size); if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); void *src_ptr = src_bo->map; const struct v3d_resource_slice *slice = @@ -488,7 +475,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue, assert(info->buffer && info->buffer->mem && info->buffer->mem->bo); struct v3dv_bo *bo = info->buffer->mem->bo; if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size)) - return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY); assert(bo->map); const uint32_t offset = info->buffer->mem_offset + info->offset; @@ -520,7 +507,7 @@ process_semaphores_to_signal(struct v3dv_device *device, drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd); mtx_unlock(&device->mutex); if (fd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); VkResult result = VK_SUCCESS; for (uint32_t i = 0; i < count; i++) { @@ -559,7 +546,7 @@ process_fence_to_signal(struct v3dv_device *device, VkFence _fence) drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd); mtx_unlock(&device->mutex); if (fd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); int ret; if (!fence->temp_sync) @@ -657,7 +644,7 @@ handle_cl_job(struct v3dv_queue *queue, free(bo_handles); if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -680,7 +667,7 @@ handle_tfu_job(struct v3dv_queue *queue, if (ret != 0) { fprintf(stderr, "Failed to submit TFU job: %d\n", ret); - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); } return VK_SUCCESS; @@ -725,7 +712,7 @@ handle_csd_job(struct v3dv_queue *queue, free(bo_handles); if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -773,7 +760,7 @@ queue_create_noop_job(struct v3dv_queue *queue) queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8, 
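/* The CPU-job paths above all repeat one lazy-mapping pattern; a sketch
 * of it factored into a helper (ensure_bo_mapped is an illustrative name):
 */
static VkResult
ensure_bo_mapped(struct v3dv_device *device, struct v3dv_bo *bo)
{
   /* A BO is either unmapped or mapped in full. */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(device, bo, bo->size))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   return VK_SUCCESS;
}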
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!queue->noop_job) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1); v3dv_X(device, job_emit_noop)(queue->noop_job); @@ -995,7 +982,7 @@ spawn_master_wait_thread(struct v3dv_queue *queue, mtx_lock(&queue->mutex); if (pthread_create(&wait_info->master_wait_thread, NULL, master_wait_thread_func, wait_info)) { - result = vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST); + result = vk_error(queue, VK_ERROR_DEVICE_LOST); goto done; } @@ -1063,12 +1050,12 @@ v3dv_CreateSemaphore(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore), VK_OBJECT_TYPE_SEMAPHORE); if (sem == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync); if (ret) { vk_object_free(&device->vk, pAllocator, sem); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } *pSemaphore = v3dv_semaphore_to_handle(sem); @@ -1154,23 +1141,23 @@ v3dv_ImportSemaphoreFdKHR( */ unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0; if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (fd != -1) { if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } break; } case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: { if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); break; } default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } destroy_syncobj(render_fd, &sem->temp_sync); @@ -1212,12 +1199,12 @@ v3dv_GetSemaphoreFdKHR(VkDevice _device, case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: { drmSyncobjExportSyncFile(render_fd, sem->sync, pFd); if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: drmSyncobjHandleToFD(render_fd, sem->sync, pFd); if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; } default: @@ -1258,7 +1245,7 @@ v3dv_CreateFence(VkDevice _device, vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence), VK_OBJECT_TYPE_FENCE); if (fence == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); unsigned flags = 0; if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) @@ -1266,7 +1253,7 @@ v3dv_CreateFence(VkDevice _device, int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync); if (ret) { vk_object_free(&device->vk, pAllocator, fence); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } *pFence = v3dv_fence_to_handle(fence); @@ -1353,23 +1340,23 @@ 
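/* Note on the two import flavours handled above, and repeated for fences
 * below: SYNC_FD imports a one-shot fence payload into a freshly created
 * syncobj (fd == -1 denotes an already-signaled payload, hence
 * DRM_SYNCOBJ_CREATE_SIGNALED), while OPAQUE_FD converts an exported
 * syncobj handle directly via drmSyncobjFDToHandle(). */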
v3dv_ImportFenceFdKHR(VkDevice _device, */ unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0; if (drmSyncobjCreate(render_fd, flags, &new_sync)) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (fd != -1) { if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) { drmSyncobjDestroy(render_fd, new_sync); - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } } break; } case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: { if (drmSyncobjFDToHandle(render_fd, fd, &new_sync)) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); break; } default: - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); } destroy_syncobj(render_fd, &fence->temp_sync); @@ -1423,7 +1410,7 @@ v3dv_GetFenceStatus(VkDevice _device, VkFence _fence) if (ret == -ETIME) return VK_NOT_READY; else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -1443,12 +1430,12 @@ v3dv_GetFenceFdKHR(VkDevice _device, case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { drmSyncobjExportSyncFile(render_fd, fence->sync, pFd); if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: drmSyncobjHandleToFD(render_fd, fence->sync, pFd); if (*pFd == -1) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); break; } default: @@ -1467,7 +1454,7 @@ v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) sizeof(*syncobjs) * fenceCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); int render_fd = device->pdevice->render_fd; uint32_t reset_count = 0; @@ -1497,7 +1484,7 @@ v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences) vk_free(&device->vk.alloc, syncobjs); if (ret) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); return VK_SUCCESS; } @@ -1516,7 +1503,7 @@ v3dv_WaitForFences(VkDevice _device, sizeof(*syncobjs) * fenceCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!syncobjs) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); for (uint32_t i = 0; i < fenceCount; i++) { struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]); @@ -1538,7 +1525,7 @@ v3dv_WaitForFences(VkDevice _device, if (ret == -ETIME) return VK_TIMEOUT; else if (ret) - return vk_error(device->instance, VK_ERROR_DEVICE_LOST); + return vk_error(device, VK_ERROR_DEVICE_LOST); return VK_SUCCESS; } @@ -1549,5 +1536,5 @@ v3dv_QueueBindSparse(VkQueue _queue, VkFence fence) { V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - return vk_error(queue->device->instance, VK_ERROR_FEATURE_NOT_PRESENT); + return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT); } diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_uniforms.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_uniforms.c index bf25e64d4b..47bc3a0b17 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_uniforms.c +++ 
b/mesa 3D driver/src/broadcom/vulkan/v3dv_uniforms.c @@ -312,26 +312,26 @@ get_texture_size_from_image_view(struct v3dv_image_view *image_view, /* We don't u_minify the values, as we are using the image_view * extents */ - return image_view->extent.width; + return image_view->vk.extent.width; case QUNIFORM_IMAGE_HEIGHT: case QUNIFORM_TEXTURE_HEIGHT: - return image_view->extent.height; + return image_view->vk.extent.height; case QUNIFORM_IMAGE_DEPTH: case QUNIFORM_TEXTURE_DEPTH: - return image_view->extent.depth; + return image_view->vk.extent.depth; case QUNIFORM_IMAGE_ARRAY_SIZE: case QUNIFORM_TEXTURE_ARRAY_SIZE: - if (image_view->type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) { - return image_view->last_layer - image_view->first_layer + 1; + if (image_view->vk.view_type != VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) { + return image_view->vk.layer_count; } else { - assert((image_view->last_layer - image_view->first_layer + 1) % 6 == 0); - return (image_view->last_layer - image_view->first_layer + 1) / 6; + assert(image_view->vk.layer_count % 6 == 0); + return image_view->vk.layer_count / 6; } case QUNIFORM_TEXTURE_LEVELS: - return image_view->max_level - image_view->base_level + 1; + return image_view->vk.level_count; case QUNIFORM_TEXTURE_SAMPLES: - assert(image_view->image); - return image_view->image->samples; + assert(image_view->vk.image); + return image_view->vk.image->samples; default: unreachable("Bad texture size field"); } diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dv_wsi.c b/mesa 3D driver/src/broadcom/vulkan/v3dv_wsi.c index 23c542cbc0..154adf3a7d 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dv_wsi.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dv_wsi.c @@ -25,6 +25,7 @@ #include "v3dv_private.h" #include "drm-uapi/drm_fourcc.h" +#include "wsi_common_entrypoints.h" #include "vk_format_info.h" #include "vk_util.h" #include "wsi_common.h" @@ -89,46 +90,19 @@ v3dv_wsi_init(struct v3dv_physical_device *physical_device) physical_device->wsi_device.can_present_on_device = v3dv_wsi_can_present_on_device; + physical_device->vk.wsi_device = &physical_device->wsi_device; + return VK_SUCCESS; } void v3dv_wsi_finish(struct v3dv_physical_device *physical_device) { + physical_device->vk.wsi_device = NULL; wsi_device_finish(&physical_device->wsi_device, &physical_device->vk.instance->alloc); } -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySurfaceKHR( - VkInstance _instance, - VkSurfaceKHR _surface, - const VkAllocationCallbacks* pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_instance, instance, _instance); - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface); - - if (!surface) - return; - - vk_free2(&instance->vk.alloc, pAllocator, surface); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - VkSurfaceKHR surface, - VkBool32* pSupported) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_support(&device->wsi_device, - queueFamilyIndex, - surface, - pSupported); -} - static void constraint_surface_capabilities(VkSurfaceCapabilitiesKHR *caps) { @@ -149,12 +123,10 @@ v3dv_GetPhysicalDeviceSurfaceCapabilitiesKHR( VkSurfaceKHR surface, VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities(&device->wsi_device, - surface, - pSurfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilitiesKHR(physicalDevice, + surface, + pSurfaceCapabilities); 
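/* With physical_device->vk.wsi_device wired up, the common WSI runtime
 * can service most surface/swapchain entry points itself; v3dv keeps only
 * thin wrappers like this one, which delegate to the common
 * implementation and then clamp v3dv-specific limits before returning. */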
constraint_surface_capabilities(pSurfaceCapabilities); return result; } @@ -165,56 +137,14 @@ v3dv_GetPhysicalDeviceSurfaceCapabilities2KHR( const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, VkSurfaceCapabilities2KHR* pSurfaceCapabilities) { - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - VkResult result; - result = wsi_common_get_surface_capabilities2(&device->wsi_device, - pSurfaceInfo, - pSurfaceCapabilities); + result = wsi_GetPhysicalDeviceSurfaceCapabilities2KHR(physicalDevice, + pSurfaceInfo, + pSurfaceCapabilities); constraint_surface_capabilities(&pSurfaceCapabilities->surfaceCapabilities); return result; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceFormatsKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormatKHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats(&device->wsi_device, surface, - pSurfaceFormatCount, pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfaceFormats2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, - uint32_t* pSurfaceFormatCount, - VkSurfaceFormat2KHR* pSurfaceFormats) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo, - pSurfaceFormatCount, pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDeviceSurfacePresentModesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pPresentModeCount, - VkPresentModeKHR* pPresentModes) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_surface_present_modes(&device->wsi_device, surface, - pPresentModeCount, - pPresentModes); -} - VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateSwapchainKHR( VkDevice _device, @@ -225,7 +155,6 @@ v3dv_CreateSwapchainKHR( V3DV_FROM_HANDLE(v3dv_device, device, _device); struct v3dv_instance *instance = device->instance; struct v3dv_physical_device *pdevice = &instance->physicalDevice; - struct wsi_device *wsi_device = &pdevice->wsi_device; ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, pCreateInfo->surface); VkResult result = @@ -233,43 +162,7 @@ v3dv_CreateSwapchainKHR( if (result != VK_SUCCESS) return result; - const VkAllocationCallbacks *alloc; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - return wsi_common_create_swapchain(wsi_device, _device, - pCreateInfo, alloc, pSwapchain); -} - -VKAPI_ATTR void VKAPI_CALL -v3dv_DestroySwapchainKHR( - VkDevice _device, - VkSwapchainKHR swapchain, - const VkAllocationCallbacks* pAllocator) -{ - V3DV_FROM_HANDLE(v3dv_device, device, _device); - const VkAllocationCallbacks *alloc; - - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - wsi_common_destroy_swapchain(_device, swapchain, alloc); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetSwapchainImagesKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint32_t* pSwapchainImageCount, - VkImage* pSwapchainImages) -{ - return wsi_common_get_images(swapchain, - pSwapchainImageCount, - pSwapchainImages); + return wsi_CreateSwapchainKHR(_device, pCreateInfo, pAllocator, pSwapchain); } struct v3dv_image * @@ -290,27 +183,6 @@ v3dv_wsi_get_image_from_swapchain(VkSwapchainKHR swapchain, uint32_t index) return image; } -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_AcquireNextImageKHR( - VkDevice device, - VkSwapchainKHR swapchain, - 
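/* The 1.0-style acquire wrapper being deleted here simply promoted its
 * loose parameters into a VkAcquireNextImageInfoKHR (with deviceMask = 0)
 * and forwarded to v3dv_AcquireNextImage2KHR; that translation is
 * presumably now supplied by the common WSI entry points. */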
uint64_t timeout, - VkSemaphore semaphore, - VkFence fence, - uint32_t* pImageIndex) -{ - VkAcquireNextImageInfoKHR acquire_info = { - .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR, - .swapchain = swapchain, - .timeout = timeout, - .semaphore = semaphore, - .fence = fence, - .deviceMask = 0, - }; - - return v3dv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex); -} - VKAPI_ATTR VkResult VKAPI_CALL v3dv_AcquireNextImage2KHR( VkDevice _device, @@ -336,56 +208,3 @@ v3dv_AcquireNextImage2KHR( return result; } - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_QueuePresentKHR( - VkQueue _queue, - const VkPresentInfoKHR* pPresentInfo) -{ - V3DV_FROM_HANDLE(v3dv_queue, queue, _queue); - struct v3dv_physical_device *pdevice = - &queue->device->instance->physicalDevice; - - return wsi_common_queue_present(&pdevice->wsi_device, - v3dv_device_to_handle(queue->device), - _queue, 0, - pPresentInfo); -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDeviceGroupPresentCapabilitiesKHR( - VkDevice device, - VkDeviceGroupPresentCapabilitiesKHR* pCapabilities) -{ - memset(pCapabilities->presentMask, 0, - sizeof(pCapabilities->presentMask)); - pCapabilities->presentMask[0] = 0x1; - pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetDeviceGroupSurfacePresentModesKHR( - VkDevice device, - VkSurfaceKHR surface, - VkDeviceGroupPresentModeFlagsKHR* pModes) -{ - *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -v3dv_GetPhysicalDevicePresentRectanglesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pRectCount, - VkRect2D* pRects) -{ - V3DV_FROM_HANDLE(v3dv_physical_device, device, physicalDevice); - - return wsi_common_get_present_rectangles(&device->wsi_device, - surface, - pRectCount, pRects); -} diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/mesa 3D driver/src/broadcom/vulkan/v3dvx_cmd_buffer.c index fee9aee8c5..c2f2c77864 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dvx_cmd_buffer.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dvx_cmd_buffer.c @@ -105,11 +105,12 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer, uint32_t layer, uint32_t buffer) { - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; - uint32_t layer_offset = v3dv_layer_offset(image, - iview->base_level, - iview->first_layer + layer); + const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + const struct v3d_resource_slice *slice = + &image->slices[iview->vk.base_mip_level]; + uint32_t layer_offset = + v3dv_layer_offset(image, iview->vk.base_mip_level, + iview->vk.base_array_layer + layer); cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) { load.buffer_to_load = buffer; @@ -127,7 +128,7 @@ cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer, load.height_in_ub_or_stride = slice->stride; } - if (image->samples > VK_SAMPLE_COUNT_1_BIT) + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; else load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; @@ -140,7 +141,7 @@ check_needs_load(const struct v3dv_cmd_buffer_state *state, uint32_t first_subpass_idx, VkAttachmentLoadOp load_op) { - /* We call this with image->aspects & aspect, so 0 means the aspect we are + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are * testing does not exist in the image. 
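/* The common vk_image_view only holds a pointer to the shared vk_image,
 * so driver code downcasts to its own image type; a sketch of the idiom
 * used throughout this patch, valid assuming struct v3dv_image embeds
 * struct vk_image as its first member 'vk':
 */
const struct v3dv_image *image = (const struct v3dv_image *)iview->vk.image;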
*/ if (!aspect) @@ -290,11 +291,12 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, { const struct v3dv_image_view *iview = cmd_buffer->state.framebuffer->attachments[attachment_idx]; - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; + const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + const struct v3d_resource_slice *slice = + &image->slices[iview->vk.base_mip_level]; uint32_t layer_offset = v3dv_layer_offset(image, - iview->base_level, - iview->first_layer + layer); + iview->vk.base_mip_level, + iview->vk.base_array_layer + layer); cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) { store.buffer_to_store = buffer; @@ -313,7 +315,7 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer, store.height_in_ub_or_stride = slice->stride; } - if (image->samples > VK_SAMPLE_COUNT_1_BIT) + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; else if (is_multisample_resolve) store.decimate_mode = V3D_DECIMATE_MODE_4X; @@ -329,7 +331,7 @@ check_needs_clear(const struct v3dv_cmd_buffer_state *state, VkAttachmentLoadOp load_op, bool do_clear_with_draw) { - /* We call this with image->aspects & aspect, so 0 means the aspect we are + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are * testing does not exist in the image. */ if (!aspect) @@ -370,7 +372,7 @@ check_needs_store(const struct v3dv_cmd_buffer_state *state, uint32_t last_subpass_idx, VkAttachmentStoreOp store_op) { - /* We call this with image->aspects & aspect, so 0 means the aspect we are + /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are * testing does not exist in the image. 
*/ if (!aspect) @@ -843,8 +845,9 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer) struct v3dv_image_view *iview = state->framebuffer->attachments[attachment_idx]; - const struct v3dv_image *image = iview->image; - const struct v3d_resource_slice *slice = &image->slices[iview->base_level]; + const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image; + const struct v3d_resource_slice *slice = + &image->slices[iview->vk.base_mip_level]; const uint32_t *clear_color = &state->attachments[attachment_idx].clear_value.color[0]; @@ -2265,13 +2268,13 @@ v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buf const struct v3dv_framebuffer *framebuffer = state->framebuffer; assert(attachment_idx < framebuffer->attachment_count); struct v3dv_image_view *iview = framebuffer->attachments[attachment_idx]; - assert(iview->aspects & VK_IMAGE_ASPECT_COLOR_BIT); + assert(iview->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT); *rt_bpp = iview->internal_bpp; *rt_type = iview->internal_type; - if (vk_format_is_int(iview->vk_format)) + if (vk_format_is_int(iview->vk.format)) *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT; - else if (vk_format_is_srgb(iview->vk_format)) + else if (vk_format_is_srgb(iview->vk.format)) *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM; else *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE; diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dvx_device.c b/mesa 3D driver/src/broadcom/vulkan/v3dvx_device.c index f75b30e547..a48738aec4 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dvx_device.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dvx_device.c @@ -255,10 +255,10 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( const struct v3dv_image_view *att = framebuffer->attachments[att_idx]; assert(att); - if (att->aspects & VK_IMAGE_ASPECT_COLOR_BIT) + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) *max_bpp = MAX2(*max_bpp, att->internal_bpp); - if (att->image->samples > VK_SAMPLE_COUNT_1_BIT) + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } @@ -267,7 +267,7 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( framebuffer->attachments[subpass->ds_attachment.attachment]; assert(att); - if (att->image->samples > VK_SAMPLE_COUNT_1_BIT) + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } @@ -279,10 +279,10 @@ v3dX(framebuffer_compute_internal_bpp_msaa)( const struct v3dv_image_view *att = framebuffer->attachments[i]; assert(att); - if (att->aspects & VK_IMAGE_ASPECT_COLOR_BIT) + if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) *max_bpp = MAX2(*max_bpp, att->internal_bpp); - if (att->image->samples > VK_SAMPLE_COUNT_1_BIT) + if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT) *msaa = true; } diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dvx_image.c b/mesa 3D driver/src/broadcom/vulkan/v3dvx_image.c index fce29171fe..a9aa0fb979 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dvx_image.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dvx_image.c @@ -61,15 +61,15 @@ pack_texture_shader_state_helper(struct v3dv_device *device, bool for_cube_map_array_storage) { assert(!for_cube_map_array_storage || - image_view->type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY); + image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY); const uint32_t index = for_cube_map_array_storage ? 
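/* Render-target clamping above is keyed purely off the view format class:
 * e.g. VK_FORMAT_R32G32B32A32_SINT selects V3D_RENDER_TARGET_CLAMP_INT,
 * VK_FORMAT_R8G8B8A8_SRGB selects V3D_RENDER_TARGET_CLAMP_NORM, and
 * anything else is left at V3D_RENDER_TARGET_CLAMP_NONE. */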
1 : 0; - assert(image_view->image); - const struct v3dv_image *image = image_view->image; + assert(image_view->vk.image); + const struct v3dv_image *image = (struct v3dv_image *) image_view->vk.image; - assert(image->samples == VK_SAMPLE_COUNT_1_BIT || - image->samples == VK_SAMPLE_COUNT_4_BIT); - const uint32_t msaa_scale = image->samples == VK_SAMPLE_COUNT_1_BIT ? 1 : 2; + assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT || + image->vk.samples == VK_SAMPLE_COUNT_4_BIT); + const uint32_t msaa_scale = image->vk.samples == VK_SAMPLE_COUNT_1_BIT ? 1 : 2; v3dvx_pack(image_view->texture_shader_state[index], TEXTURE_SHADER_STATE, tex) { @@ -91,8 +91,9 @@ pack_texture_shader_state_helper(struct v3dv_device *device, tex.extended = true; } - tex.base_level = image_view->base_level; - tex.max_level = image_view->max_level; + tex.base_level = image_view->vk.base_mip_level; + tex.max_level = image_view->vk.base_mip_level + + image_view->vk.level_count - 1; tex.swizzle_r = translate_swizzle(image_view->swizzle[0]); tex.swizzle_g = translate_swizzle(image_view->swizzle[1]); @@ -101,29 +102,29 @@ pack_texture_shader_state_helper(struct v3dv_device *device, tex.texture_type = image_view->format->tex_type; - if (image->type == VK_IMAGE_TYPE_3D) { - tex.image_depth = image->extent.depth; + if (image->vk.image_type == VK_IMAGE_TYPE_3D) { + tex.image_depth = image->vk.extent.depth; } else { - tex.image_depth = (image_view->last_layer - image_view->first_layer) + 1; + tex.image_depth = image_view->vk.layer_count; } /* Empirical testing with CTS shows that when we are sampling from cube * arrays we want to set image depth to layers / 6, but not when doing * image load/store. */ - if (image_view->type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY && + if (image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY && !for_cube_map_array_storage) { assert(tex.image_depth % 6 == 0); tex.image_depth /= 6; } - tex.image_height = image->extent.height * msaa_scale; - tex.image_width = image->extent.width * msaa_scale; + tex.image_height = image->vk.extent.height * msaa_scale; + tex.image_width = image->vk.extent.width * msaa_scale; /* On 4.x, the height of a 1D texture is redefined to be the * upper 14 bits of the width (which is only usable with txf). */ - if (image->type == VK_IMAGE_TYPE_1D) { + if (image->vk.image_type == VK_IMAGE_TYPE_1D) { tex.image_height = tex.image_width >> 14; } tex.image_width &= (1 << 14) - 1; @@ -131,7 +132,7 @@ pack_texture_shader_state_helper(struct v3dv_device *device, tex.array_stride_64_byte_aligned = image->cube_map_stride / 64; - tex.srgb = vk_format_is_srgb(image_view->vk_format); + tex.srgb = vk_format_is_srgb(image_view->vk.format); /* At this point we don't have the job. 
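/* Two worked examples for the packing above. Cube arrays: a
 * VK_IMAGE_VIEW_TYPE_CUBE_ARRAY view with vk.layer_count = 12 is sampled
 * as image_depth = 12 / 6 = 2 cubes, while image load/store keeps all 12
 * layers. 1D textures: the height field is repurposed as the upper width
 * bits, so width = 70000 packs as image_height = 70000 >> 14 = 4 and
 * image_width = 70000 & 0x3fff = 4464 (4 * 16384 + 4464 = 70000). */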
That's the reason the first * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to @@ -140,7 +141,7 @@ pack_texture_shader_state_helper(struct v3dv_device *device, */ const uint32_t base_offset = image->mem->bo->offset + - v3dv_layer_offset(image, 0, image_view->first_layer); + v3dv_layer_offset(image, 0, image_view->vk.base_array_layer); tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset); } } @@ -150,7 +151,7 @@ v3dX(pack_texture_shader_state)(struct v3dv_device *device, struct v3dv_image_view *iview) { pack_texture_shader_state_helper(device, iview, false); - if (iview->type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) + if (iview->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) pack_texture_shader_state_helper(device, iview, true); } diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dvx_meta_common.c b/mesa 3D driver/src/broadcom/vulkan/v3dvx_meta_common.c index c115f9efa7..1dfbbfc57e 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dvx_meta_common.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dvx_meta_common.c @@ -387,7 +387,7 @@ emit_image_load(struct v3dv_device *device, load.height_in_ub_or_stride = slice->stride; } - if (image->samples > VK_SAMPLE_COUNT_1_BIT) + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; else load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0; @@ -448,7 +448,7 @@ emit_image_store(struct v3dv_device *device, store.height_in_ub_or_stride = slice->stride; } - if (image->samples > VK_SAMPLE_COUNT_1_BIT) + if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT) store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES; else if (is_multisample_resolve) store.decimate_mode = V3D_DECIMATE_MODE_4X; @@ -474,11 +474,11 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); /* Load image to TLB */ - assert((image->type != VK_IMAGE_TYPE_3D && + assert((image->vk.image_type != VK_IMAGE_TYPE_3D && layer_offset < region->imageSubresource.layerCount) || - layer_offset < image->extent.depth); + layer_offset < image->vk.extent.depth); - const uint32_t image_layer = image->type != VK_IMAGE_TYPE_3D ? + const uint32_t image_layer = image->vk.image_type != VK_IMAGE_TYPE_3D ? 
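/* Worked example for the compressed-format rounding above: copying a
 * 100x60 texel region of a format with 4x4 blocks (e.g. ETC2) moves
 * DIV_ROUND_UP(100, 4) x DIV_ROUND_UP(60, 4) = 25 x 15 block-sized
 * elements through the TLB. */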
region->imageSubresource.baseArrayLayer + layer_offset : region->imageOffset.z + layer_offset; @@ -505,8 +505,8 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, height = region->bufferImageHeight; /* Handle copy from compressed format */ - width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk_format)); - height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk_format)); + width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format)); + height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format)); /* If we are storing stencil from a combined depth/stencil format the * Vulkan spec states that the output buffer must have packed stencil @@ -522,7 +522,7 @@ emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job, uint32_t format = choose_tlb_format(framebuffer, region->imageSubresource.aspectMask, true, true, false); - bool msaa = image->samples > VK_SAMPLE_COUNT_1_BIT; + bool msaa = image->vk.samples > VK_SAMPLE_COUNT_1_BIT; emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo, buffer_offset, buffer_stride, msaa, format); @@ -582,11 +582,11 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - assert((src->type != VK_IMAGE_TYPE_3D && + assert((src->vk.image_type != VK_IMAGE_TYPE_3D && layer_offset < region->srcSubresource.layerCount) || - layer_offset < src->extent.depth); + layer_offset < src->vk.extent.depth); - const uint32_t src_layer = src->type != VK_IMAGE_TYPE_3D ? + const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ? region->srcSubresource.baseArrayLayer + layer_offset : region->srcOffset.z + layer_offset; @@ -600,11 +600,11 @@ emit_resolve_image_layer_per_tile_list(struct v3dv_job *job, cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - assert((dst->type != VK_IMAGE_TYPE_3D && + assert((dst->vk.image_type != VK_IMAGE_TYPE_3D && layer_offset < region->dstSubresource.layerCount) || - layer_offset < dst->extent.depth); + layer_offset < dst->vk.extent.depth); - const uint32_t dst_layer = dst->type != VK_IMAGE_TYPE_3D ? + const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? region->dstSubresource.baseArrayLayer + layer_offset : region->dstOffset.z + layer_offset; @@ -743,11 +743,11 @@ emit_copy_image_layer_per_tile_list(struct v3dv_job *job, cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); - assert((src->type != VK_IMAGE_TYPE_3D && + assert((src->vk.image_type != VK_IMAGE_TYPE_3D && layer_offset < region->srcSubresource.layerCount) || - layer_offset < src->extent.depth); + layer_offset < src->vk.extent.depth); - const uint32_t src_layer = src->type != VK_IMAGE_TYPE_3D ? + const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ? region->srcSubresource.baseArrayLayer + layer_offset : region->srcOffset.z + layer_offset; @@ -761,11 +761,11 @@ emit_copy_image_layer_per_tile_list(struct v3dv_job *job, cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch); - assert((dst->type != VK_IMAGE_TYPE_3D && + assert((dst->vk.image_type != VK_IMAGE_TYPE_3D && layer_offset < region->dstSubresource.layerCount) || - layer_offset < dst->extent.depth); + layer_offset < dst->vk.extent.depth); - const uint32_t dst_layer = dst->type != VK_IMAGE_TYPE_3D ? + const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ? 
region->dstSubresource.baseArrayLayer + layer_offset : region->dstOffset.z + layer_offset; @@ -815,62 +815,52 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, void v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *dst, - uint32_t dst_mip_level, - uint32_t dst_layer, - struct v3dv_image *src, - uint32_t src_mip_level, - uint32_t src_layer, + uint32_t dst_bo_handle, + uint32_t dst_offset, + enum v3d_tiling_mode dst_tiling, + uint32_t dst_padded_height_or_stride, + uint32_t dst_cpp, + uint32_t src_bo_handle, + uint32_t src_offset, + enum v3d_tiling_mode src_tiling, + uint32_t src_padded_height_or_stride, + uint32_t src_cpp, uint32_t width, uint32_t height, const struct v3dv_format *format) { - const struct v3d_resource_slice *src_slice = &src->slices[src_mip_level]; - const struct v3d_resource_slice *dst_slice = &dst->slices[dst_mip_level]; - - assert(dst->mem && dst->mem->bo); - const struct v3dv_bo *dst_bo = dst->mem->bo; - - assert(src->mem && src->mem->bo); - const struct v3dv_bo *src_bo = src->mem->bo; - struct drm_v3d_submit_tfu tfu = { .ios = (height << 16) | width, .bo_handles = { - dst_bo->handle, - src_bo->handle != dst_bo->handle ? src_bo->handle : 0 + dst_bo_handle, + src_bo_handle != dst_bo_handle ? src_bo_handle : 0 }, }; - const uint32_t src_offset = - src_bo->offset + v3dv_layer_offset(src, src_mip_level, src_layer); tfu.iia |= src_offset; - uint32_t icfg; - if (src_slice->tiling == V3D_TILING_RASTER) { - icfg = V3D_TFU_ICFG_FORMAT_RASTER; + if (src_tiling == V3D_TILING_RASTER) { + tfu.icfg = V3D_TFU_ICFG_FORMAT_RASTER << V3D_TFU_ICFG_FORMAT_SHIFT; } else { - icfg = V3D_TFU_ICFG_FORMAT_LINEARTILE + - (src_slice->tiling - V3D_TILING_LINEARTILE); + tfu.icfg = (V3D_TFU_ICFG_FORMAT_LINEARTILE + + (src_tiling - V3D_TILING_LINEARTILE)) << + V3D_TFU_ICFG_FORMAT_SHIFT; } - tfu.icfg |= icfg << V3D_TFU_ICFG_FORMAT_SHIFT; - - const uint32_t dst_offset = - dst_bo->offset + v3dv_layer_offset(dst, dst_mip_level, dst_layer); - tfu.ioa |= dst_offset; - - tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + - (dst_slice->tiling - V3D_TILING_LINEARTILE)) << - V3D_TFU_IOA_FORMAT_SHIFT; tfu.icfg |= format->tex_type << V3D_TFU_ICFG_TTYPE_SHIFT; - switch (src_slice->tiling) { + tfu.ioa = dst_offset; + + tfu.ioa |= (V3D_TFU_IOA_FORMAT_LINEARTILE + + (dst_tiling - V3D_TILING_LINEARTILE)) << + V3D_TFU_IOA_FORMAT_SHIFT; + + switch (src_tiling) { case V3D_TILING_UIF_NO_XOR: case V3D_TILING_UIF_XOR: - tfu.iis |= src_slice->padded_height / (2 * v3d_utile_height(src->cpp)); + tfu.iis |= src_padded_height_or_stride / (2 * v3d_utile_height(src_cpp)); break; case V3D_TILING_RASTER: - tfu.iis |= src_slice->stride / src->cpp; + tfu.iis |= src_padded_height_or_stride / src_cpp; break; default: break; @@ -880,12 +870,11 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, * OPAD field for the destination (how many extra UIF blocks beyond * those necessary to cover the height). 
*/ - if (dst_slice->tiling == V3D_TILING_UIF_NO_XOR || - dst_slice->tiling == V3D_TILING_UIF_XOR) { - uint32_t uif_block_h = 2 * v3d_utile_height(dst->cpp); + if (dst_tiling == V3D_TILING_UIF_NO_XOR || dst_tiling == V3D_TILING_UIF_XOR) { + uint32_t uif_block_h = 2 * v3d_utile_height(dst_cpp); uint32_t implicit_padded_height = align(height, uif_block_h); - uint32_t icfg = - (dst_slice->padded_height - implicit_padded_height) / uif_block_h; + uint32_t icfg = (dst_padded_height_or_stride - implicit_padded_height) / + uif_block_h; tfu.icfg |= icfg << V3D_TFU_ICFG_OPAD_SHIFT; } @@ -1053,8 +1042,8 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords); const VkImageSubresourceLayers *imgrsc = ®ion->imageSubresource; - assert((image->type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) || - layer < image->extent.depth); + assert((image->vk.image_type != VK_IMAGE_TYPE_3D && layer < imgrsc->layerCount) || + layer < image->vk.extent.depth); /* Load TLB from buffer */ uint32_t width, height; @@ -1069,8 +1058,8 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, height = region->bufferImageHeight; /* Handle copy to compressed format using a compatible format */ - width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk_format)); - height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk_format)); + width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format)); + height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format)); uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ? 1 : image->cpp; @@ -1081,6 +1070,9 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask, false, false, true); + uint32_t image_layer = layer + (image->vk.image_type != VK_IMAGE_TYPE_3D ? 
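/* Worked example for the UIF OPAD computation above, assuming a 32bpp
 * destination with 4x4-texel utiles: uif_block_h = 2 * v3d_utile_height(4)
 * = 8 rows, so height = 100 gives implicit_padded_height = align(100, 8)
 * = 104; a slice allocated with a padded height of 120 then stores
 * icfg = (120 - 104) / 8 = 2 extra UIF blocks in the OPAD field. */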
+ imgrsc->baseArrayLayer : region->imageOffset.z); + emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo, buffer_offset, buffer_stride, format); @@ -1100,13 +1092,13 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { emit_image_load(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false); } else { assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_load(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false); } } @@ -1117,20 +1109,20 @@ emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job, /* Store TLB to image */ emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, true, false); if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { emit_image_store(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_STENCIL_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false, false); } else { assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT); emit_image_store(job->device, cl, framebuffer, image, VK_IMAGE_ASPECT_DEPTH_BIT, - imgrsc->baseArrayLayer + layer, imgrsc->mipLevel, + image_layer, imgrsc->mipLevel, false, false, false); } } diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dvx_pipeline.c b/mesa 3D driver/src/broadcom/vulkan/v3dvx_pipeline.c index 0400231177..8623a45370 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dvx_pipeline.c +++ b/mesa 3D driver/src/broadcom/vulkan/v3dvx_pipeline.c @@ -599,7 +599,8 @@ pack_shader_state_attribute_record(struct v3dv_pipeline *pipeline, void v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, - const VkPipelineVertexInputStateCreateInfo *vi_info) + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info) { pack_shader_state_record(pipeline); pack_vcm_cache_size(pipeline); @@ -613,6 +614,15 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, pipeline->vb[desc->binding].instance_divisor = desc->inputRate; } + if (vd_info) { + for (uint32_t i = 0; i < vd_info->vertexBindingDivisorCount; i++) { + const VkVertexInputBindingDivisorDescriptionEXT *desc = + &vd_info->pVertexBindingDivisors[i]; + + pipeline->vb[desc->binding].instance_divisor = desc->divisor; + } + } + pipeline->va_count = 0; struct v3d_vs_prog_data *prog_data_vs = pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]->prog_data.vs; diff --git a/mesa 3D driver/src/broadcom/vulkan/v3dvx_private.h b/mesa 3D driver/src/broadcom/vulkan/v3dvx_private.h index 18e7a1af86..86add82c06 100644 --- a/mesa 3D driver/src/broadcom/vulkan/v3dvx_private.h +++ b/mesa 3D driver/src/broadcom/vulkan/v3dvx_private.h @@ -227,12 +227,16 @@ v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job, void v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer, - struct v3dv_image *dst, - uint32_t dst_mip_level, - uint32_t dst_layer, - struct v3dv_image *src, - uint32_t src_mip_level, - uint32_t src_layer, + uint32_t dst_bo_handle, + uint32_t dst_offset, + enum v3d_tiling_mode dst_tiling, + uint32_t dst_padded_height_or_stride, + uint32_t dst_cpp, + uint32_t src_bo_handle, + uint32_t src_offset, + enum 
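/* Application-side sketch of what the new vd_info path consumes; the
 * structures are core VK_EXT_vertex_attribute_divisor API, values
 * illustrative:
 */
const VkVertexInputBindingDivisorDescriptionEXT divisor = {
   .binding = 1, /* instance-rate vertex binding */
   .divisor = 4, /* advance binding 1 once every 4 instances */
};
const VkPipelineVertexInputDivisorStateCreateInfoEXT divisor_state = {
   .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT,
   .vertexBindingDivisorCount = 1,
   .pVertexBindingDivisors = &divisor,
};
/* divisor_state is then chained into
 * VkPipelineVertexInputStateCreateInfo::pNext. */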
v3d_tiling_mode src_tiling, + uint32_t src_padded_height_or_stride, + uint32_t src_cpp, uint32_t width, uint32_t height, const struct v3dv_format *format); @@ -298,8 +302,8 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline, const VkPipelineMultisampleStateCreateInfo *ms_info); void v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline, - const VkPipelineVertexInputStateCreateInfo *vi_info); - + const VkPipelineVertexInputStateCreateInfo *vi_info, + const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info); /* Used at v3dv_queue */ void v3dX(job_emit_noop)(struct v3dv_job *job); diff --git a/mesa 3D driver/src/compiler/builtin_type_macros.h b/mesa 3D driver/src/compiler/builtin_type_macros.h index 97b18d6f32..630600e09f 100644 --- a/mesa 3D driver/src/compiler/builtin_type_macros.h +++ b/mesa 3D driver/src/compiler/builtin_type_macros.h @@ -133,6 +133,50 @@ DECL_TYPE(sampler2DRectShadow, GL_SAMPLER_2D_RECT_SHADOW, GLSL_TYPE_SA DECL_TYPE(samplerExternalOES, GL_SAMPLER_EXTERNAL_OES, GLSL_TYPE_SAMPLER, GLSL_SAMPLER_DIM_EXTERNAL, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(texture1D, GL_SAMPLER_1D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(texture2D, GL_SAMPLER_2D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(texture3D, GL_SAMPLER_3D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_3D, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(textureCube, GL_SAMPLER_CUBE, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_CUBE, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(texture1DArray, GL_SAMPLER_1D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 1, GLSL_TYPE_FLOAT) +DECL_TYPE(texture2DArray, GL_SAMPLER_2D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 1, GLSL_TYPE_FLOAT) +DECL_TYPE(textureCubeArray, GL_SAMPLER_CUBE_MAP_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_CUBE, 0, 1, GLSL_TYPE_FLOAT) +DECL_TYPE(texture2DRect, GL_SAMPLER_2D_RECT, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_RECT, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(textureBuffer, GL_SAMPLER_BUFFER, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_BUF, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(texture2DMS, GL_SAMPLER_2D_MULTISAMPLE, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_MS, 0, 0, GLSL_TYPE_FLOAT) +DECL_TYPE(texture2DMSArray, GL_SAMPLER_2D_MULTISAMPLE_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_MS, 0, 1, GLSL_TYPE_FLOAT) + +DECL_TYPE(itexture1D, GL_INT_SAMPLER_1D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 0, GLSL_TYPE_INT) +DECL_TYPE(itexture2D, GL_INT_SAMPLER_2D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 0, GLSL_TYPE_INT) +DECL_TYPE(itexture3D, GL_INT_SAMPLER_3D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_3D, 0, 0, GLSL_TYPE_INT) +DECL_TYPE(itextureCube, GL_INT_SAMPLER_CUBE, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_CUBE, 0, 0, GLSL_TYPE_INT) +DECL_TYPE(itexture1DArray, GL_INT_SAMPLER_1D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 1, GLSL_TYPE_INT) +DECL_TYPE(itexture2DArray, GL_INT_SAMPLER_2D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 1, GLSL_TYPE_INT) +DECL_TYPE(itextureCubeArray, GL_INT_SAMPLER_CUBE_MAP_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_CUBE, 0, 1, GLSL_TYPE_INT) +DECL_TYPE(itexture2DRect, GL_INT_SAMPLER_2D_RECT, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_RECT, 0, 0, GLSL_TYPE_INT) +DECL_TYPE(itextureBuffer, GL_INT_SAMPLER_BUFFER, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_BUF, 0, 0, GLSL_TYPE_INT) +DECL_TYPE(itexture2DMS, GL_INT_SAMPLER_2D_MULTISAMPLE, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_MS, 0, 0, GLSL_TYPE_INT) +DECL_TYPE(itexture2DMSArray, GL_INT_SAMPLER_2D_MULTISAMPLE_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_MS, 0, 1, GLSL_TYPE_INT) + 
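/* DECL_TYPE column legend for the texture types added here, matching the
 * existing sampler entries: GLSL name, GL type enum, base type, sampler
 * dimensionality, shadow (0/1), array (0/1), sampled component type. */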
+DECL_TYPE(utexture1D, GL_UNSIGNED_INT_SAMPLER_1D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 0, GLSL_TYPE_UINT) +DECL_TYPE(utexture2D, GL_UNSIGNED_INT_SAMPLER_2D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 0, GLSL_TYPE_UINT) +DECL_TYPE(utexture3D, GL_UNSIGNED_INT_SAMPLER_3D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_3D, 0, 0, GLSL_TYPE_UINT) +DECL_TYPE(utextureCube, GL_UNSIGNED_INT_SAMPLER_CUBE, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_CUBE, 0, 0, GLSL_TYPE_UINT) +DECL_TYPE(utexture1DArray, GL_UNSIGNED_INT_SAMPLER_1D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 1, GLSL_TYPE_UINT) +DECL_TYPE(utexture2DArray, GL_UNSIGNED_INT_SAMPLER_2D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 1, GLSL_TYPE_UINT) +DECL_TYPE(utextureCubeArray, GL_UNSIGNED_INT_SAMPLER_CUBE_MAP_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_CUBE, 0, 1, GLSL_TYPE_UINT) +DECL_TYPE(utexture2DRect, GL_UNSIGNED_INT_SAMPLER_2D_RECT, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_RECT, 0, 0, GLSL_TYPE_UINT) +DECL_TYPE(utextureBuffer, GL_UNSIGNED_INT_SAMPLER_BUFFER, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_BUF, 0, 0, GLSL_TYPE_UINT) +DECL_TYPE(utexture2DMS, GL_UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_MS, 0, 0, GLSL_TYPE_UINT) +DECL_TYPE(utexture2DMSArray, GL_UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_MS, 0, 1, GLSL_TYPE_UINT) + +/* OpenCL image types */ +DECL_TYPE(vtexture1D, GL_SAMPLER_1D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 0, GLSL_TYPE_VOID) +DECL_TYPE(vtexture2D, GL_SAMPLER_2D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 0, GLSL_TYPE_VOID) +DECL_TYPE(vtexture3D, GL_SAMPLER_3D, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_3D, 0, 0, GLSL_TYPE_VOID) +DECL_TYPE(vtexture1DArray, GL_SAMPLER_1D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_1D, 0, 1, GLSL_TYPE_VOID) +DECL_TYPE(vtexture2DArray, GL_SAMPLER_2D_ARRAY, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_2D, 0, 1, GLSL_TYPE_VOID) +DECL_TYPE(vtextureBuffer, GL_SAMPLER_BUFFER, GLSL_TYPE_TEXTURE, GLSL_SAMPLER_DIM_BUF, 0, 0, GLSL_TYPE_VOID) + DECL_TYPE(image1D, GL_IMAGE_1D, GLSL_TYPE_IMAGE, GLSL_SAMPLER_DIM_1D, 0, 0, GLSL_TYPE_FLOAT) DECL_TYPE(image2D, GL_IMAGE_2D, GLSL_TYPE_IMAGE, GLSL_SAMPLER_DIM_2D, 0, 0, GLSL_TYPE_FLOAT) DECL_TYPE(image3D, GL_IMAGE_3D, GLSL_TYPE_IMAGE, GLSL_SAMPLER_DIM_3D, 0, 0, GLSL_TYPE_FLOAT) diff --git a/mesa 3D driver/src/compiler/clc/clc.c b/mesa 3D driver/src/compiler/clc/clc.c new file mode 100644 index 0000000000..3d4e9ca857 --- /dev/null +++ b/mesa 3D driver/src/compiler/clc/clc.c @@ -0,0 +1,314 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir/nir.h"
+#include "nir/nir_serialize.h"
+#include "glsl_types.h"
+#include "nir_types.h"
+#include "clc.h"
+#include "clc_helpers.h"
+#include "spirv/nir_spirv.h"
+#include "util/u_debug.h"
+
+#include <stdlib.h>
+
+enum clc_debug_flags {
+   CLC_DEBUG_DUMP_SPIRV = 1 << 0,
+   CLC_DEBUG_VERBOSE = 1 << 1,
+};
+
+static const struct debug_named_value clc_debug_options[] = {
+   /* Enabled from the environment, e.g. CLC_DEBUG=dump_spirv,verbose */
+   { "dump_spirv", CLC_DEBUG_DUMP_SPIRV, "Dump spirv blobs" },
+   { "verbose", CLC_DEBUG_VERBOSE, NULL },
+   DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(debug_clc, "CLC_DEBUG", clc_debug_options, 0)
+
+static void
+clc_print_kernels_info(const struct clc_parsed_spirv *obj)
+{
+   fprintf(stdout, "Kernels:\n");
+   for (unsigned i = 0; i < obj->num_kernels; i++) {
+      const struct clc_kernel_arg *args = obj->kernels[i].args;
+      bool first = true;
+
+      fprintf(stdout, "\tvoid %s(", obj->kernels[i].name);
+      for (unsigned j = 0; j < obj->kernels[i].num_args; j++) {
+         if (!first)
+            fprintf(stdout, ", ");
+         else
+            first = false;
+
+         switch (args[j].address_qualifier) {
+         case CLC_KERNEL_ARG_ADDRESS_GLOBAL:
+            fprintf(stdout, "__global ");
+            break;
+         case CLC_KERNEL_ARG_ADDRESS_LOCAL:
+            fprintf(stdout, "__local ");
+            break;
+         case CLC_KERNEL_ARG_ADDRESS_CONSTANT:
+            fprintf(stdout, "__constant ");
+            break;
+         default:
+            break;
+         }
+
+         if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_VOLATILE)
+            fprintf(stdout, "volatile ");
+         if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_CONST)
+            fprintf(stdout, "const ");
+         if (args[j].type_qualifier & CLC_KERNEL_ARG_TYPE_RESTRICT)
+            fprintf(stdout, "restrict ");
+
+         fprintf(stdout, "%s %s", args[j].type_name, args[j].name);
+      }
+      fprintf(stdout, ");\n");
+   }
+}
+
+static void
+clc_libclc_optimize(nir_shader *s)
+{
+   bool progress;
+   do {
+      progress = false;
+      NIR_PASS(progress, s, nir_split_var_copies);
+      NIR_PASS(progress, s, nir_opt_copy_prop_vars);
+      NIR_PASS(progress, s, nir_lower_var_copies);
+      NIR_PASS(progress, s, nir_lower_vars_to_ssa);
+      NIR_PASS(progress, s, nir_copy_prop);
+      NIR_PASS(progress, s, nir_opt_remove_phis);
+      NIR_PASS(progress, s, nir_opt_dce);
+      NIR_PASS(progress, s, nir_opt_if, true);
+      NIR_PASS(progress, s, nir_opt_dead_cf);
+      NIR_PASS(progress, s, nir_opt_cse);
+      NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
+      NIR_PASS(progress, s, nir_opt_algebraic);
+      NIR_PASS(progress, s, nir_opt_constant_folding);
+      NIR_PASS(progress, s, nir_opt_undef);
+      NIR_PASS(progress, s, nir_lower_undef_to_zero);
+      NIR_PASS(progress, s, nir_opt_deref);
+   } while (progress);
+}
+
+struct clc_libclc {
+   const nir_shader *libclc_nir;
+};
+
+struct clc_libclc *
+clc_libclc_new(const struct clc_logger *logger, const struct clc_libclc_options *options)
+{
+   struct clc_libclc *ctx = rzalloc(NULL, struct clc_libclc);
+   if (!ctx) {
+      clc_error(logger, "D3D12: failed to allocate a clc_libclc");
+      return NULL;
+   }
+
+   const struct spirv_to_nir_options libclc_spirv_options = {
+      .environment = NIR_SPIRV_OPENCL,
+      .create_library = true,
+      .constant_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .global_addr_format = nir_address_format_32bit_index_offset_pack64,
+      .shared_addr_format = nir_address_format_32bit_offset_as_64bit,
+      .temp_addr_format = nir_address_format_32bit_offset_as_64bit,
.float_controls_execution_mode = FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32, + .caps = { + .address = true, + .float64 = true, + .int8 = true, + .int16 = true, + .int64 = true, + .kernel = true, + }, + }; + + glsl_type_singleton_init_or_ref(); + nir_shader *s = nir_load_libclc_shader(64, NULL, &libclc_spirv_options, options->nir_options); + if (!s) { + clc_error(logger, "D3D12: spirv_to_nir failed on libclc blob"); + ralloc_free(ctx); + return NULL; + } + + if (options && options->optimize) + clc_libclc_optimize(s); + + ralloc_steal(ctx, s); + ctx->libclc_nir = s; + + return ctx; +} + +void clc_free_libclc(struct clc_libclc *ctx) +{ + ralloc_free(ctx); + glsl_type_singleton_decref(); +} + +const nir_shader *clc_libclc_get_clc_shader(struct clc_libclc *ctx) +{ + return ctx->libclc_nir; +} + +void clc_libclc_serialize(struct clc_libclc *context, + void **serialized, + size_t *serialized_size) +{ + struct blob tmp; + blob_init(&tmp); + nir_serialize(&tmp, context->libclc_nir, true); + + blob_finish_get_buffer(&tmp, serialized, serialized_size); +} + +void clc_libclc_free_serialized(void *serialized) +{ + free(serialized); +} + +struct clc_libclc * +clc_libclc_deserialize(const void *serialized, size_t serialized_size) +{ + struct clc_libclc *ctx = rzalloc(NULL, struct clc_libclc); + if (!ctx) { + return NULL; + } + + glsl_type_singleton_init_or_ref(); + + struct blob_reader tmp; + blob_reader_init(&tmp, serialized, serialized_size); + + nir_shader *s = nir_deserialize(NULL, NULL, &tmp); + if (!s) { + ralloc_free(ctx); + return NULL; + } + + ralloc_steal(ctx, s); + ctx->libclc_nir = s; + + return ctx; +} + +bool +clc_compile_c_to_spir(const struct clc_compile_args *args, + const struct clc_logger *logger, + struct clc_binary *out_spir) +{ + return clc_c_to_spir(args, logger, out_spir) >= 0; +} + +void +clc_free_spir(struct clc_binary *spir) +{ + clc_free_spir_binary(spir); +} + +bool +clc_compile_spir_to_spirv(const struct clc_binary *in_spir, + const struct clc_logger *logger, + struct clc_binary *out_spirv) +{ + if (clc_spir_to_spirv(in_spir, logger, out_spirv) < 0) + return false; + + if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV) + clc_dump_spirv(out_spirv, stdout); + + return true; +} + +void +clc_free_spirv(struct clc_binary *spirv) +{ + clc_free_spirv_binary(spirv); +} + +bool +clc_compile_c_to_spirv(const struct clc_compile_args *args, + const struct clc_logger *logger, + struct clc_binary *out_spirv) +{ + if (clc_c_to_spirv(args, logger, out_spirv) < 0) + return false; + + if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV) + clc_dump_spirv(out_spirv, stdout); + + return true; +} + +bool +clc_link_spirv(const struct clc_linker_args *args, + const struct clc_logger *logger, + struct clc_binary *out_spirv) +{ + if (clc_link_spirv_binaries(args, logger, out_spirv) < 0) + return false; + + if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV) + clc_dump_spirv(out_spirv, stdout); + + return true; +} + +bool +clc_parse_spirv(const struct clc_binary *in_spirv, + const struct clc_logger *logger, + struct clc_parsed_spirv *out_data) +{ + if (!clc_spirv_get_kernels_info(in_spirv, + &out_data->kernels, + &out_data->num_kernels, + &out_data->spec_constants, + &out_data->num_spec_constants, + logger)) + return false; + + if (debug_get_option_debug_clc() & CLC_DEBUG_VERBOSE) + clc_print_kernels_info(out_data); + + return true; +} + +void clc_free_parsed_spirv(struct clc_parsed_spirv *data) +{ + clc_free_kernels_info(data->kernels, data->num_kernels); +} + +bool 
+clc_specialize_spirv(const struct clc_binary *in_spirv,
+                     const struct clc_parsed_spirv *parsed_data,
+                     const struct clc_spirv_specialization_consts *consts,
+                     struct clc_binary *out_spirv)
+{
+   if (!clc_spirv_specialize(in_spirv, parsed_data, consts, out_spirv))
+      return false;
+
+   if (debug_get_option_debug_clc() & CLC_DEBUG_DUMP_SPIRV)
+      clc_dump_spirv(out_spirv, stdout);
+
+   return true;
+}
diff --git a/mesa 3D driver/src/compiler/clc/clc.h b/mesa 3D driver/src/compiler/clc/clc.h
new file mode 100644
index 0000000000..68366340ce
--- /dev/null
+++ b/mesa 3D driver/src/compiler/clc/clc.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef MESA_CLC_H
+#define MESA_CLC_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct nir_shader nir_shader;
+struct nir_shader_compiler_options;
+
+struct clc_named_value {
+   const char *name;
+   const char *value;
+};
+
+enum clc_spirv_version {
+   CLC_SPIRV_VERSION_MAX = 0,
+   CLC_SPIRV_VERSION_1_0,
+   CLC_SPIRV_VERSION_1_1,
+   CLC_SPIRV_VERSION_1_2,
+   CLC_SPIRV_VERSION_1_3,
+   CLC_SPIRV_VERSION_1_4,
+};
+
+struct clc_compile_args {
+   const struct clc_named_value *headers;
+   unsigned num_headers;
+   struct clc_named_value source;
+   const char * const *args;
+   unsigned num_args;
+
+   /* SPIRV version to target. */
+   enum clc_spirv_version spirv_version;
+
+   /* SPIR-V extensions the OpenCL-to-SPIR-V translation is allowed to
+    * enable. A NULL-terminated array of strings; any extension is allowed
+    * if NULL.
+ */ + const char * const *allowed_spirv_extensions; +}; + +struct clc_binary { + void *data; + size_t size; +}; + +struct clc_linker_args { + const struct clc_binary * const *in_objs; + unsigned num_in_objs; + unsigned create_library; +}; + +typedef void (*clc_msg_callback)(void *priv, const char *msg); + +struct clc_logger { + void *priv; + clc_msg_callback error; + clc_msg_callback warning; +}; + +enum clc_kernel_arg_type_qualifier { + CLC_KERNEL_ARG_TYPE_CONST = 1 << 0, + CLC_KERNEL_ARG_TYPE_RESTRICT = 1 << 1, + CLC_KERNEL_ARG_TYPE_VOLATILE = 1 << 2, +}; + +enum clc_kernel_arg_access_qualifier { + CLC_KERNEL_ARG_ACCESS_READ = 1 << 0, + CLC_KERNEL_ARG_ACCESS_WRITE = 1 << 1, +}; + +enum clc_kernel_arg_address_qualifier { + CLC_KERNEL_ARG_ADDRESS_PRIVATE, + CLC_KERNEL_ARG_ADDRESS_CONSTANT, + CLC_KERNEL_ARG_ADDRESS_LOCAL, + CLC_KERNEL_ARG_ADDRESS_GLOBAL, +}; + +struct clc_kernel_arg { + const char *name; + const char *type_name; + unsigned type_qualifier; + unsigned access_qualifier; + enum clc_kernel_arg_address_qualifier address_qualifier; +}; + +enum clc_vec_hint_type { + CLC_VEC_HINT_TYPE_CHAR = 0, + CLC_VEC_HINT_TYPE_SHORT = 1, + CLC_VEC_HINT_TYPE_INT = 2, + CLC_VEC_HINT_TYPE_LONG = 3, + CLC_VEC_HINT_TYPE_HALF = 4, + CLC_VEC_HINT_TYPE_FLOAT = 5, + CLC_VEC_HINT_TYPE_DOUBLE = 6 +}; + +struct clc_kernel_info { + const char *name; + size_t num_args; + const struct clc_kernel_arg *args; + + unsigned vec_hint_size; + enum clc_vec_hint_type vec_hint_type; +}; + +enum clc_spec_constant_type { + CLC_SPEC_CONSTANT_UNKNOWN, + CLC_SPEC_CONSTANT_BOOL, + CLC_SPEC_CONSTANT_FLOAT, + CLC_SPEC_CONSTANT_DOUBLE, + CLC_SPEC_CONSTANT_INT8, + CLC_SPEC_CONSTANT_UINT8, + CLC_SPEC_CONSTANT_INT16, + CLC_SPEC_CONSTANT_UINT16, + CLC_SPEC_CONSTANT_INT32, + CLC_SPEC_CONSTANT_UINT32, + CLC_SPEC_CONSTANT_INT64, + CLC_SPEC_CONSTANT_UINT64, +}; + +struct clc_parsed_spec_constant { + uint32_t id; + enum clc_spec_constant_type type; +}; + +struct clc_parsed_spirv { + const struct clc_kernel_info *kernels; + unsigned num_kernels; + + const struct clc_parsed_spec_constant *spec_constants; + unsigned num_spec_constants; +}; + +struct clc_libclc; + +struct clc_libclc_options { + unsigned optimize; + const struct nir_shader_compiler_options *nir_options; +}; + +struct clc_libclc *clc_libclc_new(const struct clc_logger *logger, const struct clc_libclc_options *options); + +void clc_free_libclc(struct clc_libclc *lib); + +const nir_shader *clc_libclc_get_clc_shader(struct clc_libclc *lib); + +void clc_libclc_serialize(struct clc_libclc *lib, void **serialized, size_t *size); +void clc_libclc_free_serialized(void *serialized); +struct clc_libclc *clc_libclc_deserialize(const void *serialized, size_t size); + +bool +clc_compile_c_to_spir(const struct clc_compile_args *args, + const struct clc_logger *logger, + struct clc_binary *out_spir); + +void +clc_free_spir(struct clc_binary *spir); + +bool +clc_compile_spir_to_spirv(const struct clc_binary *in_spir, + const struct clc_logger *logger, + struct clc_binary *out_spirv); + +void +clc_free_spirv(struct clc_binary *spirv); + +bool +clc_compile_c_to_spirv(const struct clc_compile_args *args, + const struct clc_logger *logger, + struct clc_binary *out_spirv); + +bool +clc_link_spirv(const struct clc_linker_args *args, + const struct clc_logger *logger, + struct clc_binary *out_spirv); + +bool +clc_parse_spirv(const struct clc_binary *in_spirv, + const struct clc_logger *logger, + struct clc_parsed_spirv *out_data); + +void +clc_free_parsed_spirv(struct clc_parsed_spirv *data); + 
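+/* Illustrative usage sketch (not part of the API itself): a typical caller
+ * chains the entry points above roughly as follows, checking every return
+ * value and freeing each intermediate binary. compile_args and logger are
+ * caller-provided (hypothetical names):
+ *
+ *    struct clc_binary obj, lib;
+ *    struct clc_parsed_spirv info;
+ *    if (clc_compile_c_to_spirv(&compile_args, &logger, &obj)) {
+ *       const struct clc_binary *objs[] = { &obj };
+ *       struct clc_linker_args link_args = {
+ *          .in_objs = objs, .num_in_objs = 1, .create_library = 0,
+ *       };
+ *       if (clc_link_spirv(&link_args, &logger, &lib)) {
+ *          if (clc_parse_spirv(&lib, &logger, &info))
+ *             clc_free_parsed_spirv(&info);
+ *          clc_free_spirv(&lib);
+ *       }
+ *       clc_free_spirv(&obj);
+ *    }
+ */
+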
+typedef union {
+   bool b;
+   float f32;
+   double f64;
+   int8_t i8;
+   uint8_t u8;
+   int16_t i16;
+   uint16_t u16;
+   int32_t i32;
+   uint32_t u32;
+   int64_t i64;
+   uint64_t u64;
+} clc_spirv_const_value;
+
+struct clc_spirv_specialization {
+   uint32_t id;
+   clc_spirv_const_value value;
+   bool defined_on_module;
+};
+
+struct clc_spirv_specialization_consts {
+   const struct clc_spirv_specialization *specializations;
+   unsigned num_specializations;
+};
+
+bool
+clc_specialize_spirv(const struct clc_binary *in_spirv,
+                     const struct clc_parsed_spirv *parsed_data,
+                     const struct clc_spirv_specialization_consts *consts,
+                     struct clc_binary *out_spirv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MESA_CLC_H */
diff --git a/mesa 3D driver/src/compiler/clc/clc_helpers.cpp b/mesa 3D driver/src/compiler/clc/clc_helpers.cpp
new file mode 100644
index 0000000000..c5b0f29eaa
--- /dev/null
+++ b/mesa 3D driver/src/compiler/clc/clc_helpers.cpp
@@ -0,0 +1,1135 @@
+//
+// Copyright 2012-2016 Francisco Jerez
+// Copyright 2012-2016 Advanced Micro Devices, Inc.
+// Copyright 2014-2016 Jan Vesely
+// Copyright 2014-2015 Serge Martin
+// Copyright 2015 Zoltan Gilian
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <sstream>
+
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/Config/llvm-config.h>
+#include <llvm/IR/DiagnosticPrinter.h>
+#include <llvm/IR/DiagnosticInfo.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Bitcode/BitcodeWriter.h>
+#include <llvm/Bitcode/BitcodeReader.h>
+#include <llvm-c/Target.h>
+#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
+
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Basic/TargetInfo.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/TextDiagnosticBuffer.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Lex/PreprocessorOptions.h>
+
+#include <spirv-tools/libspirv.hpp>
+#include <spirv-tools/linker.hpp>
+#include <spirv-tools/optimizer.hpp>
+
+#include "util/macros.h"
+#include "glsl_types.h"
+
+#include "spirv.h"
+
+#ifdef USE_STATIC_OPENCL_C_H
+#include "opencl-c.h.h"
+#include "opencl-c-base.h.h"
+#endif
+
+#include "clc_helpers.h"
+
+/* Use the highest version of SPIRV supported by SPIRV-Tools. */
+constexpr spv_target_env spirv_target = SPV_ENV_UNIVERSAL_1_5;
+
+constexpr SPIRV::VersionNumber invalid_spirv_trans_version = static_cast<SPIRV::VersionNumber>(0);
+
+using ::llvm::Function;
+using ::llvm::LLVMContext;
+using ::llvm::Module;
+using ::llvm::raw_string_ostream;
+
+static void
+llvm_log_handler(const ::llvm::DiagnosticInfo &di, void *data) {
+   raw_string_ostream os { *reinterpret_cast<std::string *>(data) };
+   ::llvm::DiagnosticPrinterRawOStream printer { os };
+   di.print(printer);
+}
+
+class SPIRVKernelArg {
+public:
+   SPIRVKernelArg(uint32_t id, uint32_t typeId) : id(id), typeId(typeId),
+                                                  addrQualifier(CLC_KERNEL_ARG_ADDRESS_PRIVATE),
+                                                  accessQualifier(0),
+                                                  typeQualifier(0) { }
+   ~SPIRVKernelArg() { }
+
+   uint32_t id;
+   uint32_t typeId;
+   std::string name;
+   std::string typeName;
+   enum clc_kernel_arg_address_qualifier addrQualifier;
+   unsigned accessQualifier;
+   unsigned typeQualifier;
+};
+
+class SPIRVKernelInfo {
+public:
+   SPIRVKernelInfo(uint32_t fid, const char *nm) : funcId(fid), name(nm), vecHint(0) { }
+   ~SPIRVKernelInfo() { }
+
+   uint32_t funcId;
+   std::string name;
+   std::vector<SPIRVKernelArg> args;
+   unsigned vecHint;
+};
+
+class SPIRVKernelParser {
+public:
+   SPIRVKernelParser() : curKernel(NULL)
+   {
+      ctx = spvContextCreate(spirv_target);
+   }
+
+   ~SPIRVKernelParser()
+   {
+      spvContextDestroy(ctx);
+   }
+
+   void parseEntryPoint(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands >= 3);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
+      for (auto &iter : kernels) {
+         if (funcId == iter.funcId)
+            return;
+      }
+
+      op = &ins->operands[2];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      const char *name = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      kernels.push_back(SPIRVKernelInfo(funcId, name));
+   }
+
+   void parseFunction(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands == 4);
+
+      const spv_parsed_operand_t *op = &ins->operands[1];
+
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+
+      uint32_t funcId = ins->words[op->offset];
+
+      for (auto &kernel : kernels) {
+         if (funcId == kernel.funcId && !kernel.args.size()) {
+            curKernel = &kernel;
+            return;
+         }
+      }
+   }
+
+   void parseFunctionParam(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t id, typeId;
+
+      if (!curKernel)
+         return;
+
+      assert(ins->num_operands == 2);
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_TYPE_ID);
+      typeId = ins->words[op->offset];
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      id = ins->words[op->offset];
+      curKernel->args.push_back(SPIRVKernelArg(id, typeId));
+   }
+
+   void parseName(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      const char *name;
+      uint32_t id;
+
+      assert(ins->num_operands == 2);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      id = ins->words[op->offset];
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      name = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.id == id && arg.name.empty()) {
+               arg.name = name;
+               break;
+            }
+         }
+      }
+   }
+
+   void parseTypePointer(const spv_parsed_instruction_t *ins)
+   {
+      enum clc_kernel_arg_address_qualifier addrQualifier;
+      uint32_t typeId, storageClass;
+      const spv_parsed_operand_t *op;
+
+      assert(ins->num_operands == 3);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
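+      /* The switch below maps SPIR-V storage classes onto OpenCL address
+       * space qualifiers: CrossWorkgroup -> __global, Workgroup -> __local,
+       * UniformConstant -> __constant; anything else is treated as the
+       * default, private. */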
+      typeId = ins->words[op->offset];
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_STORAGE_CLASS);
+      storageClass = ins->words[op->offset];
+      switch (storageClass) {
+      case SpvStorageClassCrossWorkgroup:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL;
+         break;
+      case SpvStorageClassWorkgroup:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_LOCAL;
+         break;
+      case SpvStorageClassUniformConstant:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_CONSTANT;
+         break;
+      default:
+         addrQualifier = CLC_KERNEL_ARG_ADDRESS_PRIVATE;
+         break;
+      }
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.typeId == typeId)
+               arg.addrQualifier = addrQualifier;
+         }
+      }
+   }
+
+   void parseOpString(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      std::string str;
+
+      assert(ins->num_operands == 2);
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_LITERAL_STRING);
+      str = reinterpret_cast<const char *>(ins->words + op->offset);
+
+      if (str.find("kernel_arg_type.") != 0)
+         return;
+
+      size_t start = sizeof("kernel_arg_type.") - 1;
+
+      for (auto &kernel : kernels) {
+         size_t pos;
+
+         pos = str.find(kernel.name, start);
+         if (pos == std::string::npos ||
+             pos != start || str[start + kernel.name.size()] != '.')
+            continue;
+
+         pos = start + kernel.name.size();
+         if (str[pos++] != '.')
+            continue;
+
+         for (auto &arg : kernel.args) {
+            if (arg.name.empty())
+               break;
+
+            size_t typeEnd = str.find(',', pos);
+            if (typeEnd == std::string::npos)
+               break;
+
+            arg.typeName = str.substr(pos, typeEnd - pos);
+            pos = typeEnd + 1;
+         }
+      }
+   }
+
+   void applyDecoration(uint32_t id, const spv_parsed_instruction_t *ins)
+   {
+      auto iter = decorationGroups.find(id);
+      if (iter != decorationGroups.end()) {
+         for (uint32_t entry : iter->second)
+            applyDecoration(entry, ins);
+         return;
+      }
+
+      const spv_parsed_operand_t *op;
+      uint32_t decoration;
+
+      assert(ins->num_operands >= 2);
+
+      op = &ins->operands[1];
+      assert(op->type == SPV_OPERAND_TYPE_DECORATION);
+      decoration = ins->words[op->offset];
+
+      if (decoration == SpvDecorationSpecId) {
+         uint32_t spec_id = ins->words[ins->operands[2].offset];
+         for (auto &c : specConstants) {
+            if (c.second.id == spec_id) {
+               assert(c.first == id);
+               return;
+            }
+         }
+         specConstants.emplace_back(id, clc_parsed_spec_constant{ spec_id });
+         return;
+      }
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.id == id) {
+               switch (decoration) {
+               case SpvDecorationVolatile:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_VOLATILE;
+                  break;
+               case SpvDecorationConstant:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST;
+                  break;
+               case SpvDecorationRestrict:
+                  arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT;
+                  break;
+               case SpvDecorationFuncParamAttr:
+                  op = &ins->operands[2];
+                  assert(op->type == SPV_OPERAND_TYPE_FUNCTION_PARAMETER_ATTRIBUTE);
+                  switch (ins->words[op->offset]) {
+                  case SpvFunctionParameterAttributeNoAlias:
+                     arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_RESTRICT;
+                     break;
+                  case SpvFunctionParameterAttributeNoWrite:
+                     arg.typeQualifier |= CLC_KERNEL_ARG_TYPE_CONST;
+                     break;
+                  }
+                  break;
+               }
+            }
+
+         }
+      }
+   }
+
+   void parseOpDecorate(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t id;
+
+      assert(ins->num_operands >= 2);
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      id = ins->words[op->offset];
+
+      applyDecoration(id, ins);
+   }
+
+   void parseOpGroupDecorate(const spv_parsed_instruction_t *ins)
+   {
+      assert(ins->num_operands >= 2);
+
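+      /* Record the group's member ids once; when a decoration is later
+       * applied to the group id, applyDecoration() fans it out to every
+       * recorded member. */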
+      const spv_parsed_operand_t *op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_ID);
+      uint32_t groupId = ins->words[op->offset];
+
+      auto lowerBound = decorationGroups.lower_bound(groupId);
+      if (lowerBound != decorationGroups.end() &&
+          lowerBound->first == groupId)
+         // Group already filled out
+         return;
+
+      auto iter = decorationGroups.emplace_hint(lowerBound, groupId, std::vector<uint32_t>{});
+      auto& vec = iter->second;
+      vec.reserve(ins->num_operands - 1);
+      for (uint32_t i = 1; i < ins->num_operands; ++i) {
+         op = &ins->operands[i];
+         assert(op->type == SPV_OPERAND_TYPE_ID);
+         vec.push_back(ins->words[op->offset]);
+      }
+   }
+
+   void parseOpTypeImage(const spv_parsed_instruction_t *ins)
+   {
+      const spv_parsed_operand_t *op;
+      uint32_t typeId;
+      unsigned accessQualifier = CLC_KERNEL_ARG_ACCESS_READ;
+
+      op = &ins->operands[0];
+      assert(op->type == SPV_OPERAND_TYPE_RESULT_ID);
+      typeId = ins->words[op->offset];
+
+      if (ins->num_operands >= 9) {
+         op = &ins->operands[8];
+         assert(op->type == SPV_OPERAND_TYPE_ACCESS_QUALIFIER);
+         switch (ins->words[op->offset]) {
+         case SpvAccessQualifierReadOnly:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_READ;
+            break;
+         case SpvAccessQualifierWriteOnly:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE;
+            break;
+         case SpvAccessQualifierReadWrite:
+            accessQualifier = CLC_KERNEL_ARG_ACCESS_WRITE |
+                              CLC_KERNEL_ARG_ACCESS_READ;
+            break;
+         }
+      }
+
+      for (auto &kernel : kernels) {
+         for (auto &arg : kernel.args) {
+            if (arg.typeId == typeId) {
+               arg.accessQualifier = accessQualifier;
+               arg.addrQualifier = CLC_KERNEL_ARG_ADDRESS_GLOBAL;
+            }
+         }
+      }
+   }
+
+   void parseExecutionMode(const spv_parsed_instruction_t *ins)
+   {
+      uint32_t executionMode = ins->words[ins->operands[1].offset];
+      if (executionMode != SpvExecutionModeVecTypeHint)
+         return;
+
+      uint32_t funcId = ins->words[ins->operands[0].offset];
+      uint32_t vecHint = ins->words[ins->operands[2].offset];
+      for (auto& kernel : kernels) {
+         if (kernel.funcId == funcId)
+            kernel.vecHint = vecHint;
+      }
+   }
+
+   void parseLiteralType(const spv_parsed_instruction_t *ins)
+   {
+      uint32_t typeId = ins->words[ins->operands[0].offset];
+      auto& literalType = literalTypes[typeId];
+      switch (ins->opcode) {
+      case SpvOpTypeBool:
+         literalType = CLC_SPEC_CONSTANT_BOOL;
+         break;
+      case SpvOpTypeFloat: {
+         uint32_t sizeInBits = ins->words[ins->operands[1].offset];
+         switch (sizeInBits) {
+         case 32:
+            literalType = CLC_SPEC_CONSTANT_FLOAT;
+            break;
+         case 64:
+            literalType = CLC_SPEC_CONSTANT_DOUBLE;
+            break;
+         case 16:
+            /* Can't be used for a spec constant */
+            break;
+         default:
+            unreachable("Unexpected float bit size");
+         }
+         break;
+      }
+      case SpvOpTypeInt: {
+         uint32_t sizeInBits = ins->words[ins->operands[1].offset];
+         bool isSigned = ins->words[ins->operands[2].offset];
+         if (isSigned) {
+            switch (sizeInBits) {
+            case 8:
+               literalType = CLC_SPEC_CONSTANT_INT8;
+               break;
+            case 16:
+               literalType = CLC_SPEC_CONSTANT_INT16;
+               break;
+            case 32:
+               literalType = CLC_SPEC_CONSTANT_INT32;
+               break;
+            case 64:
+               literalType = CLC_SPEC_CONSTANT_INT64;
+               break;
+            default:
+               unreachable("Unexpected int bit size");
+            }
+         } else {
+            switch (sizeInBits) {
+            case 8:
+               literalType = CLC_SPEC_CONSTANT_UINT8;
+               break;
+            case 16:
+               literalType = CLC_SPEC_CONSTANT_UINT16;
+               break;
+            case 32:
+               literalType = CLC_SPEC_CONSTANT_UINT32;
+               break;
+            case 64:
+               literalType = CLC_SPEC_CONSTANT_UINT64;
+               break;
+            default:
+               unreachable("Unexpected uint bit size");
+            }
+         }
+         break;
+      }
+      default:
+         unreachable("Unexpected type opcode");
+      }
+   }
+
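+   /* OpSpecConstant carries a result-type id; parseLiteralType() has already
+    * recorded the clc type for each bool/int/float type id, so resolving a
+    * spec constant's type below is a plain map lookup. */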
+   void parseSpecConstant(const spv_parsed_instruction_t *ins)
+   {
+      uint32_t id = ins->result_id;
+      for (auto& c : specConstants) {
+         if (c.first == id) {
+            auto& data = c.second;
+            switch (ins->opcode) {
+            case SpvOpSpecConstant: {
+               uint32_t typeId = ins->words[ins->operands[0].offset];
+
+               // This better be an integer or float type
+               auto typeIter = literalTypes.find(typeId);
+               assert(typeIter != literalTypes.end());
+
+               data.type = typeIter->second;
+               break;
+            }
+            case SpvOpSpecConstantFalse:
+            case SpvOpSpecConstantTrue:
+               data.type = CLC_SPEC_CONSTANT_BOOL;
+               break;
+            default:
+               unreachable("Composites and Ops are not directly specializable.");
+            }
+         }
+      }
+   }
+
+   static spv_result_t
+   parseInstruction(void *data, const spv_parsed_instruction_t *ins)
+   {
+      SPIRVKernelParser *parser = reinterpret_cast<SPIRVKernelParser *>(data);
+
+      switch (ins->opcode) {
+      case SpvOpName:
+         parser->parseName(ins);
+         break;
+      case SpvOpEntryPoint:
+         parser->parseEntryPoint(ins);
+         break;
+      case SpvOpFunction:
+         parser->parseFunction(ins);
+         break;
+      case SpvOpFunctionParameter:
+         parser->parseFunctionParam(ins);
+         break;
+      case SpvOpFunctionEnd:
+      case SpvOpLabel:
+         parser->curKernel = NULL;
+         break;
+      case SpvOpTypePointer:
+         parser->parseTypePointer(ins);
+         break;
+      case SpvOpTypeImage:
+         parser->parseOpTypeImage(ins);
+         break;
+      case SpvOpString:
+         parser->parseOpString(ins);
+         break;
+      case SpvOpDecorate:
+         parser->parseOpDecorate(ins);
+         break;
+      case SpvOpGroupDecorate:
+         parser->parseOpGroupDecorate(ins);
+         break;
+      case SpvOpExecutionMode:
+         parser->parseExecutionMode(ins);
+         break;
+      case SpvOpTypeBool:
+      case SpvOpTypeInt:
+      case SpvOpTypeFloat:
+         parser->parseLiteralType(ins);
+         break;
+      case SpvOpSpecConstant:
+      case SpvOpSpecConstantFalse:
+      case SpvOpSpecConstantTrue:
+         parser->parseSpecConstant(ins);
+         break;
+      default:
+         break;
+      }
+
+      return SPV_SUCCESS;
+   }
+
+   bool parsingComplete()
+   {
+      for (auto &kernel : kernels) {
+         if (kernel.name.empty())
+            return false;
+
+         for (auto &arg : kernel.args) {
+            if (arg.name.empty() || arg.typeName.empty())
+               return false;
+         }
+      }
+
+      return true;
+   }
+
+   bool parseBinary(const struct clc_binary &spvbin, const struct clc_logger *logger)
+   {
+      /* 3 passes should be enough to retrieve all kernel information:
+       * 1st pass: all entry point name and number of args
+       * 2nd pass: argument names and type names
+       * 3rd pass: pointer type names
+       */
+      for (unsigned pass = 0; pass < 3; pass++) {
+         spv_diagnostic diagnostic = NULL;
+         auto result = spvBinaryParse(ctx, reinterpret_cast<void *>(this),
+                                      static_cast<const uint32_t *>(spvbin.data), spvbin.size / 4,
+                                      NULL, parseInstruction, &diagnostic);
+
+         if (result != SPV_SUCCESS) {
+            if (diagnostic && logger)
+               logger->error(logger->priv, diagnostic->error);
+            return false;
+         }
+
+         if (parsingComplete())
+            return true;
+      }
+
+      assert(0);
+      return false;
+   }
+
+   std::vector<SPIRVKernelInfo> kernels;
+   std::vector<std::pair<uint32_t, clc_parsed_spec_constant>> specConstants;
+   std::map<uint32_t, enum clc_spec_constant_type> literalTypes;
+   std::map<uint32_t, std::vector<uint32_t>> decorationGroups;
+   SPIRVKernelInfo *curKernel;
+   spv_context ctx;
+};
+
+bool
+clc_spirv_get_kernels_info(const struct clc_binary *spvbin,
+                           const struct clc_kernel_info **out_kernels,
+                           unsigned *num_kernels,
+                           const struct clc_parsed_spec_constant **out_spec_constants,
+                           unsigned *num_spec_constants,
+                           const struct clc_logger *logger)
+{
+   struct clc_kernel_info *kernels;
+   struct clc_parsed_spec_constant *spec_constants = NULL;
+
+   SPIRVKernelParser parser;
+
+   if (!parser.parseBinary(*spvbin, logger))
+      return false;
+
+   *num_kernels = parser.kernels.size();
+   *num_spec_constants = parser.specConstants.size();
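+   /* Flatten the parser's C++ containers into the plain C arrays exposed by
+    * clc.h; the caller owns the result and releases it through
+    * clc_free_kernels_info(). */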
+   if (!*num_kernels)
+      return false;
+
+   kernels = reinterpret_cast<struct clc_kernel_info *>(calloc(*num_kernels,
+                                                               sizeof(*kernels)));
+   assert(kernels);
+   for (unsigned i = 0; i < parser.kernels.size(); i++) {
+      kernels[i].name = strdup(parser.kernels[i].name.c_str());
+      kernels[i].num_args = parser.kernels[i].args.size();
+      /* The VecTypeHint operand packs the vector size in the high 16 bits
+       * and the component type in the low 16 bits. */
+      kernels[i].vec_hint_size = parser.kernels[i].vecHint >> 16;
+      kernels[i].vec_hint_type = (enum clc_vec_hint_type)(parser.kernels[i].vecHint & 0xFFFF);
+      if (!kernels[i].num_args)
+         continue;
+
+      struct clc_kernel_arg *args;
+
+      args = reinterpret_cast<struct clc_kernel_arg *>(calloc(kernels[i].num_args,
+                                                              sizeof(*kernels->args)));
+      kernels[i].args = args;
+      assert(args);
+      for (unsigned j = 0; j < kernels[i].num_args; j++) {
+         if (!parser.kernels[i].args[j].name.empty())
+            args[j].name = strdup(parser.kernels[i].args[j].name.c_str());
+         args[j].type_name = strdup(parser.kernels[i].args[j].typeName.c_str());
+         args[j].address_qualifier = parser.kernels[i].args[j].addrQualifier;
+         args[j].type_qualifier = parser.kernels[i].args[j].typeQualifier;
+         args[j].access_qualifier = parser.kernels[i].args[j].accessQualifier;
+      }
+   }
+
+   if (*num_spec_constants) {
+      spec_constants = reinterpret_cast<struct clc_parsed_spec_constant *>(calloc(*num_spec_constants,
+                                                                                  sizeof(*spec_constants)));
+      assert(spec_constants);
+
+      for (unsigned i = 0; i < parser.specConstants.size(); ++i) {
+         spec_constants[i] = parser.specConstants[i].second;
+      }
+   }
+
+   *out_kernels = kernels;
+   *out_spec_constants = spec_constants;
+
+   return true;
+}
+
+void
+clc_free_kernels_info(const struct clc_kernel_info *kernels,
+                      unsigned num_kernels)
+{
+   if (!kernels)
+      return;
+
+   for (unsigned i = 0; i < num_kernels; i++) {
+      if (kernels[i].args) {
+         for (unsigned j = 0; j < kernels[i].num_args; j++) {
+            free((void *)kernels[i].args[j].name);
+            free((void *)kernels[i].args[j].type_name);
+         }
+      }
+      free((void *)kernels[i].name);
+   }
+
+   free((void *)kernels);
+}
+
+static std::pair<std::unique_ptr<::llvm::Module>, std::unique_ptr<LLVMContext>>
+clc_compile_to_llvm_module(const struct clc_compile_args *args,
+                           const struct clc_logger *logger)
+{
+   LLVMInitializeAllTargets();
+   LLVMInitializeAllTargetInfos();
+   LLVMInitializeAllTargetMCs();
+   LLVMInitializeAllAsmPrinters();
+
+   std::string log;
+   std::unique_ptr<LLVMContext> llvm_ctx { new LLVMContext };
+   llvm_ctx->setDiagnosticHandlerCallBack(llvm_log_handler, &log);
+
+   std::unique_ptr<clang::CompilerInstance> c { new clang::CompilerInstance };
+   clang::DiagnosticsEngine diag { new clang::DiagnosticIDs,
+         new clang::DiagnosticOptions,
+         new clang::TextDiagnosticPrinter(*new raw_string_ostream(log),
+                                          &c->getDiagnosticOpts(), true)};
+
+   std::vector<const char *> clang_opts = {
+      args->source.name,
+      "-triple", "spir64-unknown-unknown",
+      // By default, clang prefers to use modules to pull in the default headers,
+      // which doesn't work with our technique of embedding the headers in our binary
+      "-finclude-default-header",
+      // Add a default CL compiler version. Clang will pick the last one specified
+      // on the command line, so the app can override this one.
+      "-cl-std=cl1.2",
+      // The LLVM-SPIRV-Translator doesn't support memset with variable size
+      "-fno-builtin-memset",
+      // LLVM's optimizations can produce code that the translator can't translate
+      "-O0",
+      // Ensure inline functions are actually emitted
+      "-fgnu89-inline"
+   };
+   // We assume there are appropriate defines for __OPENCL_VERSION__ and
+   // __IMAGE_SUPPORT__ being provided by the caller here.
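+   // For instance (illustrative values, not added by this function), a
+   // caller would typically pass "-D__OPENCL_VERSION__=120" and
+   // "-D__IMAGE_SUPPORT__=1" among args->args.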
+ clang_opts.insert(clang_opts.end(), args->args, args->args + args->num_args); + + if (!clang::CompilerInvocation::CreateFromArgs(c->getInvocation(), +#if LLVM_VERSION_MAJOR >= 10 + clang_opts, +#else + clang_opts.data(), + clang_opts.data() + clang_opts.size(), +#endif + diag)) { + clc_error(logger, "%sCouldn't create Clang invocation.\n", log.c_str()); + return {}; + } + + if (diag.hasErrorOccurred()) { + clc_error(logger, "%sErrors occurred during Clang invocation.\n", + log.c_str()); + return {}; + } + + // This is a workaround for a Clang bug which causes the number + // of warnings and errors to be printed to stderr. + // http://www.llvm.org/bugs/show_bug.cgi?id=19735 + c->getDiagnosticOpts().ShowCarets = false; + + c->createDiagnostics(new clang::TextDiagnosticPrinter( + *new raw_string_ostream(log), + &c->getDiagnosticOpts(), true)); + + c->setTarget(clang::TargetInfo::CreateTargetInfo( + c->getDiagnostics(), c->getInvocation().TargetOpts)); + + c->getFrontendOpts().ProgramAction = clang::frontend::EmitLLVMOnly; + +#ifdef USE_STATIC_OPENCL_C_H + c->getHeaderSearchOpts().UseBuiltinIncludes = false; + c->getHeaderSearchOpts().UseStandardSystemIncludes = false; + + // Add opencl-c generic search path + { + ::llvm::SmallString<128> system_header_path; + ::llvm::sys::path::system_temp_directory(true, system_header_path); + ::llvm::sys::path::append(system_header_path, "openclon12"); + c->getHeaderSearchOpts().AddPath(system_header_path.str(), + clang::frontend::Angled, + false, false); + + ::llvm::sys::path::append(system_header_path, "opencl-c.h"); + c->getPreprocessorOpts().addRemappedFile(system_header_path.str(), + ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_source, ARRAY_SIZE(opencl_c_source) - 1)).release()); + + ::llvm::sys::path::remove_filename(system_header_path); + ::llvm::sys::path::append(system_header_path, "opencl-c-base.h"); + c->getPreprocessorOpts().addRemappedFile(system_header_path.str(), + ::llvm::MemoryBuffer::getMemBuffer(llvm::StringRef(opencl_c_base_source, ARRAY_SIZE(opencl_c_base_source) - 1)).release()); + } +#else + c->getHeaderSearchOpts().UseBuiltinIncludes = true; + c->getHeaderSearchOpts().UseStandardSystemIncludes = true; + c->getHeaderSearchOpts().ResourceDir = CLANG_RESOURCE_DIR; + + // Add opencl-c generic search path + c->getHeaderSearchOpts().AddPath(CLANG_RESOURCE_DIR, + clang::frontend::Angled, + false, false); + // Add opencl include + c->getPreprocessorOpts().Includes.push_back("opencl-c.h"); +#endif + + if (args->num_headers) { + ::llvm::SmallString<128> tmp_header_path; + ::llvm::sys::path::system_temp_directory(true, tmp_header_path); + ::llvm::sys::path::append(tmp_header_path, "openclon12"); + + c->getHeaderSearchOpts().AddPath(tmp_header_path.str(), + clang::frontend::Quoted, + false, false); + + for (size_t i = 0; i < args->num_headers; i++) { + auto path_copy = tmp_header_path; + ::llvm::sys::path::append(path_copy, ::llvm::sys::path::convert_to_slash(args->headers[i].name)); + c->getPreprocessorOpts().addRemappedFile(path_copy.str(), + ::llvm::MemoryBuffer::getMemBufferCopy(args->headers[i].value).release()); + } + } + + c->getPreprocessorOpts().addRemappedFile( + args->source.name, + ::llvm::MemoryBuffer::getMemBufferCopy(std::string(args->source.value)).release()); + + // Compile the code + clang::EmitLLVMOnlyAction act(llvm_ctx.get()); + if (!c->ExecuteAction(act)) { + clc_error(logger, "%sError executing LLVM compilation action.\n", + log.c_str()); + return {}; + } + + return { act.takeModule(), std::move(llvm_ctx) 
};
+}
+
+static SPIRV::VersionNumber
+spirv_version_to_llvm_spirv_translator_version(enum clc_spirv_version version)
+{
+   switch (version) {
+   case CLC_SPIRV_VERSION_MAX: return SPIRV::VersionNumber::MaximumVersion;
+   case CLC_SPIRV_VERSION_1_0: return SPIRV::VersionNumber::SPIRV_1_0;
+   case CLC_SPIRV_VERSION_1_1: return SPIRV::VersionNumber::SPIRV_1_1;
+   case CLC_SPIRV_VERSION_1_2: return SPIRV::VersionNumber::SPIRV_1_2;
+   case CLC_SPIRV_VERSION_1_3: return SPIRV::VersionNumber::SPIRV_1_3;
+#ifdef HAS_SPIRV_1_4
+   case CLC_SPIRV_VERSION_1_4: return SPIRV::VersionNumber::SPIRV_1_4;
+#endif
+   default: return invalid_spirv_trans_version;
+   }
+}
+
+static int
+llvm_mod_to_spirv(std::unique_ptr<::llvm::Module> mod,
+                  std::unique_ptr<LLVMContext> context,
+                  const struct clc_compile_args *args,
+                  const struct clc_logger *logger,
+                  struct clc_binary *out_spirv)
+{
+   std::string log;
+
+   SPIRV::VersionNumber version =
+      spirv_version_to_llvm_spirv_translator_version(args->spirv_version);
+   if (version == invalid_spirv_trans_version) {
+      clc_error(logger, "Invalid/unsupported SPIRV specified.\n");
+      return -1;
+   }
+
+   const char *const *extensions = NULL;
+   if (args)
+      extensions = args->allowed_spirv_extensions;
+   if (!extensions) {
+      /* The SPIR-V parser doesn't handle all extensions */
+      static const char *default_extensions[] = {
+         "SPV_EXT_shader_atomic_float_add",
+         "SPV_EXT_shader_atomic_float_min_max",
+         "SPV_KHR_float_controls",
+         NULL,
+      };
+      extensions = default_extensions;
+   }
+
+   SPIRV::TranslatorOpts::ExtensionsStatusMap ext_map;
+   for (int i = 0; extensions[i]; i++) {
+#define EXT(X) \
+      if (strcmp(#X, extensions[i]) == 0) \
+         ext_map.insert(std::make_pair(SPIRV::ExtensionID::X, true));
+#include "LLVMSPIRVLib/LLVMSPIRVExtensions.inc"
+#undef EXT
+   }
+   SPIRV::TranslatorOpts spirv_opts = SPIRV::TranslatorOpts(version, ext_map);
+
+#if LLVM_VERSION_MAJOR >= 13
+   /* This was the default in 12.0 and older, but currently we'll fail to parse without this */
+   spirv_opts.setPreserveOCLKernelArgTypeMetadataThroughString(true);
+#endif
+
+   std::ostringstream spv_stream;
+   if (!::llvm::writeSpirv(mod.get(), spirv_opts, spv_stream, log)) {
+      clc_error(logger, "%sTranslation from LLVM IR to SPIR-V failed.\n",
+                log.c_str());
+      return -1;
+   }
+
+   const std::string spv_out = spv_stream.str();
+   out_spirv->size = spv_out.size();
+   out_spirv->data = malloc(out_spirv->size);
+   memcpy(out_spirv->data, spv_out.data(), out_spirv->size);
+
+   return 0;
+}
+
+int
+clc_c_to_spir(const struct clc_compile_args *args,
+              const struct clc_logger *logger,
+              struct clc_binary *out_spir)
+{
+   auto pair = clc_compile_to_llvm_module(args, logger);
+   if (!pair.first)
+      return -1;
+
+   ::llvm::SmallVector<char, 0> buffer;
+   ::llvm::BitcodeWriter writer(buffer);
+   writer.writeModule(*pair.first);
+
+   out_spir->size = buffer.size_in_bytes();
+   out_spir->data = malloc(out_spir->size);
+   memcpy(out_spir->data, buffer.data(), out_spir->size);
+
+   return 0;
+}
+
+int
+clc_c_to_spirv(const struct clc_compile_args *args,
+               const struct clc_logger *logger,
+               struct clc_binary *out_spirv)
+{
+   auto pair = clc_compile_to_llvm_module(args, logger);
+   if (!pair.first)
+      return -1;
+   return llvm_mod_to_spirv(std::move(pair.first), std::move(pair.second), args, logger, out_spirv);
+}
+
+int
+clc_spir_to_spirv(const struct clc_binary *in_spir,
+                  const struct clc_logger *logger,
+                  struct clc_binary *out_spirv)
+{
+   LLVMInitializeAllTargets();
+   LLVMInitializeAllTargetInfos();
+   LLVMInitializeAllTargetMCs();
+   LLVMInitializeAllAsmPrinters();
+
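+   /* Parse the LLVM bitcode ("SPIR") blob back into an in-memory module,
+    * then reuse the same LLVM-IR-to-SPIR-V path as clc_c_to_spirv(). */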
+   std::unique_ptr<LLVMContext> llvm_ctx{ new LLVMContext };
+   ::llvm::StringRef spir_ref(static_cast<const char *>(in_spir->data), in_spir->size);
+   auto mod = ::llvm::parseBitcodeFile(::llvm::MemoryBufferRef(spir_ref, "<spir>"), *llvm_ctx);
+   if (!mod)
+      return -1;
+
+   return llvm_mod_to_spirv(std::move(mod.get()), std::move(llvm_ctx), NULL, logger, out_spirv);
+}
+
+class SPIRVMessageConsumer {
+public:
+   SPIRVMessageConsumer(const struct clc_logger *logger): logger(logger) {}
+
+   void operator()(spv_message_level_t level, const char *src,
+                   const spv_position_t &pos, const char *msg)
+   {
+      switch(level) {
+      case SPV_MSG_FATAL:
+      case SPV_MSG_INTERNAL_ERROR:
+      case SPV_MSG_ERROR:
+         clc_error(logger, "(file=%s,line=%ld,column=%ld,index=%ld): %s\n",
+                   src, pos.line, pos.column, pos.index, msg);
+         break;
+
+      case SPV_MSG_WARNING:
+         clc_warning(logger, "(file=%s,line=%ld,column=%ld,index=%ld): %s\n",
+                     src, pos.line, pos.column, pos.index, msg);
+         break;
+
+      default:
+         break;
+      }
+   }
+
+private:
+   const struct clc_logger *logger;
+};
+
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+                        const struct clc_logger *logger,
+                        struct clc_binary *out_spirv)
+{
+   std::vector<std::vector<uint32_t>> binaries;
+
+   for (unsigned i = 0; i < args->num_in_objs; i++) {
+      const uint32_t *data = static_cast<const uint32_t *>(args->in_objs[i]->data);
+      std::vector<uint32_t> bin(data, data + (args->in_objs[i]->size / 4));
+      binaries.push_back(bin);
+   }
+
+   SPIRVMessageConsumer msgconsumer(logger);
+   spvtools::Context context(spirv_target);
+   context.SetMessageConsumer(msgconsumer);
+   spvtools::LinkerOptions options;
+   options.SetAllowPartialLinkage(args->create_library);
+   options.SetCreateLibrary(args->create_library);
+   std::vector<uint32_t> linkingResult;
+   spv_result_t status = spvtools::Link(context, binaries, &linkingResult, options);
+   if (status != SPV_SUCCESS) {
+      return -1;
+   }
+
+   out_spirv->size = linkingResult.size() * 4;
+   out_spirv->data = static_cast<void *>(malloc(out_spirv->size));
+   memcpy(out_spirv->data, linkingResult.data(), out_spirv->size);
+
+   return 0;
+}
+
+int
+clc_spirv_specialize(const struct clc_binary *in_spirv,
+                     const struct clc_parsed_spirv *parsed_data,
+                     const struct clc_spirv_specialization_consts *consts,
+                     struct clc_binary *out_spirv)
+{
+   std::unordered_map<uint32_t, std::vector<uint32_t>> spec_const_map;
+   for (unsigned i = 0; i < consts->num_specializations; ++i) {
+      unsigned id = consts->specializations[i].id;
+      auto parsed_spec_const = std::find_if(parsed_data->spec_constants,
+                                            parsed_data->spec_constants + parsed_data->num_spec_constants,
+                                            [id](const clc_parsed_spec_constant &c) { return c.id == id; });
+      assert(parsed_spec_const != parsed_data->spec_constants + parsed_data->num_spec_constants);
+
+      std::vector<uint32_t> words;
+      switch (parsed_spec_const->type) {
+      case CLC_SPEC_CONSTANT_BOOL:
+         words.push_back(consts->specializations[i].value.b);
+         break;
+      case CLC_SPEC_CONSTANT_INT32:
+      case CLC_SPEC_CONSTANT_UINT32:
+      case CLC_SPEC_CONSTANT_FLOAT:
+         words.push_back(consts->specializations[i].value.u32);
+         break;
+      case CLC_SPEC_CONSTANT_INT16:
+         words.push_back((uint32_t)(int32_t)consts->specializations[i].value.i16);
+         break;
+      case CLC_SPEC_CONSTANT_INT8:
+         words.push_back((uint32_t)(int32_t)consts->specializations[i].value.i8);
+         break;
+      case CLC_SPEC_CONSTANT_UINT16:
+         words.push_back((uint32_t)consts->specializations[i].value.u16);
+         break;
+      case CLC_SPEC_CONSTANT_UINT8:
+         words.push_back((uint32_t)consts->specializations[i].value.u8);
+         break;
+      case CLC_SPEC_CONSTANT_DOUBLE:
+      case CLC_SPEC_CONSTANT_INT64:
+      case CLC_SPEC_CONSTANT_UINT64:
+         words.resize(2);
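+         /* 64-bit spec constants span two 32-bit SPIR-V words. */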
+         memcpy(words.data(), &consts->specializations[i].value.u64, 8);
+         break;
+      case CLC_SPEC_CONSTANT_UNKNOWN:
+         assert(0);
+         break;
+      }
+
+      ASSERTED auto ret = spec_const_map.emplace(id, std::move(words));
+      assert(ret.second);
+   }
+
+   spvtools::Optimizer opt(spirv_target);
+   opt.RegisterPass(spvtools::CreateSetSpecConstantDefaultValuePass(std::move(spec_const_map)));
+
+   std::vector<uint32_t> result;
+   if (!opt.Run(static_cast<const uint32_t *>(in_spirv->data), in_spirv->size / 4, &result))
+      return false;
+
+   out_spirv->size = result.size() * 4;
+   out_spirv->data = malloc(out_spirv->size);
+   memcpy(out_spirv->data, result.data(), out_spirv->size);
+   return true;
+}
+
+void
+clc_dump_spirv(const struct clc_binary *spvbin, FILE *f)
+{
+   spvtools::SpirvTools tools(spirv_target);
+   const uint32_t *data = static_cast<const uint32_t *>(spvbin->data);
+   std::vector<uint32_t> bin(data, data + (spvbin->size / 4));
+   std::string out;
+   tools.Disassemble(bin, &out,
+                     SPV_BINARY_TO_TEXT_OPTION_INDENT |
+                     SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES);
+   fwrite(out.c_str(), out.size(), 1, f);
+}
+
+void
+clc_free_spir_binary(struct clc_binary *spir)
+{
+   free(spir->data);
+}
+
+void
+clc_free_spirv_binary(struct clc_binary *spvbin)
+{
+   free(spvbin->data);
+}
diff --git a/mesa 3D driver/src/compiler/clc/clc_helpers.h b/mesa 3D driver/src/compiler/clc/clc_helpers.h
new file mode 100644
index 0000000000..cbad142efe
--- /dev/null
+++ b/mesa 3D driver/src/compiler/clc/clc_helpers.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright © Microsoft Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef MESA_CLC_HELPERS_H
+#define MESA_CLC_HELPERS_H
+
+#include "nir_types.h"
+
+#include "clc.h"
+#include "util/u_string.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool
+clc_spirv_get_kernels_info(const struct clc_binary *spvbin,
+                           const struct clc_kernel_info **kernels,
+                           unsigned *num_kernels,
+                           const struct clc_parsed_spec_constant **spec_constants,
+                           unsigned *num_spec_constants,
+                           const struct clc_logger *logger);
+
+void
+clc_free_kernels_info(const struct clc_kernel_info *kernels,
+                      unsigned num_kernels);
+
+int
+clc_c_to_spir(const struct clc_compile_args *args,
+              const struct clc_logger *logger,
+              struct clc_binary *out_spir);
+
+int
+clc_spir_to_spirv(const struct clc_binary *in_spir,
+                  const struct clc_logger *logger,
+                  struct clc_binary *out_spirv);
+
+int
+clc_c_to_spirv(const struct clc_compile_args *args,
+               const struct clc_logger *logger,
+               struct clc_binary *out_spirv);
+
+int
+clc_link_spirv_binaries(const struct clc_linker_args *args,
+                        const struct clc_logger *logger,
+                        struct clc_binary *out_spirv);
+
+int
+clc_spirv_specialize(const struct clc_binary *in_spirv,
+                     const struct clc_parsed_spirv *parsed_data,
+                     const struct clc_spirv_specialization_consts *consts,
+                     struct clc_binary *out_spirv);
+
+void
+clc_dump_spirv(const struct clc_binary *spvbin, FILE *f);
+
+void
+clc_free_spir_binary(struct clc_binary *spir);
+
+void
+clc_free_spirv_binary(struct clc_binary *spvbin);
+
+#define clc_log(logger, level, fmt, ...) do { \
+      if (!logger || !logger->level) break; \
+      char *_msg = NULL; \
+      asprintf(&_msg, fmt, ##__VA_ARGS__); \
+      assert(_msg); \
+      logger->level(logger->priv, _msg); \
+      free(_msg); \
+   } while (0)
+
+#define clc_error(logger, fmt, ...) clc_log(logger, error, fmt, ##__VA_ARGS__)
+#define clc_warning(logger, fmt, ...) clc_log(logger, warning, fmt, ##__VA_ARGS__)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MESA_CLC_HELPERS_H */
diff --git a/mesa 3D driver/src/compiler/clc/meson.build b/mesa 3D driver/src/compiler/clc/meson.build
new file mode 100644
index 0000000000..d3c3286a7a
--- /dev/null
+++ b/mesa 3D driver/src/compiler/clc/meson.build
@@ -0,0 +1,66 @@
+# Copyright © Microsoft Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
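+
+# The custom_targets below embed clang's opencl-c.h and opencl-c-base.h as
+# C string arrays, so clc_helpers.cpp can remap them in-memory when
+# USE_STATIC_OPENCL_C_H is defined instead of reading them from disk.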
+
+clang_resource_dir = join_paths(llvm_libdir, 'clang', dep_llvm.version(), 'include')
+
+opencl_c_h = custom_target(
+  'opencl-c.h',
+  input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c.h')],
+  output : 'opencl-c.h.h',
+  command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_source'],
+)
+opencl_c_base_h = custom_target(
+  'opencl-c-base.h',
+  input : [files_xxd, join_paths(clang_resource_dir, 'opencl-c-base.h')],
+  output : 'opencl-c-base.h.h',
+  command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'opencl_c_base_source'],
+)
+
+files_libclc = files(
+  'clc.c',
+  'clc_helpers.cpp',
+)
+
+_libclc_cpp_args = ['-DCLANG_RESOURCE_DIR="@0@"'.format(clang_resource_dir)]
+if with_microsoft_clc
+  _libclc_cpp_args += ['-DUSE_STATIC_OPENCL_C_H=1']
+endif
+
+# Support for SPIR-V 1.4 was added to the translator in a version that
+# requires LLVM 14.0.
+if dep_llvm.version().version_compare('>= 14.0')
+  _libclc_cpp_args += ['-DHAS_SPIRV_1_4=1']
+endif
+
+_libclc = static_library(
+  'libclc',
+  files_libclc,
+  opencl_c_h,
+  opencl_c_base_h,
+  include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_compiler, inc_spirv],
+  cpp_args : _libclc_cpp_args,
+  dependencies: [idep_nir_headers, dep_clang, dep_llvm, dep_llvmspirvlib,
+                 idep_mesautil, idep_nir, dep_spirv_tools]
+)
+
+idep_clc = declare_dependency(
+  link_with : _libclc,
+  include_directories : include_directories('.'),
+)
diff --git a/mesa 3D driver/src/compiler/glsl/ast.h b/mesa 3D driver/src/compiler/glsl/ast.h
index c6b578cb89..0a5b94bb1c 100644
--- a/mesa 3D driver/src/compiler/glsl/ast.h
+++ b/mesa 3D driver/src/compiler/glsl/ast.h
@@ -1195,6 +1195,8 @@ class ast_iteration_statement : public ast_node {
    ast_node *condition;
    ast_expression *rest_expression;
 
+   exec_list rest_instructions;
+
    ast_node *body;
 
    /**
diff --git a/mesa 3D driver/src/compiler/glsl/ast_to_hir.cpp b/mesa 3D driver/src/compiler/glsl/ast_to_hir.cpp
index 370f6934bd..c929ba59a1 100644
--- a/mesa 3D driver/src/compiler/glsl/ast_to_hir.cpp
+++ b/mesa 3D driver/src/compiler/glsl/ast_to_hir.cpp
@@ -1182,6 +1182,7 @@ do_comparison(void *mem_ctx, int operation, ir_rvalue *op0, ir_rvalue *op1)
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_VOID:
    case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_TEXTURE:
    case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_ATOMIC_UINT:
@@ -1703,6 +1704,7 @@ ast_expression::do_hir(exec_list *instructions,
       if ((op[0]->type == glsl_type::error_type ||
           op[1]->type == glsl_type::error_type)) {
          error_emitted = true;
+         result = ir_rvalue::error_value(ctx);
          break;
       }
 
@@ -1740,6 +1742,14 @@ ast_expression::do_hir(exec_list *instructions,
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
+      /* Break out if operand types were not parsed successfully. */
+      if ((op[0]->type == glsl_type::error_type ||
+           op[1]->type == glsl_type::error_type)) {
+         error_emitted = true;
+         result = ir_rvalue::error_value(ctx);
+         break;
+      }
+
       orig_type = op[0]->type;
       type = modulus_result_type(op[0], op[1], state, &loc);
 
@@ -1770,6 +1780,15 @@ ast_expression::do_hir(exec_list *instructions,
       this->subexpressions[0]->set_is_lhs(true);
      op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
+
+      /* Break out if operand types were not parsed successfully.
*/ + if ((op[0]->type == glsl_type::error_type || + op[1]->type == glsl_type::error_type)) { + error_emitted = true; + result = ir_rvalue::error_value(ctx); + break; + } + type = shift_result_type(op[0]->type, op[1]->type, this->oper, state, &loc); ir_rvalue *temp_rhs = new(ctx) ir_expression(operations[this->oper], @@ -1790,6 +1809,14 @@ ast_expression::do_hir(exec_list *instructions, op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); + /* Break out if operand types were not parsed successfully. */ + if ((op[0]->type == glsl_type::error_type || + op[1]->type == glsl_type::error_type)) { + error_emitted = true; + result = ir_rvalue::error_value(ctx); + break; + } + orig_type = op[0]->type; type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc); @@ -3970,9 +3997,9 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual, _mesa_glsl_error(loc, state, "gl_Layer redeclaration with " "different viewport_relative setting than earlier"); } - state->redeclares_gl_layer = 1; + state->redeclares_gl_layer = true; if (qual->flags.q.viewport_relative) { - state->layer_viewport_relative = 1; + state->layer_viewport_relative = true; } } else if (qual->flags.q.viewport_relative) { _mesa_glsl_error(loc, state, @@ -4196,6 +4223,7 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, case GLSL_TYPE_INT64: break; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: if (state->has_bindless()) break; @@ -5406,6 +5434,7 @@ ast_declarator_list::hir(exec_list *instructions, error = !state->is_version(410, 0) && !state->ARB_vertex_attrib_64bit_enable; break; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: error = !state->has_bindless(); break; @@ -6531,8 +6560,8 @@ ast_jump_statement::hir(exec_list *instructions, if (state->loop_nesting_ast != NULL && mode == ast_continue && !state->switch_state.is_switch_innermost) { if (state->loop_nesting_ast->rest_expression) { - state->loop_nesting_ast->rest_expression->hir(instructions, - state); + clone_ir_list(ctx, instructions, + &state->loop_nesting_ast->rest_instructions); } if (state->loop_nesting_ast->mode == ast_iteration_statement::ast_do_while) { @@ -6780,8 +6809,8 @@ ast_switch_statement::hir(exec_list *instructions, if (state->loop_nesting_ast != NULL) { if (state->loop_nesting_ast->rest_expression) { - state->loop_nesting_ast->rest_expression->hir(&irif->then_instructions, - state); + clone_ir_list(ctx, &irif->then_instructions, + &state->loop_nesting_ast->rest_instructions); } if (state->loop_nesting_ast->mode == ast_iteration_statement::ast_do_while) { @@ -6830,8 +6859,11 @@ ir_rvalue * ast_switch_body::hir(exec_list *instructions, struct _mesa_glsl_parse_state *state) { - if (stmts != NULL) + if (stmts != NULL) { + state->symbols->push_scope(); stmts->hir(instructions, state); + state->symbols->pop_scope(); + } /* Switch bodies do not have r-values. 
*/ return NULL; @@ -7135,11 +7167,21 @@ ast_iteration_statement::hir(exec_list *instructions, if (mode != ast_do_while) condition_to_hir(&stmt->body_instructions, state); - if (body != NULL) + if (rest_expression != NULL) + rest_expression->hir(&rest_instructions, state); + + if (body != NULL) { + if (mode == ast_do_while) + state->symbols->push_scope(); + body->hir(& stmt->body_instructions, state); + if (mode == ast_do_while) + state->symbols->pop_scope(); + } + if (rest_expression != NULL) - rest_expression->hir(& stmt->body_instructions, state); + stmt->body_instructions.append_list(&rest_instructions); if (mode == ast_do_while) condition_to_hir(&stmt->body_instructions, state); @@ -7189,6 +7231,7 @@ is_valid_default_precision_type(const struct glsl_type *const type) /* "int" and "float" are valid, but vectors and matrices are not. */ return type->vector_elements == 1 && type->matrix_columns == 1; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: case GLSL_TYPE_ATOMIC_UINT: return true; diff --git a/mesa 3D driver/src/compiler/glsl/builtin_variables.cpp b/mesa 3D driver/src/compiler/glsl/builtin_variables.cpp index 3a8ec615c1..d2d9d17702 100644 --- a/mesa 3D driver/src/compiler/glsl/builtin_variables.cpp +++ b/mesa 3D driver/src/compiler/glsl/builtin_variables.cpp @@ -1630,6 +1630,9 @@ builtin_variable_generator::generate_varyings() var->data.invariant = fields[i].location == VARYING_SLOT_POS && options->PositionAlwaysInvariant; + + var->data.precise = fields[i].location == VARYING_SLOT_POS && + options->PositionAlwaysPrecise; } } } diff --git a/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniform_initializers.c b/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniform_initializers.c index 6cbc7984eb..488bdba0bc 100644 --- a/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniform_initializers.c +++ b/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniform_initializers.c @@ -152,6 +152,7 @@ copy_constant_to_storage(union gl_constant_value *storage, break; case GLSL_TYPE_ARRAY: case GLSL_TYPE_STRUCT: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: case GLSL_TYPE_ATOMIC_UINT: case GLSL_TYPE_INTERFACE: diff --git a/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniforms.c b/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniforms.c index 4670b8520a..05b6e036e1 100644 --- a/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniforms.c +++ b/mesa 3D driver/src/compiler/glsl/gl_nir_link_uniforms.c @@ -394,7 +394,8 @@ add_var_use_deref(nir_deref_instr *deref, struct hash_table *live, if (deref->deref_type != nir_deref_type_var || !nir_deref_mode_is_one_of(deref, nir_var_uniform | nir_var_mem_ubo | - nir_var_mem_ssbo)) { + nir_var_mem_ssbo | + nir_var_image)) { nir_deref_path_finish(&path); return; } diff --git a/mesa 3D driver/src/compiler/glsl/gl_nir_linker.h b/mesa 3D driver/src/compiler/glsl/gl_nir_linker.h index 5171a2c68a..5b9e9de237 100644 --- a/mesa 3D driver/src/compiler/glsl/gl_nir_linker.h +++ b/mesa 3D driver/src/compiler/glsl/gl_nir_linker.h @@ -38,7 +38,8 @@ struct gl_nir_linker_options { #define nir_foreach_gl_uniform_variable(var, shader) \ nir_foreach_variable_with_modes(var, shader, nir_var_uniform | \ nir_var_mem_ubo | \ - nir_var_mem_ssbo) + nir_var_mem_ssbo | \ + nir_var_image) bool gl_nir_link_spirv(struct gl_context *ctx, struct gl_shader_program *prog, diff --git a/mesa 3D driver/src/compiler/glsl/gl_nir_lower_buffers.c b/mesa 3D driver/src/compiler/glsl/gl_nir_lower_buffers.c index d427d4a98b..06460ad6a5 100644 --- a/mesa 3D 
driver/src/compiler/glsl/gl_nir_lower_buffers.c +++ b/mesa 3D driver/src/compiler/glsl/gl_nir_lower_buffers.c @@ -39,7 +39,7 @@ get_block_array_index(nir_builder *b, nir_deref_instr *deref, * blocks later on as well as an optional dynamic index which gets added * to the block index later. */ - int binding = 0; + int const_array_offset = 0; const char *block_name = ""; nir_ssa_def *nonconst_index = NULL; while (deref->deref_type == nir_deref_type_array) { @@ -54,7 +54,7 @@ get_block_array_index(nir_builder *b, nir_deref_instr *deref, block_name = ralloc_asprintf(b->shader, "[%u]%s", arr_index, block_name); - binding += arr_index * array_elements; + const_array_offset += arr_index * array_elements; } else { nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1); arr_index = nir_umin(b, arr_index, nir_imm_int(b, arr_size - 1)); @@ -73,7 +73,7 @@ get_block_array_index(nir_builder *b, nir_deref_instr *deref, } assert(deref->deref_type == nir_deref_type_var); - binding += deref->var->data.binding; + int binding = const_array_offset + deref->var->data.binding; block_name = ralloc_asprintf(b->shader, "%s%s", glsl_get_type_name(deref->var->interface_type), block_name); @@ -98,6 +98,7 @@ get_block_array_index(nir_builder *b, nir_deref_instr *deref, for (unsigned i = 0; i < num_blocks; i++) { if (( use_bindings && binding == blocks[i]->Binding) || (!use_bindings && strcmp(block_name, blocks[i]->Name) == 0)) { + deref->var->data.driver_location = i - const_array_offset; if (nonconst_index) return nir_iadd_imm(b, nonconst_index, i); else @@ -144,6 +145,7 @@ get_block_index_offset(nir_variable *var, const char *block_name = glsl_get_type_name(var->interface_type); if (( use_bindings && blocks[i]->Binding == var->data.binding) || (!use_bindings && strcmp(block_name, blocks[i]->Name) == 0)) { + var->data.driver_location = i; *index = i; *offset = blocks[i]->Uniforms[var->data.location].Offset; return; @@ -318,6 +320,8 @@ lower_buffer_interface_derefs_impl(nir_function_impl *impl, if (progress) { nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); } return progress; @@ -329,6 +333,11 @@ gl_nir_lower_buffers(nir_shader *shader, { bool progress = false; + nir_foreach_variable_with_modes(var, shader, nir_var_mem_ubo | nir_var_mem_ssbo) { + var->data.driver_location = -1; + progress = true; + } + /* First, we lower the derefs to turn block variable and array derefs into * a nir_address_format_32bit_index_offset pointer. From there forward, * we leave the derefs in place and let nir_lower_explicit_io handle them. 
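The renaming in get_block_array_index() above is worth spelling out: constant array indices are now accumulated into const_array_offset, the variable's own binding is added only at the end, and the matched block index is cached in driver_location relative to that offset (which is why gl_nir_lower_buffers() first resets driver_location to -1 for every UBO/SSBO variable). A minimal sketch of the flattening arithmetic, with simplified types and a hypothetical helper name rather than the actual NIR deref walk:

static int
flat_block_index(int base_binding, const int *indices,
                 const int *array_sizes, int num_dims)
{
   int const_array_offset = 0;
   int array_elements = 1;

   /* Walk dimensions innermost-first, mirroring the deref chain walk. */
   for (int d = num_dims - 1; d >= 0; d--) {
      const_array_offset += indices[d] * array_elements;
      array_elements *= array_sizes[d];
   }

   return base_binding + const_array_offset;
}

For "uniform Block { ... } blocks[4][2];" with base binding 0, blocks[2][1] flattens to 2 * 2 + 1 = 5, and the variable's driver_location records the matched block index minus that constant offset.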
diff --git a/mesa 3D driver/src/compiler/glsl/gl_nir_lower_images.c b/mesa 3D driver/src/compiler/glsl/gl_nir_lower_images.c index ee74004de9..fde996051c 100644 --- a/mesa 3D driver/src/compiler/glsl/gl_nir_lower_images.c +++ b/mesa 3D driver/src/compiler/glsl/gl_nir_lower_images.c @@ -87,7 +87,7 @@ lower_impl(nir_builder *b, nir_instr *instr, bool bindless_only) return false; } - bool bindless = var->data.mode != nir_var_uniform || var->data.bindless; + bool bindless = var->data.mode != nir_var_image || var->data.bindless; if (bindless_only && !bindless) return false; diff --git a/mesa 3D driver/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c b/mesa 3D driver/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c index b56a887b7b..8c193e2115 100644 --- a/mesa 3D driver/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c +++ b/mesa 3D driver/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c @@ -141,7 +141,8 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state, nir_variable *var = nir_deref_instr_get_variable(deref); gl_shader_stage stage = state->shader->info.stage; - if (var->data.bindless || var->data.mode != nir_var_uniform) + if (!(var->data.mode & (nir_var_uniform | nir_var_image)) || + var->data.bindless) return NULL; nir_deref_path path; @@ -191,7 +192,7 @@ lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state, if (h) { var = (nir_variable *)h->data; } else { - var = nir_variable_create(state->shader, nir_var_uniform, type, name); + var = nir_variable_create(state->shader, var->data.mode, type, name); var->data.binding = binding; /* Don't set var->data.location. The old structure location could be @@ -232,12 +233,12 @@ record_textures_used(struct shader_info *info, const unsigned size = glsl_type_is_array(var->type) ? 
glsl_get_aoa_size(var->type) : 1; - BITSET_SET_RANGE(info->textures_used, var->data.binding, var->data.binding + (MAX2(size, 1) - 1)); + BITSET_SET_RANGE_INSIDE_WORD(info->textures_used, var->data.binding, var->data.binding + (MAX2(size, 1) - 1)); if (op == nir_texop_txf || op == nir_texop_txf_ms || op == nir_texop_txf_ms_mcs_intel) - BITSET_SET_RANGE(info->textures_used_by_txf, var->data.binding, var->data.binding + (MAX2(size, 1) - 1)); + BITSET_SET_RANGE_INSIDE_WORD(info->textures_used_by_txf, var->data.binding, var->data.binding + (MAX2(size, 1) - 1)); } static bool diff --git a/mesa 3D driver/src/compiler/glsl/glsl_parser.yy b/mesa 3D driver/src/compiler/glsl/glsl_parser.yy index ec66e680a2..4111c45c97 100644 --- a/mesa 3D driver/src/compiler/glsl/glsl_parser.yy +++ b/mesa 3D driver/src/compiler/glsl/glsl_parser.yy @@ -2743,7 +2743,7 @@ iteration_statement: NULL, $3, NULL, $5); $$->set_location_range(@1, @4); } - | DO statement WHILE '(' expression ')' ';' + | DO statement_no_new_scope WHILE '(' expression ')' ';' { void *ctx = state->linalloc; $$ = new(ctx) ast_iteration_statement(ast_iteration_statement::ast_do_while, diff --git a/mesa 3D driver/src/compiler/glsl/glsl_to_nir.cpp b/mesa 3D driver/src/compiler/glsl/glsl_to_nir.cpp index 3baa7fc84e..7ab447dcf0 100644 --- a/mesa 3D driver/src/compiler/glsl/glsl_to_nir.cpp +++ b/mesa 3D driver/src/compiler/glsl/glsl_to_nir.cpp @@ -256,6 +256,7 @@ glsl_to_nir(struct gl_context *ctx, if (shader->info.stage == MESA_SHADER_FRAGMENT) { shader->info.fs.pixel_center_integer = sh->Program->info.fs.pixel_center_integer; shader->info.fs.origin_upper_left = sh->Program->info.fs.origin_upper_left; + shader->info.fs.advanced_blend_modes = sh->Program->info.fs.advanced_blend_modes; } return shader; @@ -431,17 +432,6 @@ nir_visitor::constant_copy(ir_constant *ir, void *mem_ctx) return ret; } -static const glsl_type * -wrap_type_in_array(const glsl_type *elem_type, const glsl_type *array_type) -{ - if (!array_type->is_array()) - return elem_type; - - elem_type = wrap_type_in_array(elem_type, array_type->fields.array); - - return glsl_type::get_array_instance(elem_type, array_type->length); -} - static unsigned get_nir_how_declared(unsigned how_declared) { @@ -544,6 +534,8 @@ nir_visitor::visit(ir_variable *ir) case ir_var_uniform: if (ir->get_interface_type()) var->data.mode = nir_var_mem_ubo; + else if (ir->type->contains_image() && !ir->data.bindless) + var->data.mode = nir_var_image; else var->data.mode = nir_var_uniform; break; @@ -585,7 +577,7 @@ nir_visitor::visit(ir_variable *ir) /* If the type contains the interface, wrap the explicit type in the * right number of arrays. 
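 *
 * (For instance, if the declared type is I[3][2] for an interface I and
 * the explicit layout produced element type E, the wrapped result is
 * E[3][2]: the element type is swapped while every array dimension is
 * kept.)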
*/ - var->type = wrap_type_in_array(explicit_ifc_type, ir->type); + var->type = glsl_type_wrap_in_arrays(explicit_ifc_type, ir->type); } else { /* Otherwise, this variable is one entry in the interface */ UNUSED bool found = false; @@ -1634,7 +1626,7 @@ nir_visitor::visit(ir_call *ir) nir_ssa_def *val = evaluate_rvalue(param_rvalue); nir_src src = nir_src_for_ssa(val); - nir_src_copy(&call->params[i], &src, call); + nir_src_copy(&call->params[i], &src); } else if (sig_param->data.mode == ir_var_function_inout) { unreachable("unimplemented: inout parameters"); } diff --git a/mesa 3D driver/src/compiler/glsl/ir_clone.cpp b/mesa 3D driver/src/compiler/glsl/ir_clone.cpp index e46d07d6f4..a64e88b3d6 100644 --- a/mesa 3D driver/src/compiler/glsl/ir_clone.cpp +++ b/mesa 3D driver/src/compiler/glsl/ir_clone.cpp @@ -354,6 +354,7 @@ ir_constant::clone(void *mem_ctx, struct hash_table *ht) const case GLSL_TYPE_UINT8: case GLSL_TYPE_INT8: case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: return new(mem_ctx) ir_constant(this->type, &this->value); diff --git a/mesa 3D driver/src/compiler/glsl/link_uniform_initializers.cpp b/mesa 3D driver/src/compiler/glsl/link_uniform_initializers.cpp index 076ff5cea3..1179a105ed 100644 --- a/mesa 3D driver/src/compiler/glsl/link_uniform_initializers.cpp +++ b/mesa 3D driver/src/compiler/glsl/link_uniform_initializers.cpp @@ -74,6 +74,7 @@ copy_constant_to_storage(union gl_constant_value *storage, break; case GLSL_TYPE_ARRAY: case GLSL_TYPE_STRUCT: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: case GLSL_TYPE_ATOMIC_UINT: case GLSL_TYPE_INTERFACE: diff --git a/mesa 3D driver/src/compiler/glsl/link_varyings.cpp b/mesa 3D driver/src/compiler/glsl/link_varyings.cpp index 9954a73147..abae4377e8 100644 --- a/mesa 3D driver/src/compiler/glsl/link_varyings.cpp +++ b/mesa 3D driver/src/compiler/glsl/link_varyings.cpp @@ -660,9 +660,11 @@ validate_explicit_variable_location(struct gl_context *ctx, glsl_struct_field *field = &type_without_array->fields.structure[i]; unsigned field_location = field->location - (field->patch ? VARYING_SLOT_PATCH0 : VARYING_SLOT_VAR0); + unsigned field_slots = field->type->count_attribute_slots(false); if (!check_location_aliasing(explicit_locations, var, field_location, - 0, field_location + 1, + 0, + field_location + field_slots, field->type, field->interpolation, field->centroid, @@ -2171,7 +2173,7 @@ varying_matches::store_locations() const /* Check if a location needs to be packed with lower_packed_varyings() or if * we can just use ARB_enhanced_layouts packing.
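 *
 * (pack_loc[] below marks the locations that still need the full
 * lower_packed_varyings() path, while loc_type[][] records the type seen
 * at each location/component so ARB_enhanced_layouts-style packing can
 * be used instead where possible.)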
*/ - bool pack_loc[MAX_VARYINGS_INCL_PATCH] = { 0 }; + bool pack_loc[MAX_VARYINGS_INCL_PATCH] = {}; const glsl_type *loc_type[MAX_VARYINGS_INCL_PATCH][4] = { {NULL, NULL} }; for (unsigned i = 0; i < this->num_matches; i++) { diff --git a/mesa 3D driver/src/compiler/glsl/linker.cpp b/mesa 3D driver/src/compiler/glsl/linker.cpp index b598b63c09..dd575a87dd 100644 --- a/mesa 3D driver/src/compiler/glsl/linker.cpp +++ b/mesa 3D driver/src/compiler/glsl/linker.cpp @@ -2126,7 +2126,7 @@ link_fs_inout_layout_qualifiers(struct gl_shader_program *prog, shader->SampleInterlockOrdered; linked_shader->Program->info.fs.sample_interlock_unordered |= shader->SampleInterlockUnordered; - linked_shader->Program->sh.fs.BlendSupport |= shader->BlendSupport; + linked_shader->Program->info.fs.advanced_blend_modes |= shader->BlendSupport; } linked_shader->Program->info.fs.pixel_center_integer = pixel_center_integer; diff --git a/mesa 3D driver/src/compiler/glsl/lower_blend_equation_advanced.cpp b/mesa 3D driver/src/compiler/glsl/lower_blend_equation_advanced.cpp index 5ea4222b59..9060e83cdf 100644 --- a/mesa 3D driver/src/compiler/glsl/lower_blend_equation_advanced.cpp +++ b/mesa 3D driver/src/compiler/glsl/lower_blend_equation_advanced.cpp @@ -463,7 +463,9 @@ get_main(gl_linked_shader *sh) bool lower_blend_equation_advanced(struct gl_linked_shader *sh, bool coherent) { - if (sh->Program->sh.fs.BlendSupport == 0) + assert(sh->Stage == MESA_SHADER_FRAGMENT); + + if (sh->Program->info.fs.advanced_blend_modes == 0) return false; /* Lower early returns in main() so there's a single exit point @@ -548,7 +550,7 @@ lower_blend_equation_advanced(struct gl_linked_shader *sh, bool coherent) ir_variable *result_dest = calc_blend_result(f, mode, fb, blend_source, - sh->Program->sh.fs.BlendSupport); + sh->Program->info.fs.advanced_blend_modes); /* Copy the result back to the original values. 
It would be simpler * to demote the program's output variables, and create a new vec4 diff --git a/mesa 3D driver/src/compiler/glsl/opt_dead_builtin_varyings.cpp b/mesa 3D driver/src/compiler/glsl/opt_dead_builtin_varyings.cpp index 3efe658232..6ea18bedb8 100644 --- a/mesa 3D driver/src/compiler/glsl/opt_dead_builtin_varyings.cpp +++ b/mesa 3D driver/src/compiler/glsl/opt_dead_builtin_varyings.cpp @@ -527,7 +527,7 @@ lower_fragdata_array(struct gl_linked_shader *shader) varying_info_visitor info(ir_var_shader_out, true); info.get(shader->ir, 0, NULL); - replace_varyings_visitor(shader, &info, 0, 0, 0); + replace_varyings_visitor(shader, &info, 0, 0, false); } diff --git a/mesa 3D driver/src/compiler/glsl/serialize.cpp b/mesa 3D driver/src/compiler/glsl/serialize.cpp index b5c286b0f2..3356123ab7 100644 --- a/mesa 3D driver/src/compiler/glsl/serialize.cpp +++ b/mesa 3D driver/src/compiler/glsl/serialize.cpp @@ -1127,9 +1127,6 @@ write_shader_metadata(struct blob *metadata, gl_linked_shader *shader) sizeof(struct gl_bindless_image) - ptr_size); } - blob_write_bytes(metadata, &glprog->sh.fs.BlendSupport, - sizeof(glprog->sh.fs.BlendSupport)); - write_shader_parameters(metadata, glprog->Parameters); assert((glprog->driver_cache_blob == NULL) == @@ -1194,9 +1191,6 @@ read_shader_metadata(struct blob_reader *metadata, } } - blob_copy_bytes(metadata, (uint8_t *) &glprog->sh.fs.BlendSupport, - sizeof(glprog->sh.fs.BlendSupport)); - glprog->Parameters = _mesa_new_parameter_list(); read_shader_parameters(metadata, glprog->Parameters); diff --git a/mesa 3D driver/src/compiler/glsl/tests/meson.build b/mesa 3D driver/src/compiler/glsl/tests/meson.build index c8a84a8180..4a565bb21c 100644 --- a/mesa 3D driver/src/compiler/glsl/tests/meson.build +++ b/mesa 3D driver/src/compiler/glsl/tests/meson.build @@ -33,6 +33,7 @@ test( dependencies : [dep_clock, dep_thread, idep_gtest, idep_mesautil], ), suite : ['compiler', 'glsl'], + protocol : gtest_test_protocol, ) test( @@ -49,6 +50,7 @@ test( dependencies : [dep_thread, idep_gtest, idep_mesautil], ), suite : ['compiler', 'glsl'], + protocol : gtest_test_protocol, ) test( @@ -63,6 +65,7 @@ test( dependencies : [dep_thread, idep_gtest, idep_mesautil], ), suite : ['compiler', 'glsl'], + protocol : gtest_test_protocol, ) test( @@ -77,6 +80,7 @@ test( dependencies : [dep_thread, idep_gtest], ), suite : ['compiler', 'glsl'], + protocol : gtest_test_protocol, ) # Meson can't auto-skip these on cross builds because of the python wrapper diff --git a/mesa 3D driver/src/compiler/glsl/tests/uniform_initializer_utils.cpp b/mesa 3D driver/src/compiler/glsl/tests/uniform_initializer_utils.cpp index 8c00c69b29..07109d1a54 100644 --- a/mesa 3D driver/src/compiler/glsl/tests/uniform_initializer_utils.cpp +++ b/mesa 3D driver/src/compiler/glsl/tests/uniform_initializer_utils.cpp @@ -84,6 +84,7 @@ generate_data_element(void *mem_ctx, const glsl_type *type, case GLSL_TYPE_UINT: case GLSL_TYPE_INT: case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: data.i[i] = values[idx]; break; @@ -129,6 +130,7 @@ generate_data_element(void *mem_ctx, const glsl_type *type, case GLSL_TYPE_UINT: case GLSL_TYPE_INT: case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: ASSERT_EQ(data.i[i], val->value.i[i]); break; @@ -262,6 +264,7 @@ verify_data(gl_constant_value *storage, unsigned storage_array_size, case GLSL_TYPE_UINT: case GLSL_TYPE_INT: case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: EXPECT_EQ(val->value.i[i], storage[i].i); break; 
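A pattern that repeats across the hunks above and below: every switch over glsl_base_type that already treats samplers and images as opaque, handle-like types gains a matching GLSL_TYPE_TEXTURE case with identical behavior, since the new texture base type has the same storage footprint. A compact way to picture the invariant (a hypothetical helper, not something this patch adds):

static bool
is_opaque_handle_type(enum glsl_base_type t)
{
   switch (t) {
   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_TEXTURE:   /* new base type in this patch */
   case GLSL_TYPE_IMAGE:
      return true;
   default:
      return false;
   }
}

When auditing a change like this, grepping for GLSL_TYPE_SAMPLER case labels is a quick way to confirm no switch was missed.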
diff --git a/mesa 3D driver/src/compiler/glsl_types.cpp b/mesa 3D driver/src/compiler/glsl_types.cpp index 334c05e73b..627a43fee5 100644 --- a/mesa 3D driver/src/compiler/glsl_types.cpp +++ b/mesa 3D driver/src/compiler/glsl_types.cpp @@ -460,6 +460,7 @@ const glsl_type *glsl_type::get_bare_type() const this->length); case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: case GLSL_TYPE_ATOMIC_UINT: case GLSL_TYPE_VOID: @@ -950,6 +951,118 @@ glsl_type::get_sampler_instance(enum glsl_sampler_dim dim, unreachable("switch statement above should be complete"); } +const glsl_type * +glsl_type::get_texture_instance(enum glsl_sampler_dim dim, + bool array, glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_FLOAT: + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + return (array ? texture1DArray_type : texture1D_type); + case GLSL_SAMPLER_DIM_2D: + return (array ? texture2DArray_type : texture2D_type); + case GLSL_SAMPLER_DIM_3D: + return texture3D_type; + case GLSL_SAMPLER_DIM_CUBE: + return (array ? textureCubeArray_type : textureCube_type); + case GLSL_SAMPLER_DIM_RECT: + if (array) + return error_type; + else + return texture2DRect_type; + case GLSL_SAMPLER_DIM_BUF: + if (array) + return error_type; + else + return textureBuffer_type; + case GLSL_SAMPLER_DIM_MS: + return (array ? texture2DMSArray_type : texture2DMS_type); + case GLSL_SAMPLER_DIM_SUBPASS: + return subpassInput_type; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return subpassInputMS_type; + case GLSL_SAMPLER_DIM_EXTERNAL: + return error_type; + } + case GLSL_TYPE_INT: + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + return (array ? itexture1DArray_type : itexture1D_type); + case GLSL_SAMPLER_DIM_2D: + return (array ? itexture2DArray_type : itexture2D_type); + case GLSL_SAMPLER_DIM_3D: + if (array) + return error_type; + return itexture3D_type; + case GLSL_SAMPLER_DIM_CUBE: + return (array ? itextureCubeArray_type : itextureCube_type); + case GLSL_SAMPLER_DIM_RECT: + if (array) + return error_type; + return itexture2DRect_type; + case GLSL_SAMPLER_DIM_BUF: + if (array) + return error_type; + return itextureBuffer_type; + case GLSL_SAMPLER_DIM_MS: + return (array ? itexture2DMSArray_type : itexture2DMS_type); + case GLSL_SAMPLER_DIM_SUBPASS: + return isubpassInput_type; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return isubpassInputMS_type; + case GLSL_SAMPLER_DIM_EXTERNAL: + return error_type; + } + case GLSL_TYPE_UINT: + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + return (array ? utexture1DArray_type : utexture1D_type); + case GLSL_SAMPLER_DIM_2D: + return (array ? utexture2DArray_type : utexture2D_type); + case GLSL_SAMPLER_DIM_3D: + if (array) + return error_type; + return utexture3D_type; + case GLSL_SAMPLER_DIM_CUBE: + return (array ? utextureCubeArray_type : utextureCube_type); + case GLSL_SAMPLER_DIM_RECT: + if (array) + return error_type; + return utexture2DRect_type; + case GLSL_SAMPLER_DIM_BUF: + if (array) + return error_type; + return utextureBuffer_type; + case GLSL_SAMPLER_DIM_MS: + return (array ? utexture2DMSArray_type : utexture2DMS_type); + case GLSL_SAMPLER_DIM_SUBPASS: + return usubpassInput_type; + case GLSL_SAMPLER_DIM_SUBPASS_MS: + return usubpassInputMS_type; + case GLSL_SAMPLER_DIM_EXTERNAL: + return error_type; + } + case GLSL_TYPE_VOID: + switch (dim) { + case GLSL_SAMPLER_DIM_1D: + return (array ? vtexture1DArray_type : vtexture1D_type); + case GLSL_SAMPLER_DIM_2D: + return (array ? vtexture2DArray_type : vtexture2D_type); + case GLSL_SAMPLER_DIM_3D: + return (array ? 
error_type : vtexture3D_type); + case GLSL_SAMPLER_DIM_BUF: + return (array ? error_type : vbuffer_type); + default: + return error_type; + } + default: + return error_type; + } + + unreachable("switch statement above should be complete"); +} + const glsl_type * glsl_type::get_image_instance(enum glsl_sampler_dim dim, bool array, glsl_base_type type) @@ -1630,6 +1743,7 @@ glsl_type::component_slots() const return this->length * this->fields.array->component_slots(); case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: return 2; @@ -1696,6 +1810,7 @@ glsl_type::component_slots_aligned(unsigned offset) const } case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: return 2 + ((offset % 4) == 3 ? 1 : 0); @@ -1772,6 +1887,7 @@ glsl_type::uniform_locations() const case GLSL_TYPE_INT64: case GLSL_TYPE_BOOL: case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: case GLSL_TYPE_SUBROUTINE: return 1; @@ -2810,6 +2926,7 @@ glsl_type::count_vec4_slots(bool is_gl_vertex_input, bool is_bindless) const } case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: if (!is_bindless) return 0; @@ -2849,6 +2966,7 @@ glsl_type::count_dword_slots(bool is_bindless) const return DIV_ROUND_UP(this->components(), 4); case GLSL_TYPE_IMAGE: case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: if (!is_bindless) return 0; FALLTHROUGH; @@ -3022,8 +3140,13 @@ encode_type_to_blob(struct blob *blob, const glsl_type *type) blob_write_uint32(blob, type->explicit_alignment); return; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: + case GLSL_TYPE_IMAGE: encoded.sampler.dimensionality = type->sampler_dimensionality; - encoded.sampler.shadow = type->sampler_shadow; + if (type->base_type == GLSL_TYPE_SAMPLER) + encoded.sampler.shadow = type->sampler_shadow; + else + assert(!type->sampler_shadow); encoded.sampler.array = type->sampler_array; encoded.sampler.sampled_type = type->sampled_type; break; @@ -3031,11 +3154,6 @@ encode_type_to_blob(struct blob *blob, const glsl_type *type) blob_write_uint32(blob, encoded.u32); blob_write_string(blob, type->name); return; - case GLSL_TYPE_IMAGE: - encoded.sampler.dimensionality = type->sampler_dimensionality; - encoded.sampler.array = type->sampler_array; - encoded.sampler.sampled_type = type->sampled_type; - break; case GLSL_TYPE_ATOMIC_UINT: break; case GLSL_TYPE_ARRAY: @@ -3135,6 +3253,10 @@ decode_type_from_blob(struct blob_reader *blob) encoded.sampler.shadow, encoded.sampler.array, (glsl_base_type) encoded.sampler.sampled_type); + case GLSL_TYPE_TEXTURE: + return glsl_type::get_texture_instance((enum glsl_sampler_dim)encoded.sampler.dimensionality, + encoded.sampler.array, + (glsl_base_type) encoded.sampler.sampled_type); case GLSL_TYPE_SUBROUTINE: return glsl_type::get_subroutine_instance(blob_read_string(blob)); case GLSL_TYPE_IMAGE: diff --git a/mesa 3D driver/src/compiler/glsl_types.h b/mesa 3D driver/src/compiler/glsl_types.h index 62b10885b4..627b065286 100644 --- a/mesa 3D driver/src/compiler/glsl_types.h +++ b/mesa 3D driver/src/compiler/glsl_types.h @@ -84,6 +84,7 @@ enum glsl_base_type { GLSL_TYPE_INT64, GLSL_TYPE_BOOL, GLSL_TYPE_SAMPLER, + GLSL_TYPE_TEXTURE, GLSL_TYPE_IMAGE, GLSL_TYPE_ATOMIC_UINT, GLSL_TYPE_STRUCT, @@ -122,6 +123,7 @@ static unsigned glsl_base_type_bit_size(enum glsl_base_type type) case GLSL_TYPE_INT64: case GLSL_TYPE_UINT64: case GLSL_TYPE_IMAGE: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_SAMPLER: return 64; @@ -158,6 +160,7 @@ static inline bool glsl_base_type_is_integer(enum 
glsl_base_type type) type == GLSL_TYPE_INT64 || type == GLSL_TYPE_BOOL || type == GLSL_TYPE_SAMPLER || + type == GLSL_TYPE_TEXTURE || type == GLSL_TYPE_IMAGE; } @@ -188,6 +191,7 @@ glsl_base_type_get_bit_size(const enum glsl_base_type base_type) case GLSL_TYPE_UINT64: case GLSL_TYPE_IMAGE: case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: return 64; default: @@ -459,6 +463,10 @@ struct glsl_type { bool array, glsl_base_type type); + + static const glsl_type *get_texture_instance(enum glsl_sampler_dim dim, + bool array, + glsl_base_type type); + static const glsl_type *get_image_instance(enum glsl_sampler_dim dim, bool array, glsl_base_type type); @@ -941,6 +949,14 @@ struct glsl_type { return base_type == GLSL_TYPE_SAMPLER; } + /** + * Query whether or not a type is a texture + */ + bool is_texture() const + { + return base_type == GLSL_TYPE_TEXTURE; + } + /** * Query whether or not type is a sampler, or for struct, interface and * array types, contains a sampler. diff --git a/mesa 3D driver/src/compiler/isaspec/README.rst b/mesa 3D driver/src/compiler/isaspec/README.rst new file mode 100644 index 0000000000..27379790c0 --- /dev/null +++ b/mesa 3D driver/src/compiler/isaspec/README.rst @@ -0,0 +1 @@ +../../../docs/drivers/freedreno/isaspec.rst \ No newline at end of file diff --git a/mesa 3D driver/src/compiler/isaspec/decode.c b/mesa 3D driver/src/compiler/isaspec/decode.c new file mode 100644 index 0000000000..07a6313d67 --- /dev/null +++ b/mesa 3D driver/src/compiler/isaspec/decode.c @@ -0,0 +1,793 @@ +/* + * Copyright © 2020 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "util/bitset.h" +#include "util/compiler.h" +#include "util/half_float.h" +#include "util/hash_table.h" +#include "util/ralloc.h" +#include "util/u_debug.h" +#include "util/u_math.h" + +#include "decode.h" +#include "isa.h" + +/** + * The set of leaf node bitsets in the bitset hierarchy which defines all + * the possible instructions. + * + * TODO maybe we want to pass this in as a parameter so this same decoder + * can work with multiple different instruction sets. + */ +extern const struct isa_bitset *__instruction[]; + +struct decode_state; + +/** + * Decode scope. When parsing a field that is itself a bitset, we push a + * new scope to the stack.
A nested bitset is allowed to resolve fields + * from an enclosing scope (needed, for example, to decode src register + * bitsets, where half/fullness is determined by fields outside of the + * bitset, in the instruction containing the bitset). + * + * But the field being resolved could be a derived field, or different + * depending on an override at a higher level of the stack, requiring + * expression evaluation which could in turn reference variables which + * triggers a recursive field lookup. But those lookups should not start + * from the top of the stack, but instead the current stack level. This + * prevents a field from accidentally resolving to different values + * depending on the starting point of the lookup. (Not only causing + * confusion, but this is behavior we don't want to depend on if we + * wanted to optimize things by caching field lookup results.) + */ +struct decode_scope { + /** + * Enclosing scope + */ + struct decode_scope *parent; + + /** + * Current bitset value being decoded + */ + bitmask_t val; + + /** + * Current bitset. + */ + const struct isa_bitset *bitset; + + /** + * Field name remapping. + */ + const struct isa_field_params *params; + + /** + * Pointer back to decode state, for convenience. + */ + struct decode_state *state; + + /** + * Cache expression evaluation results. Expressions for overrides can + * be repeatedly evaluated for each field being resolved. And each + * field reference to a derived field (potentially from another expr) + * would require re-evaluation. But for a given scope, each evaluation + * of an expression gives the same result. So we can cache to speed + * things up. + * + * TODO we could maybe be clever and assign a unique idx to each expr + * and use a direct lookup table? Would be a bit more clever if it was + * smart enough to allow unrelated expressions that are never involved + * in a given scope to have overlapping cache lookup idx's. + */ + struct hash_table *cache; +}; + +/** + * Current decode state + */ +struct decode_state { + const struct isa_decode_options *options; + FILE *out; + + /** + * Current instruction being decoded: + */ + unsigned n; + + /** + * Number of instructions being decoded + */ + unsigned num_instr; + + /** + * Column number of current line + */ + unsigned line_column; + + /** + * Bitset of instructions that are branch targets (if options->branch_labels + * is enabled) + */ + BITSET_WORD *branch_targets; + + /** + * We allow a limited amount of expression evaluation recursion, but + * not recursive evaluation of any given expression, to prevent infinite + * recursion. + */ + int expr_sp; + isa_expr_t expr_stack[8]; + + /** + * Current topmost/innermost level of scope used for decoding fields, + * including derived fields which may in turn rely on decoding other + * fields, potentially from a lower/outer level in the stack. + */ + struct decode_scope *scope; + + /** + * A small fixed upper limit on # of decode errors to capture per- + * instruction seems reasonable. + */ + unsigned num_errors; + char *errors[4]; +}; + +static void +print(struct decode_state *state, const char *fmt, ...)
+{ + char *buffer; + va_list args; + int ret; + + va_start(args, fmt); + ret = vasprintf(&buffer, fmt, args); + va_end(args); + + if (ret != -1) { + const size_t len = strlen(buffer); + + for (size_t i = 0; i < len; i++) { + const char c = buffer[i]; + + fputc(c, state->out); + state->line_column++; + + if (c == '\n') { + state->line_column = 0; + } + } + + free(buffer); + + return; + } +} + +static void display(struct decode_scope *scope); +static void decode_error(struct decode_state *state, const char *fmt, ...) _util_printf_format(2,3); + +static void +decode_error(struct decode_state *state, const char *fmt, ...) +{ + if (!state->options->show_errors) { + return; + } + + if (state->num_errors == ARRAY_SIZE(state->errors)) { + /* too many errors, bail */ + return; + } + + va_list ap; + va_start(ap, fmt); + vasprintf(&state->errors[state->num_errors++], fmt, ap); + va_end(ap); +} + +static unsigned +flush_errors(struct decode_state *state) +{ + unsigned num_errors = state->num_errors; + if (num_errors > 0) + print(state, "\t; "); + for (unsigned i = 0; i < num_errors; i++) { + print(state, "%s%s", (i > 0) ? ", " : "", state->errors[i]); + free(state->errors[i]); + } + state->num_errors = 0; + return num_errors; +} + + +static bool +push_expr(struct decode_state *state, isa_expr_t expr) +{ + for (int i = state->expr_sp - 1; i > 0; i--) { + if (state->expr_stack[i] == expr) { + return false; + } + } + state->expr_stack[state->expr_sp++] = expr; + return true; +} + +static void +pop_expr(struct decode_state *state) +{ + assert(state->expr_sp > 0); + state->expr_sp--; +} + +static struct decode_scope * +push_scope(struct decode_state *state, const struct isa_bitset *bitset, bitmask_t val) +{ + struct decode_scope *scope = rzalloc_size(state, sizeof(*scope)); + + BITSET_COPY(scope->val.bitset, val.bitset); + scope->bitset = bitset; + scope->parent = state->scope; + scope->state = state; + + state->scope = scope; + + return scope; +} + +static void +pop_scope(struct decode_scope *scope) +{ + assert(scope->state->scope == scope); /* must be top of stack */ + + scope->state->scope = scope->parent; + ralloc_free(scope); +} + +/** + * Evaluate an expression, returning its resulting value + */ +static uint64_t +evaluate_expr(struct decode_scope *scope, isa_expr_t expr) +{ + if (scope->cache) { + struct hash_entry *entry = _mesa_hash_table_search(scope->cache, expr); + if (entry) { + return *(uint64_t *)entry->data; + } + } else { + scope->cache = _mesa_pointer_hash_table_create(scope); + } + + if (!push_expr(scope->state, expr)) + return 0; + + uint64_t ret = expr(scope); + + pop_expr(scope->state); + + uint64_t *retp = ralloc_size(scope->cache, sizeof(*retp)); + *retp = ret; + _mesa_hash_table_insert(scope->cache, expr, retp); + + return ret; +} + +/** + * Find the bitset in the NULL-terminated bitset hierarchy root table which + * matches against 'val' + */ +static const struct isa_bitset * +find_bitset(struct decode_state *state, const struct isa_bitset **bitsets, + bitmask_t val) +{ + const struct isa_bitset *match = NULL; + for (int n = 0; bitsets[n]; n++) { + if (state->options->gpu_id > bitsets[n]->gen.max) + continue; + if (state->options->gpu_id < bitsets[n]->gen.min) + continue; + + // m = (val & bitsets[n]->mask) & ~bitsets[n]->dontcare; + bitmask_t m = { 0 }; + bitmask_t not_dontcare; + + BITSET_AND(m.bitset, val.bitset, bitsets[n]->mask.bitset); + + BITSET_COPY(not_dontcare.bitset, bitsets[n]->dontcare.bitset); + BITSET_NOT(not_dontcare.bitset); + + BITSET_AND(m.bitset, m.bitset,
not_dontcare.bitset); + + if (!BITSET_EQUAL(m.bitset, bitsets[n]->match.bitset)) { + continue; + } + + /* We should only have exactly one match + * + * TODO more complete/formal way to validate that any given + * bit pattern will only have a single match? + */ + if (match) { + decode_error(state, "bitset conflict: %s vs %s", match->name, + bitsets[n]->name); + return NULL; + } + + match = bitsets[n]; + } + + if (match) { + bitmask_t m = { 0 }; + BITSET_AND(m.bitset, match->dontcare.bitset, val.bitset); + + if (BITSET_COUNT(m.bitset)) { + decode_error(state, "dontcare bits in %s: %"BITSET_FORMAT, + match->name, BITSET_VALUE(m.bitset)); + } + } + + return match; +} + +static const struct isa_field * +find_field(struct decode_scope *scope, const struct isa_bitset *bitset, + const char *name, size_t name_len) +{ + for (unsigned i = 0; i < bitset->num_cases; i++) { + const struct isa_case *c = bitset->cases[i]; + + if (c->expr) { + struct decode_state *state = scope->state; + + /* When resolving a field for evaluating an expression, + * temporarily assume the expression evaluates to true. + * This allows overrides to speculatively refer to + * fields defined within the override: + */ + isa_expr_t cur_expr = NULL; + if (state->expr_sp > 0) + cur_expr = state->expr_stack[state->expr_sp - 1]; + if ((cur_expr != c->expr) && !evaluate_expr(scope, c->expr)) + continue; + } + + for (unsigned i = 0; i < c->num_fields; i++) { + if (!strncmp(name, c->fields[i].name, name_len) && + (c->fields[i].name[name_len] == '\0')) { + return &c->fields[i]; + } + } + } + + if (bitset->parent) { + const struct isa_field *f = find_field(scope, bitset->parent, name, name_len); + if (f) { + return f; + } + } + + return NULL; +} + +static bitmask_t +extract_field(struct decode_scope *scope, const struct isa_field *field) +{ + bitmask_t val, mask; + + BITSET_COPY(val.bitset, scope->val.bitset); + BITSET_ZERO(mask.bitset); + + BITSET_SET_RANGE(mask.bitset, field->low, field->high); + BITSET_AND(val.bitset, val.bitset, mask.bitset); + BITSET_SHR(val.bitset, field->low); + + return val; +} + +/** + * Find the display template for a given bitset, recursively searching + * parents in the bitset hierarchy. + */ +static const char * +find_display(struct decode_scope *scope, const struct isa_bitset *bitset) +{ + for (unsigned i = 0; i < bitset->num_cases; i++) { + const struct isa_case *c = bitset->cases[i]; + if (c->expr && !evaluate_expr(scope, c->expr)) + continue; + /* since this is the chosen case, it seems like a good place + * to check asserted bits: + */ + for (unsigned j = 0; j < c->num_fields; j++) { + if (c->fields[j].type == TYPE_ASSERT) { + const struct isa_field *f = &c->fields[j]; + bitmask_t val; + + val = extract_field(scope, f); + if (!BITSET_EQUAL(val.bitset, f->val.bitset)) { + decode_error(scope->state, "WARNING: unexpected " + "bits[%u:%u] in %s: %"BITSET_FORMAT" vs %"BITSET_FORMAT, + f->low, f->high, bitset->name, + BITSET_VALUE(val.bitset), BITSET_VALUE(f->val.bitset)); + } + } + } + if (!c->display) + continue; + return c->display; + } + + /** + * If we didn't find something, check up the bitset hierarchy.
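 *
 * (Display templates are strings like "{NAME} {DST}, {SRC1}"; display()
 * below walks the template and hands each {FIELD} reference to
 * display_field(), which resolves it against the scope stack.)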
+ */ + if (bitset->parent) { + return find_display(scope, bitset->parent); + } + + return NULL; +} + +/** + * Decode a field that is itself another bitset type + */ +static void +display_bitset_field(struct decode_scope *scope, const struct isa_field *field, bitmask_t val) +{ + const struct isa_bitset *b = find_bitset(scope->state, field->bitsets, val); + if (!b) { + decode_error(scope->state, "no match: FIELD: '%s.%s': %"BITSET_FORMAT, + scope->bitset->name, field->name, BITSET_VALUE(val.bitset)); + return; + } + + struct decode_scope *nested_scope = + push_scope(scope->state, b, val); + nested_scope->params = field->params; + display(nested_scope); + pop_scope(nested_scope); +} + +static void +display_enum_field(struct decode_scope *scope, const struct isa_field *field, bitmask_t val) +{ + const struct isa_enum *e = field->enums; + const uint64_t ui = bitmask_to_uint64_t(val); + + for (unsigned i = 0; i < e->num_values; i++) { + if (e->values[i].val == ui) { + print(scope->state, "%s", e->values[i].display); + return; + } + } + + print(scope->state, "%u", (unsigned)ui); +} + +static const struct isa_field * +resolve_field(struct decode_scope *scope, const char *field_name, size_t field_name_len, bitmask_t *valp) +{ + if (!scope) { + /* We've reached the bottom of the stack! */ + return NULL; + } + + const struct isa_field *field = + find_field(scope, scope->bitset, field_name, field_name_len); + + if (!field && scope->params) { + for (unsigned i = 0; i < scope->params->num_params; i++) { + if (!strncmp(field_name, scope->params->params[i].as, field_name_len) && + (scope->params->params[i].as[field_name_len] == '\0')) { + const char *param_name = scope->params->params[i].name; + return resolve_field(scope->parent, param_name, strlen(param_name), valp); + } + } + } + + if (!field) { + return NULL; + } + + /* extract out raw field value: */ + if (field->expr) { + uint64_t val = evaluate_expr(scope, field->expr); + + *valp = uint64_t_to_bitmask(val); + } else { + *valp = extract_field(scope, field); + } + + return field; +} + +/* This is also used from generated expr functions */ +uint64_t +isa_decode_field(struct decode_scope *scope, const char *field_name) +{ + bitmask_t val; + const struct isa_field *field = resolve_field(scope, field_name, strlen(field_name), &val); + if (!field) { + decode_error(scope->state, "no field '%s'", field_name); + return 0; + } + + return bitmask_to_uint64_t(val); +} + +static void +display_field(struct decode_scope *scope, const char *field_name) +{ + const struct isa_decode_options *options = scope->state->options; + struct decode_state *state = scope->state; + size_t field_name_len = strlen(field_name); + int num_align = 0; + + /* alignment handling */ + const char *align = strstr(field_name, ":align="); + + if (align) { + const char *value = strstr(align, "=") + 1; + + field_name_len = align - field_name; + num_align = atoi(value); + } + + /* Special case 'NAME' maps to instruction/bitset name: */ + if (!strncmp("NAME", field_name, field_name_len)) { + if (options->field_cb) { + options->field_cb(options->cbdata, field_name, &(struct isa_decode_value){ + .str = scope->bitset->name, + }); + } + + while (scope->state->line_column < num_align) + print(state, " "); + + print(scope->state, "%s", scope->bitset->name); + + return; + } + + bitmask_t v; + const struct isa_field *field = resolve_field(scope, field_name, field_name_len, &v); + if (!field) { + decode_error(scope->state, "no field '%.*s'", (int)field_name_len, field_name); + return; + } + + uint64_t 
val = bitmask_to_uint64_t(v); + + if (options->field_cb) { + options->field_cb(options->cbdata, field_name, &(struct isa_decode_value){ + .num = val, + }); + } + + unsigned width = 1 + field->high - field->low; + + while (scope->state->line_column < num_align) + print(state, " "); + + switch (field->type) { + /* Basic types: */ + case TYPE_BRANCH: + if (scope->state->options->branch_labels) { + int offset = util_sign_extend(val, width) + scope->state->n; + if (offset < scope->state->num_instr) { + print(scope->state, "l%d", offset); + BITSET_SET(scope->state->branch_targets, offset); + break; + } + } + FALLTHROUGH; + case TYPE_INT: + print(scope->state, "%"PRId64, util_sign_extend(val, width)); + break; + case TYPE_UINT: + print(scope->state, "%"PRIu64, val); + break; + case TYPE_HEX: + // TODO format # of digits based on field width? + print(scope->state, "%"PRIx64, val); + break; + case TYPE_OFFSET: + if (val != 0) { + print(scope->state, "%+"PRId64, util_sign_extend(val, width)); + } + break; + case TYPE_UOFFSET: + if (val != 0) { + print(scope->state, "+%"PRIu64, val); + } + break; + case TYPE_FLOAT: + if (width == 16) { + print(scope->state, "%f", _mesa_half_to_float(val)); + } else { + assert(width == 32); + print(scope->state, "%f", uif(val)); + } + break; + case TYPE_BOOL: + if (field->display) { + if (val) { + print(scope->state, "%s", field->display); + } + } else { + print(scope->state, "%u", (unsigned)val); + } + break; + case TYPE_ENUM: + display_enum_field(scope, field, v); + break; + + case TYPE_ASSERT: + /* assert fields are not for display */ + assert(0); + break; + + /* For fields that are decoded with another bitset hierarchy: */ + case TYPE_BITSET: + display_bitset_field(scope, field, v); + break; + default: + decode_error(scope->state, "Bad field type: %d (%s)", + field->type, field->name); + } +} + +static void +display(struct decode_scope *scope) +{ + const struct isa_bitset *bitset = scope->bitset; + const char *display = find_display(scope, bitset); + + if (!display) { + decode_error(scope->state, "%s: no display template", bitset->name); + return; + } + + const char *p = display; + + while (*p != '\0') { + if (*p == '{') { + const char *e = ++p; + while (*e != '}') { + e++; + } + + char *field_name = strndup(p, e-p); + display_field(scope, field_name); + free(field_name); + + p = e; + } else { + fputc(*p, scope->state->out); + scope->state->line_column++; + } + p++; + } +} + +static void +decode(struct decode_state *state, void *bin, int sz) +{ + BITSET_WORD *instrs = bin; + unsigned errors = 0; /* number of consecutive unmatched instructions */ + + assert(sz % BITMASK_WORDS == 0); + + for (state->n = 0; state->n < state->num_instr; state->n++) { + bitmask_t instr = { 0 }; + + next_instruction(&instr, &instrs[state->n * BITMASK_WORDS]); + state->line_column = 0; + + if (state->options->max_errors && (errors > state->options->max_errors)) { + break; + } + + if (state->options->branch_labels && + BITSET_TEST(state->branch_targets, state->n)) { + if (state->options->instr_cb) { + state->options->instr_cb(state->options->cbdata, + state->n, instr.bitset); + } + print(state, "l%d:\n", state->n); + } + + if (state->options->instr_cb) { + state->options->instr_cb(state->options->cbdata, state->n, instr.bitset); + } + + const struct isa_bitset *b = find_bitset(state, __instruction, instr); + if (!b) { + print(state, "no match: %"BITSET_FORMAT"\n", BITSET_VALUE(instr.bitset)); + errors++; + continue; + } + + struct decode_scope *scope = push_scope(state, b, instr); + + 
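 /* one fresh scope per instruction: render via the display template,
  * flush any decode errors accumulated along the way, then pop */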
display(scope); + if (flush_errors(state)) { + errors++; + } else { + errors = 0; + } + print(state, "\n"); + + pop_scope(scope); + + if (state->options->stop) { + break; + } + } +} + +void +isa_decode(void *bin, int sz, FILE *out, const struct isa_decode_options *options) +{ + const struct isa_decode_options default_options = { + .gpu_id = options ? options->gpu_id : 0, + .branch_labels = options ? options->branch_labels : false + }; + struct decode_state *state; + + if (!options) + options = &default_options; + + util_cpu_detect(); /* needed for _mesa_half_to_float() */ + + state = rzalloc_size(NULL, sizeof(*state)); + state->options = options; + state->num_instr = sz / (BITMASK_WORDS * sizeof(BITSET_WORD)); + + if (state->options->branch_labels) { + state->branch_targets = rzalloc_size(state, + sizeof(BITSET_WORD) * BITSET_WORDS(state->num_instr)); + + /* Do a pre-pass to find all the branch targets: */ + state->out = fopen("/dev/null", "w"); + state->options = &default_options; /* skip hooks for prepass */ + decode(state, bin, sz); + fclose(state->out); + if (options) { + state->options = options; + } + } + + state->out = out; + + decode(state, bin, sz); + + ralloc_free(state); +} diff --git a/mesa 3D driver/src/compiler/isaspec/decode.h b/mesa 3D driver/src/compiler/isaspec/decode.h new file mode 100644 index 0000000000..ce8d1bfd8f --- /dev/null +++ b/mesa 3D driver/src/compiler/isaspec/decode.h @@ -0,0 +1,149 @@ +/* + * Copyright © 2020 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _DECODE_H_ +#define _DECODE_H_ + +#include +#include +#include + +/* + * Defines the tables which are generated from xml for disassembly + */ + +struct decode_scope; +struct isa_bitset; + +/** + * Table of enum values + */ +struct isa_enum { + unsigned num_values; + struct { + unsigned val; + const char *display; + } values[]; +}; + +/** + * An expression used for conditional overrides, derived fields, etc + */ +typedef uint64_t (*isa_expr_t)(struct decode_scope *scope); + +/** + * Used by generated expr functions + */ +uint64_t isa_decode_field(struct decode_scope *scope, const char *field_name); + +/** + * For bitset fields, there are some cases where we want to "remap" field + * names, essentially allowing one to parameterize a nested bitset when + * it resolves fields in an enclosing bitset.
*/ +struct isa_field_params { + unsigned num_params; + struct { + const char *name; + const char *as; + } params[]; +}; + +/** + * Description of a single field within a bitset case. + */ +struct isa_field { + const char *name; + isa_expr_t expr; /* for virtual "derived" fields */ + unsigned low; + unsigned high; + enum { + /* Basic types: */ + TYPE_BRANCH, /* branch target, like INT but with optional labeling */ + TYPE_INT, + TYPE_UINT, + TYPE_HEX, + TYPE_OFFSET, /* Like INT but formatted with +/- or omitted if ==0 */ + TYPE_UOFFSET, /* Like UINT but formatted with + or omitted if ==0 */ + TYPE_FLOAT, + TYPE_BOOL, + TYPE_ENUM, + + /* To assert a certain value in a given range of bits.. not + * used for pattern matching, but allows an override to specify + * that a certain bitpattern in some "unused" bits is expected + */ + TYPE_ASSERT, + + /* For fields that are decoded with another bitset hierarchy: */ + TYPE_BITSET, + } type; + union { + const struct isa_bitset **bitsets; /* if type==BITSET */ + bitmask_t val; /* if type==ASSERT */ + const struct isa_enum *enums; /* if type==ENUM */ + const char *display; /* if type==BOOL */ + }; + + /** + * type==BITSET fields can also optionally provide remapping for + * field names + */ + const struct isa_field_params *params; +}; + +/** + * A bitset consists of N "cases", with the last one (with case->expr==NULL) + * being the default. + * + * When resolving a field, display template string, etc, all the cases with + * an expression that evaluates to non-zero are considered, falling back to + * the last (default) case. + */ +struct isa_case { + isa_expr_t expr; + const char *display; + unsigned num_fields; + struct isa_field fields[]; +}; + +/** + * An individual bitset; the leaves of a bitset inheritance hierarchy will + * have the match and mask to match a single instruction (or arbitrary + * bit-pattern) against. + */ +struct isa_bitset { + const struct isa_bitset *parent; + const char *name; + struct { + unsigned min; + unsigned max; + } gen; + bitmask_t match; + bitmask_t dontcare; + bitmask_t mask; + unsigned num_cases; + const struct isa_case *cases[]; +}; + +#endif /* _DECODE_H_ */ diff --git a/mesa 3D driver/src/compiler/isaspec/decode.py b/mesa 3D driver/src/compiler/isaspec/decode.py new file mode 100644 index 0000000000..309b8035a5 --- /dev/null +++ b/mesa 3D driver/src/compiler/isaspec/decode.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python3 +# +# Copyright © 2020 Google, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +from mako.template import Template +from isa import ISA +import os +import sys + +template = """\ +/* Copyright (C) 2020 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "decode.h" + +/* + * enum tables, these don't have any link back to other tables so just + * dump them up front before the bitset tables + */ + +%for name, enum in isa.enums.items(): +static const struct isa_enum ${enum.get_c_name()} = { + .num_values = ${len(enum.values)}, + .values = { +% for val, display in enum.values.items(): + { .val = ${val}, .display = "${display}" }, +% endfor + }, +}; +%endfor + +/* + * generated expression functions, can be linked from bitset tables, so + * also dump them up front + */ + +%for name, expr in isa.expressions.items(): +static uint64_t +${expr.get_c_name()}(struct decode_scope *scope) +{ +% for fieldname in expr.fieldnames: + int64_t ${fieldname} = isa_decode_field(scope, "${fieldname}"); +% endfor + return ${expr.expr}; +} +%endfor + +/* + * Forward-declarations (so we don't have to figure out which order to + * emit various tables when they have pointers to each other) + */ + +%for name, bitset in isa.bitsets.items(): +static const struct isa_bitset bitset_${bitset.get_c_name()}; +%endfor + +%for root_name, root in isa.roots.items(): +const struct isa_bitset *${root.get_c_name()}[]; +%endfor + +/* + * bitset tables: + */ + +%for name, bitset in isa.bitsets.items(): +% for case in bitset.cases: +% for field_name, field in case.fields.items(): +% if field.get_c_typename() == 'TYPE_BITSET': +% if len(field.params) > 0: +static const struct isa_field_params ${case.get_c_name()}_${field.get_c_name()} = { + .num_params = ${len(field.params)}, + .params = { +% for param in field.params: + { .name= "${param[0]}", .as = "${param[1]}" }, +% endfor + + }, +}; +% endif +% endif +% endfor +static const struct isa_case ${case.get_c_name()} = { +% if case.expr is not None: + .expr = &${isa.expressions[case.expr].get_c_name()}, +% endif +% if case.display is not None: + .display = "${case.display}", +% endif + .num_fields = ${len(case.fields)}, + .fields = { +% for field_name, field in case.fields.items(): + { .name = "${field_name}", .low 
= ${field.low}, .high = ${field.high}, +% if field.expr is not None: + .expr = &${isa.expressions[field.expr].get_c_name()}, +% endif +% if field.display is not None: + .display = "${field.display}", +% endif + .type = ${field.get_c_typename()}, +% if field.get_c_typename() == 'TYPE_BITSET': + .bitsets = ${isa.roots[field.type].get_c_name()}, +% if len(field.params) > 0: + .params = &${case.get_c_name()}_${field.get_c_name()}, +% endif +% endif +% if field.get_c_typename() == 'TYPE_ENUM': + .enums = &${isa.enums[field.type].get_c_name()}, +% endif +% if field.get_c_typename() == 'TYPE_ASSERT': + .val.bitset = { ${', '.join(isa.split_bits(field.val))} }, +% endif + }, +% endfor + }, +}; +% endfor +static const struct isa_bitset bitset_${bitset.get_c_name()} = { +<% pattern = bitset.get_pattern() %> +% if bitset.extends is not None: + .parent = &bitset_${isa.bitsets[bitset.extends].get_c_name()}, +% endif + .name = "${name}", + .gen = { + .min = ${bitset.get_gen_min()}, + .max = ${bitset.get_gen_max()}, + }, + .match.bitset = { ${', '.join(isa.split_bits(pattern.match))} }, + .dontcare.bitset = { ${', '.join(isa.split_bits(pattern.dontcare))} }, + .mask.bitset = { ${', '.join(isa.split_bits(pattern.mask))} }, + .num_cases = ${len(bitset.cases)}, + .cases = { +% for case in bitset.cases: + &${case.get_c_name()}, +% endfor + }, +}; +%endfor + +/* + * bitset hierarchy root tables (where decoding starts from): + */ + +%for root_name, root in isa.roots.items(): +const struct isa_bitset *${root.get_c_name()}[] = { +% for leaf_name, leaf in isa.leafs.items(): +% if leaf.get_root() == root: + &bitset_${leaf.get_c_name()}, +% endif +% endfor + (void *)0 +}; +%endfor + +""" + +header = """\ +/* Copyright (C) 2020 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef _${guard}_ +#define _${guard}_ + +#include +#include + +#define BITMASK_WORDS BITSET_WORDS(${isa.bitsize}) + +typedef struct { + BITSET_WORD bitset[BITMASK_WORDS]; +} bitmask_t; + + +#define BITSET_FORMAT ${isa.format()} +#define BITSET_VALUE(v) ${isa.value()} + +static inline void +next_instruction(bitmask_t *instr, BITSET_WORD *start) +{ + %for i in range(0, int(isa.bitsize / 32)): + instr->bitset[${i}] = *(start + ${i}); + %endfor +} + +static inline uint64_t +bitmask_to_uint64_t(bitmask_t mask) +{ + return ((uint64_t)mask.bitset[1] << 32) | mask.bitset[0]; +} + +static inline bitmask_t +uint64_t_to_bitmask(uint64_t val) +{ + bitmask_t mask = { + .bitset[0] = val & 0xffffffff, + .bitset[1] = (val >> 32) & 0xffffffff, + }; + + return mask; +} + +#endif /* _${guard}_ */ + +""" + +glue = """\ +/* Copyright (C) 2020 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _${guard}_ +#define _${guard}_ + +#include "${isa}" + +#endif /* _${guard}_ */ + +""" + +xml = sys.argv[1] +glue_h = sys.argv[2] +dst_c = sys.argv[3] +dst_h = sys.argv[4] + +isa = ISA(xml) + +with open(glue_h, 'w') as f: + guard = os.path.basename(glue_h).upper().replace("-", "_").replace(".", "_") + f.write(Template(glue).render(guard=guard, isa=os.path.basename(dst_h))) + +with open(dst_c, 'w') as f: + f.write(Template(template).render(isa=isa)) + +with open(dst_h, 'w') as f: + guard = os.path.basename(dst_h).upper().replace("-", "_").replace(".", "_") + f.write(Template(header).render(isa=isa, guard=guard)) diff --git a/mesa 3D driver/src/compiler/isaspec/encode.py b/mesa 3D driver/src/compiler/isaspec/encode.py new file mode 100644 index 0000000000..bc19209e74 --- /dev/null +++ b/mesa 3D driver/src/compiler/isaspec/encode.py @@ -0,0 +1,652 @@ +#!/usr/bin/env python3 +# +# Copyright © 2020 Google, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+from mako.template import Template
+from isa import ISA, BitSetDerivedField, BitSetAssertField
+import sys
+import re
+
+# Encoding is driven by the display template that would be used
+# to decode any given instruction, essentially working backwards
+# from the decode case.  (Or put another way, the decoded bitset
+# should contain enough information to re-encode it again.)
+#
+# In the xml, we can have multiple override cases per bitset,
+# which can override display template and/or fields.  Iterating
+# all this from within the template is messy, so use helpers
+# outside of the template for this.
+#
+# The hierarchy of iterators for encoding is:
+#
+#   // First level - Case() (s.bitset_cases() iterator)
+#   if (caseA.expression()) {  // maps to <override> in xml
+#      // Second level - DisplayField() (case.display_fields() iterator)
+#      ... encode field A ...
+#      ... encode field B ...
+#
+#      // Third level - each display field can be potentially resolved
+#      // by multiple different overrides, you can end up with
+#      // an if/else ladder for an individual display field
+#      if (field_c_case1.expression()) {
+#         ... encode field C ...
+#      } else if (field_c_case2.expression()) {
+#         ... encode field C ...
+#      } else {
+#      }
+#
+#   } else if (caseB.expression()) {
+#   } else {  // maps to the default case in bitset, ie. outside any <override>
+#   }
+
+
+# Represents a concrete field, ie. a field can be overridden
+# by an <override>, so the exact choice of how to encode a given field
+# in a bitset may be conditional
+class FieldCase(object):
+    def __init__(self, field, case):
+        self.field = field
+        self.expr = None
+        if case.expr is not None:
+            self.expr = isa.expressions[case.expr]
+
+    def signed(self):
+        if self.field.type in ['int', 'offset', 'branch']:
+            return 'true'
+        return 'false'
+
+class AssertField(object):
+    def __init__(self, field, case):
+        self.field = field
+        self.expr = None
+        if case.expr is not None:
+            self.expr = isa.expressions[case.expr]
+
+    def signed(self):
+        return 'false'
+
+# Represents a field to be encoded:
+class DisplayField(object):
+    def __init__(self, bitset, case, name):
+        self.bitset = bitset   # leaf bitset
+        self.case = case
+        self.name = name
+
+    def fields(self, bitset=None):
+        if bitset is None:
+            bitset = self.bitset
+        # resolving the various cases for encoding a given
+        # field is similar to resolving the display template
+        # string
+        for case in bitset.cases:
+            if case.expr is not None:
+                expr = bitset.isa.expressions[case.expr]
+                self.case.append_expr_fields(expr)
+            if self.name in case.fields:
+                field = case.fields[self.name]
+                # For bitset fields, the bitset type could reference
+                # fields in this (the containing) bitset, in addition
+                # to the ones which are directly used to encode the
+                # field itself.
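+                # (For example, a hypothetical bitset-typed "#multisrc"
+                # field whose sub-encoding consumes SRC1_R/SRC2_R bits of
+                # the containing instruction would list those as params,
+                # pulling them into this case's fieldnames as well.)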
+ if field.get_c_typename() == 'TYPE_BITSET': + for param in field.params: + self.case.append_field(param[0]) + # For derived fields, we want to consider any other + # fields that are referenced by the expr + if isinstance(field, BitSetDerivedField): + expr = bitset.isa.expressions[field.expr] + self.case.append_expr_fields(expr) + elif not isinstance(field, BitSetAssertField): + yield FieldCase(field, case) + # if we've found an unconditional case specifying + # the named field, we are done + if case.expr is None: + return + if bitset.extends is not None: + yield from self.fields(isa.bitsets[bitset.extends]) + +# Represents an if/else case in bitset encoding which has a display +# template string: +class Case(object): + def __init__(self, bitset, case): + self.bitset = bitset # leaf bitset + self.case = case + self.expr = None + if case.expr is not None: + self.expr = isa.expressions[case.expr] + self.fieldnames = re.findall(r"{([a-zA-Z0-9_]+)}", case.display) + self.append_forced(bitset) + + # Handle fields which don't appear in display template but have + # force="true" + def append_forced(self, bitset): + if bitset.encode is not None: + for name, val in bitset.encode.forced.items(): + self.append_field(name) + if bitset.extends is not None: + self.append_forced(isa.bitsets[bitset.extends]) + + # In the process of resolving a field, we might discover additional + # fields that need resolving: + # + # a) a derived field which maps to one or more other concrete fields + # b) a bitset field, which may be "parameterized".. for example a + # #multisrc field which refers back to SRC1_R/SRC2_R outside of + # the range of bits covered by the #multisrc field itself + def append_field(self, fieldname): + if fieldname not in self.fieldnames: + self.fieldnames.append(fieldname) + + def append_expr_fields(self, expr): + for fieldname in expr.fieldnames: + self.append_field(fieldname) + + def display_fields(self): + for fieldname in self.fieldnames: + yield DisplayField(self.bitset, self, fieldname) + + def assert_cases(self, bitset=None): + if bitset is None: + bitset = self.bitset + for case in bitset.cases: + for name, field in case.fields.items(): + if field.get_c_typename() == 'TYPE_ASSERT': + yield AssertField(field, case) + if bitset.extends is not None: + yield from self.assert_cases(isa.bitsets[bitset.extends]) + +# State and helpers used by the template: +class State(object): + def __init__(self, isa): + self.isa = isa + self.warned_missing_extractors = [] + + def bitset_cases(self, bitset, leaf_bitset=None): + if leaf_bitset is None: + leaf_bitset = bitset; + for case in bitset.cases: + if case.display is None: + # if this is the last case (ie. 
case.expr is None) + # then we need to go up the inheritance chain: + if case.expr is None and bitset.extends is not None: + parent_bitset = isa.bitsets[bitset.extends] + yield from self.bitset_cases(parent_bitset, leaf_bitset) + continue; + yield Case(leaf_bitset, case) + + # Find unique bitset remap/parameter names, to generate a struct + # used to pass "parameters" to bitset fields: + def unique_param_names(self): + unique_names = [] + for root in self.encode_roots(): + for leaf in self.encode_leafs(root): + for case in s.bitset_cases(leaf): + for df in case.display_fields(): + for f in df.fields(): + if f.field.get_c_typename() == 'TYPE_BITSET': + for param in f.field.params: + target_name = param[1] + if target_name not in unique_names: + yield target_name + unique_names.append(target_name) + + def case_name(self, bitset, name): + return bitset.encode.case_prefix + name.upper().replace('.', '_').replace('-', '_').replace('#', '') + + def encode_roots(self): + for name, root in self.isa.roots.items(): + if root.encode is None: + continue + yield root + + def encode_leafs(self, root): + for name, leaf in self.isa.leafs.items(): + if leaf.get_root() != root: + continue + yield leaf + + # expressions used in a bitset (case or field or recursively parent bitsets) + def bitset_used_exprs(self, bitset): + for case in bitset.cases: + if case.expr: + yield self.isa.expressions[case.expr] + for name, field in case.fields.items(): + if isinstance(field, BitSetDerivedField): + yield self.isa.expressions[field.expr] + if bitset.extends is not None: + yield from self.bitset_used_exprs(self.isa.bitsets[bitset.extends]) + + def extractor_impl(self, bitset, name): + if bitset.encode is not None: + if name in bitset.encode.maps: + return bitset.encode.maps[name] + if bitset.extends is not None: + return self.extractor_impl(self.isa.bitsets[bitset.extends], name) + return None + + # Default fallback when no mapping is defined, simply to avoid + # having to deal with encoding at the same time as r/e new + # instruction decoding.. but we can at least print warnings: + def extractor_fallback(self, bitset, name): + extr_name = bitset.name + '.' + name + if extr_name not in self.warned_missing_extractors: + print('WARNING: no encode mapping for {}.{}'.format(bitset.name, name)) + self.warned_missing_extractors.append(extr_name) + return '0 /* XXX */' + + def extractor(self, bitset, name): + extr = self.extractor_impl(bitset, name) + if extr is not None: + return extr + return self.extractor_fallback(bitset, name) + + # In the special case of needing to access a field with bitset type + # for an expr, we need to encode the field so we end up with an + # integer, and not some pointer to a thing that will be encoded to + # an integer + def expr_extractor(self, bitset, name, p): + extr = self.extractor_impl(bitset, name) + field = self.resolve_simple_field(bitset, name) + if isinstance(field, BitSetDerivedField): + expr = self.isa.expressions[field.expr] + return self.expr_name(bitset.get_root(), expr) + '(s, p, src)' + if extr is None: + if name in self.unique_param_names(): + extr = 'p->' + name + else: + extr = self.extractor_fallback(bitset, name) + if field and field.get_c_typename() == 'TYPE_BITSET': + extr = 'encode' + isa.roots[field.type].get_c_name() + '(s, ' + p + ', ' + extr + ')' + return extr + + # A limited resolver for field type which doesn't properly account for + # overrides. In particular, if a field is defined differently in multiple + # different cases, this just blindly picks the last one. 
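+    # (For instance, if an <override> case and the default case both
+    # define a field, the definition picked here is simply the last one
+    # in the case list -- the default case -- regardless of which
+    # override expression would actually apply at runtime.)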
+    #
+    # TODO to do this properly, I don't think there is an alternative than
+    # to emit code which evaluates the case.expr
+    def resolve_simple_field(self, bitset, name):
+        field = None
+        for case in bitset.cases:
+            if name in case.fields:
+                field = case.fields[name]
+        if field is not None:
+            return field
+        if bitset.extends is not None:
+            return self.resolve_simple_field(isa.bitsets[bitset.extends], name)
+        return None
+
+    def encode_type(self, bitset):
+        if bitset.encode is not None:
+            if bitset.encode.type is not None:
+                return bitset.encode.type
+        if bitset.extends is not None:
+            return self.encode_type(isa.bitsets[bitset.extends])
+        return None
+
+    def expr_name(self, root, expr):
+        return root.get_c_name() + '_' + expr.get_c_name()
+
+template = """\
+/* Copyright (C) 2020 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <util/bitset.h>
+#include <util/log.h>
+
+<%
+isa = s.isa
+%>
+
+#define BITMASK_WORDS BITSET_WORDS(${isa.bitsize})
+
+typedef struct {
+    BITSET_WORD bitset[BITMASK_WORDS];
+} bitmask_t;
+
+static inline uint64_t
+bitmask_to_uint64_t(bitmask_t mask)
+{
+    return ((uint64_t)mask.bitset[1] << 32) | mask.bitset[0];
+}
+
+static inline bitmask_t
+uint64_t_to_bitmask(uint64_t val)
+{
+    bitmask_t mask = {
+        .bitset[0] = val & 0xffffffff,
+        .bitset[1] = (val >> 32) & 0xffffffff,
+    };
+
+    return mask;
+}
+
+static inline void
+store_instruction(BITSET_WORD *dst, bitmask_t instr)
+{
+% for i in range(0, int(isa.bitsize / 32)):
+    *(dst + ${i}) = instr.bitset[${i}];
+% endfor
+}
+
+/**
+ * Opaque type from the PoV of generated code, but allows state to be passed
+ * thru to the hand written helpers used by the generated code.
+ */
+struct encode_state;
+
+struct bitset_params;
+
+static bitmask_t
+pack_field(unsigned low, unsigned high, int64_t val, bool is_signed)
+{
+    bitmask_t field, mask;
+
+    if (is_signed) {
+        /* NOTE: Don't assume val is already sign-extended to 64b,
+         * just check that the bits above the valid range are either
+         * all zero or all one:
+         */
+        assert(!(( val & ~BITFIELD64_MASK(1 + high - low)) &&
+                 (~val & ~BITFIELD64_MASK(1 + high - low))));
+    } else {
+        assert(!(val & ~BITFIELD64_MASK(1 + high - low)));
+    }
+
+    BITSET_ZERO(field.bitset);
+
+    if (!val)
+        return field;
+
+    BITSET_ZERO(mask.bitset);
+    BITSET_SET_RANGE(mask.bitset, 0, high - low);
+
+    field = uint64_t_to_bitmask(val);
+    BITSET_AND(field.bitset, field.bitset, mask.bitset);
+    BITSET_SHL(field.bitset, low);
+
+    return field;
+}
+
+/*
+ * Forward-declarations (so we don't have to figure out which order to
+ * emit various encoders when they reference each other)
+ */
+
+%for root in s.encode_roots():
+static bitmask_t encode${root.get_c_name()}(struct encode_state *s, struct bitset_params *p, ${root.encode.type} src);
+%endfor
+
+## TODO before the expr evaluators, we should generate extract_FOO() for
+## derived fields.. which probably also need to be in the context of the
+## respective root so they take the correct src arg??
+
+/*
+ * Expression evaluators:
+ */
+
+struct bitset_params {
+%for name in s.unique_param_names():
+    int64_t ${name};
+%endfor
+};
+
+## TODO can we share this def between the two templates somehow?
+<%def name="encode_params(leaf, field)">
+    struct bitset_params bp = {
+%for param in field.params:
+        .${param[1]} = ${s.expr_extractor(leaf, param[0], 'p')},  /* ${param[0]} */
+%endfor
+    };
+</%def>
+
+<%def name="render_expr(leaf, expr)">
+static inline int64_t
+${s.expr_name(leaf.get_root(), expr)}(struct encode_state *s, struct bitset_params *p, ${leaf.get_root().encode.type} src)
+{
+% for fieldname in expr.fieldnames:
+    int64_t ${fieldname};
+% endfor
+% for fieldname in expr.fieldnames:
+<% field = s.resolve_simple_field(leaf, fieldname) %>
+% if field is not None and field.get_c_typename() == 'TYPE_BITSET':
+    { ${encode_params(leaf, field)}
+      const bitmask_t tmp = ${s.expr_extractor(leaf, fieldname, '&bp')};
+      ${fieldname} = bitmask_to_uint64_t(tmp);
+    }
+% else:
+    ${fieldname} = ${s.expr_extractor(leaf, fieldname, 'p')};
+% endif
+% endfor
+    return ${expr.expr};
+}
+</%def>
+
+## note, we can't just iterate all the expressions, but we need to find
+## the context in which they are used to know the correct src type
+
+%for root in s.encode_roots():
+<%
+    rendered_exprs = []
+%>
+% for leaf in s.encode_leafs(root):
+% for expr in s.bitset_used_exprs(leaf):
+<%
+    if expr in rendered_exprs:
+        continue
+    rendered_exprs.append(expr)
+%>
+    ${render_expr(leaf, expr)}
+% endfor
+% endfor
+%endfor
+
+
+/*
+ * The actual encoder definitions
+ */
+
+%for root in s.encode_roots():
+% for leaf in s.encode_leafs(root):
+<% snippet = encode_bitset.render(s=s, root=root, leaf=leaf) %>
+% if snippet not in root.snippets.keys():
+<% snippet_name = "snippet" + root.get_c_name() + "_" + str(len(root.snippets)) %>
+static bitmask_t
+${snippet_name}(struct encode_state *s, struct bitset_params *p, ${root.encode.type} src)
+{
+    bitmask_t val = uint64_t_to_bitmask(0);
+${snippet}
+    return val;
+}
+<% root.snippets[snippet] = snippet_name %>
+% endif
+% endfor
+
+static bitmask_t
+encode${root.get_c_name()}(struct encode_state *s, struct bitset_params *p, ${root.encode.type} src)
+{
+% if root.encode.case_prefix is not None:
+    switch (${root.get_c_name()}_case(s, src)) {
+% for leaf in s.encode_leafs(root):
+    case ${s.case_name(root, leaf.name)}: {
+<% snippet = encode_bitset.render(s=s, root=root, leaf=leaf) %>
+        bitmask_t val = uint64_t_to_bitmask(${hex(leaf.get_pattern().match)});
+        BITSET_OR(val.bitset, val.bitset, ${root.snippets[snippet]}(s, p, src).bitset);
+        return val;
+    }
+% endfor
+    default:
+        /* Note that we need the default case, because there are
+         * instructions which we never expect to be encoded (ie.
+         * meta/macro instructions) as they are removed/replaced
+         * in earlier stages of the compiler.
+         */
+        break;
+    }
+    mesa_loge("Unhandled ${root.name} encode case: 0x%x\\n", ${root.get_c_name()}_case(s, src));
+    return uint64_t_to_bitmask(0);
+% else: # single case bitset, no switch
+% for leaf in s.encode_leafs(root):
+<% snippet = encode_bitset.render(s=s, root=root, leaf=leaf) %>
+    bitmask_t val = uint64_t_to_bitmask(${hex(leaf.get_pattern().match)});
+    BITSET_OR(val.bitset, val.bitset, ${root.snippets[snippet]}(s, p, src).bitset);
+    return val;
+% endfor
+% endif
+}
+%endfor
+"""
+
+encode_bitset_template = """
+<%
+isa = s.isa
+%>
+
+<%def name="case_pre(root, expr)">
+%if expr is not None:
+    if (${s.expr_name(root, expr)}(s, p, src)) {
+%else:
+    {
+%endif
+</%def>
+
+<%def name="case_post(root, expr)">
+%if expr is not None:
+    } else
+%else:
+    }
+%endif
+</%def>
+
+<%def name="encode_params(leaf, field)">
+    struct bitset_params bp = {
+%for param in field.params:
+        .${param[1]} = ${s.expr_extractor(leaf, param[0], 'p')},  /* ${param[0]} */
+%endfor
+    };
+</%def>
+
+    uint64_t fld;
+
+    (void)fld;
+<% visited_exprs = [] %>
+%for case in s.bitset_cases(leaf):
+<%
+    if case.expr is not None:
+        visited_exprs.append(case.expr)
+
+    # Per-expression-case, track the display-field names that we have
+    # already emitted encoding for.  It is possible that an <override>
+    # case overrides a given field (for ex. 
#cat5-src3) + # and we don't want to emit encoding for both the override and + # the fallback + seen_fields = {} +%> + ${case_pre(root, case.expr)} +% for df in case.display_fields(): +% for f in df.fields(): +<% + # simplify the control flow a bit to give the compiler a bit + # less to clean up + expr = f.expr + if expr == case.expr: + # Don't need to evaluate the same condition twice: + expr = None + elif expr in visited_exprs: + # We are in an 'else'/'else-if' leg that we wouldn't + # go down due to passing an earlier if() + continue + + if not expr in seen_fields.keys(): + seen_fields[expr] = [] + + if f.field.name in seen_fields[expr]: + continue + seen_fields[expr].append(f.field.name) +%> + ${case_pre(root, expr)} +% if f.field.get_c_typename() == 'TYPE_BITSET': + { ${encode_params(leaf, f.field)} + bitmask_t tmp = encode${isa.roots[f.field.type].get_c_name()}(s, &bp, ${s.extractor(leaf, f.field.name)}); + fld = bitmask_to_uint64_t(tmp); + } +% else: + fld = ${s.extractor(leaf, f.field.name)}; +% endif + const bitmask_t packed = pack_field(${f.field.low}, ${f.field.high}, fld, ${f.signed()}); /* ${f.field.name} */ + BITSET_OR(val.bitset, val.bitset, packed.bitset); + ${case_post(root, expr)} +% endfor +% endfor + +% for f in case.assert_cases(): +<% + # simplify the control flow a bit to give the compiler a bit + # less to clean up + expr = f.expr + if expr == case.expr: + # Don't need to evaluate the same condition twice: + expr = None + elif expr in visited_exprs: + # We are in an 'else'/'else-if' leg that we wouldn't + # go down due to passing an earlier if() + continue +%> + ${case_pre(root, expr)} + const bitmask_t packed = pack_field(${f.field.low}, ${f.field.high}, ${f.field.val}, ${f.signed()}); + BITSET_OR(val.bitset, val.bitset, packed.bitset); + ${case_post(root, None)} +% endfor + {} /* in case no unconditional field to close out last '} else' */ + ${case_post(root, case.expr)} +%endfor + return val; +""" + +xml = sys.argv[1] +dst = sys.argv[2] + +isa = ISA(xml) +s = State(isa) + +with open(dst, 'w') as f: + f.write(Template(template).render(s=s, encode_bitset=Template(encode_bitset_template))) diff --git a/mesa 3D driver/src/compiler/isaspec/isa.py b/mesa 3D driver/src/compiler/isaspec/isa.py new file mode 100644 index 0000000000..62257d971b --- /dev/null +++ b/mesa 3D driver/src/compiler/isaspec/isa.py @@ -0,0 +1,553 @@ +# +# Copyright © 2020 Google, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
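For orientation before the parser: a pattern element's text is a string of '0', '1' and 'x' (don't-care) characters over a declared bit range, which decomposes into the match/dontcare/mask triple used for instruction matching. A minimal, dependency-free sketch of that derivation (the function name and example values here are illustrative only; the real logic is extract_pattern() below):

    def pattern_to_masks(patstr, low):
        # MSB-first walk: '1' bits go to match, 'x' bits to dontcare;
        # mask simply covers the whole declared bit range.
        match = dontcare = 0
        for ch in patstr:
            match <<= 1
            dontcare <<= 1
            if ch == '1':
                match |= 1
            elif ch == 'x':
                dontcare |= 1
            else:
                assert ch == '0'
        mask = ((1 << len(patstr)) - 1) << low
        return match << low, dontcare << low, mask

    # '1x0' over bits [4..6]: match 0x40, dontcare 0x20, mask 0x70, so
    # (match & dontcare) == 0 and (match | dontcare) stays within mask.
    assert pattern_to_masks('1x0', 4) == (0x40, 0x20, 0x70)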
+
+from xml.etree import ElementTree
+import os
+import re
+
+def dbg(str):
+    if False:
+        print(str)
+
+class BitSetPattern(object):
+    """Class that encapsulates the pattern matching, ie.
+       the match/dontcare/mask bitmasks.  The following
+       rules should hold:
+
+          (match & dontcare) == 0
+          (match | dontcare) is a subset of mask
+
+       For a leaf node, the mask should be (1 << size) - 1
+       (ie. all bits set)
+    """
+    def __init__(self, bitset):
+        self.match      = bitset.match
+        self.dontcare   = bitset.dontcare
+        self.mask       = bitset.mask
+        self.field_mask = bitset.field_mask
+
+    def merge(self, pattern):
+        p = BitSetPattern(pattern)
+        p.match      = p.match      | self.match
+        p.dontcare   = p.dontcare   | self.dontcare
+        p.mask       = p.mask       | self.mask
+        p.field_mask = p.field_mask | self.field_mask
+        return p
+
+    def defined_bits(self):
+        return self.match | self.dontcare | self.mask | self.field_mask
+
+def get_bitrange(field):
+    if 'pos' in field.attrib:
+        assert('low' not in field.attrib)
+        assert('high' not in field.attrib)
+        low = int(field.attrib['pos'])
+        high = low
+    else:
+        low = int(field.attrib['low'])
+        high = int(field.attrib['high'])
+    assert low <= high
+    return low, high
+
+def extract_pattern(xml, name, is_defined_bits=None):
+    low, high = get_bitrange(xml)
+    mask = ((1 << (1 + high - low)) - 1) << low
+
+    patstr = xml.text.strip()
+
+    assert (len(patstr) == (1 + high - low)), "Invalid {} length in {}: {}..{}".format(xml.tag, name, low, high)
+    if is_defined_bits is not None:
+        assert not is_defined_bits(mask), "Redefined bits in {} {}: {}..{}".format(xml.tag, name, low, high)
+
+    match = 0
+    dontcare = 0
+
+    for n in range(0, len(patstr)):
+        match = match << 1
+        dontcare = dontcare << 1
+        if patstr[n] == '1':
+            match |= 1
+        elif patstr[n] == 'x':
+            dontcare |= 1
+        elif patstr[n] != '0':
+            assert 0, "Invalid {} character in {}: {}".format(xml.tag, name, patstr[n])
+
+    dbg("{}: {}.{} => {:016x} / {:016x} / {:016x}".format(xml.tag, name, patstr, match << low, dontcare << low, mask))
+
+    return match << low, dontcare << low, mask
+
+def get_c_name(name):
+    return name.lower().replace('#', '__').replace('-', '_').replace('.', '_')
+
+class BitSetField(object):
+    """Class that encapsulates a field defined in a bitset
+    """
+    def __init__(self, isa, xml):
+        self.isa = isa
+        self.low, self.high = get_bitrange(xml)
+        self.name = xml.attrib['name']
+        self.type = xml.attrib['type']
+        self.params = []
+        for param in xml.findall('param'):
+            aas = name = param.attrib['name']
+            if 'as' in param.attrib:
+                aas = param.attrib['as']
+            self.params.append([name, aas])
+        self.expr = None
+        self.display = None
+        if 'display' in xml.attrib:
+            self.display = xml.attrib['display'].strip()
+
+    def get_c_name(self):
+        return get_c_name(self.name)
+
+    def get_c_typename(self):
+        if self.type in self.isa.enums:
+            return 'TYPE_ENUM'
+        if self.type in self.isa.bitsets:
+            return 'TYPE_BITSET'
+        return 'TYPE_' + self.type.upper()
+
+    def mask(self):
+        return ((1 << self.get_size()) - 1) << self.low
+
+    def get_size(self):
+        return 1 + self.high - self.low
+
+class BitSetAssertField(BitSetField):
+    """Similar to BitSetField, but for <assert>s, which can be
+       used to specify that a certain bitpattern is expected in
+       place of (for example) unused bitfields
+    """
+    def __init__(self, case, xml):
+        self.isa = case.bitset.isa
+        self.low, self.high = get_bitrange(xml)
+        self.name = case.bitset.name + '#assert' + str(len(case.fields))
+        self.type = 'uint'
+        self.expr = None
+        self.display = None
+
+        match, dontcare, mask = extract_pattern(xml, 
case.bitset.name) + self.val = match >> self.low + + assert dontcare == 0, "'x' (dontcare) is not valid in an assert" + + def get_c_typename(self): + return 'TYPE_ASSERT' + +class BitSetDerivedField(BitSetField): + """Similar to BitSetField, but for derived fields + """ + def __init__(self, isa, xml): + self.isa = isa + self.low = 0 + self.high = 0 + # NOTE: a width should be provided for 'int' derived fields, ie. + # where sign extension is needed. We just repurpose the 'high' + # field for that to make '1 + high - low' work out + if 'width' in xml.attrib: + self.high = int(xml.attrib['width']) - 1 + self.name = xml.attrib['name'] + self.type = xml.attrib['type'] + if 'expr' in xml.attrib: + self.expr = xml.attrib['expr'] + else: + e = isa.parse_one_expression(xml, self.name) + self.expr = e.name + self.display = None + if 'display' in xml.attrib: + self.display = xml.attrib['display'].strip() + +class BitSetCase(object): + """Class that encapsulates a single bitset case + """ + def __init__(self, bitset, xml, update_field_mask, expr=None): + self.bitset = bitset + if expr is not None: + self.name = bitset.name + '#case' + str(len(bitset.cases)) + else: + self.name = bitset.name + "#default" + self.expr = expr + self.fields = {} + + for derived in xml.findall('derived'): + f = BitSetDerivedField(bitset.isa, derived) + self.fields[f.name] = f + + for assrt in xml.findall('assert'): + f = BitSetAssertField(self, assrt) + update_field_mask(self, f) + self.fields[f.name] = f + + for field in xml.findall('field'): + dbg("{}.{}".format(self.name, field.attrib['name'])) + f = BitSetField(bitset.isa, field) + update_field_mask(self, f) + self.fields[f.name] = f + + self.display = None + for d in xml.findall('display'): + # Allow for empty display string: + if d.text is not None: + self.display = d.text.strip() + else: + self.display = '' + dbg("found display: '{}'".format(self.display)) + + def get_c_name(self): + return get_c_name(self.name) + +class BitSetEncode(object): + """Additional data that may be associated with a root bitset node + to provide additional information needed to generate helpers + to encode the bitset, such as source data type and "opcode" + case prefix (ie. 
how to choose/enumerate which leaf node bitset + to use to encode the source data + """ + def __init__(self, xml): + self.type = None + if 'type' in xml.attrib: + self.type = xml.attrib['type'] + self.case_prefix = None + if 'case-prefix' in xml.attrib: + self.case_prefix = xml.attrib['case-prefix'] + # The encode element may also contain mappings from encode src + # to individual field names: + self.maps = {} + self.forced = {} + for map in xml.findall('map'): + name = map.attrib['name'] + self.maps[name] = map.text.strip() + if 'force' in map.attrib and map.attrib['force'] == 'true': + self.forced[name] = 'true' + +class BitSet(object): + """Class that encapsulates a single bitset rule + """ + def __init__(self, isa, xml): + self.isa = isa + self.xml = xml + self.name = xml.attrib['name'] + + # Used for generated encoder, to de-duplicate encoding for + # similar instructions: + self.snippets = {} + + if 'size' in xml.attrib: + assert('extends' not in xml.attrib) + self.size = int(xml.attrib['size']) + self.extends = None + else: + self.size = None + self.extends = xml.attrib['extends'] + + self.encode = None + if xml.find('encode') is not None: + self.encode = BitSetEncode(xml.find('encode')) + + self.gen_min = 0 + self.gen_max = 1 << 32 - 1 + + for gen in xml.findall('gen'): + if 'min' in gen.attrib: + self.gen_min = int(gen.attrib['min']) + if 'max' in gen.attrib: + self.gen_max = int(gen.attrib['max']) + + # Collect up the match/dontcare/mask bitmasks for + # this bitset case: + self.match = 0 + self.dontcare = 0 + self.mask = 0 + self.field_mask = 0 + + self.cases = [] + + # Helper to check for redefined bits: + def is_defined_bits(m): + return ((self.field_mask | self.mask | self.dontcare | self.match) & m) != 0 + + def update_default_bitmask_field(bs, field): + m = field.mask() + dbg("field: {}.{} => {:016x}".format(self.name, field.name, m)) + # For default case, we don't expect any bits to be doubly defined: + assert not is_defined_bits(m), "Redefined bits in field {}.{}: {}..{}".format( + self.name, field.name, field.low, field.high); + self.field_mask |= m + + def update_override_bitmask_field(bs, field): + m = field.mask() + dbg("field: {}.{} => {:016x}".format(self.name, field.name, m)) + assert self.field_mask ^ ~m + + dflt = BitSetCase(self, xml, update_default_bitmask_field) + + for override in xml.findall('override'): + if 'expr' in override.attrib: + expr = override.attrib['expr'] + else: + e = isa.parse_one_expression(override, self.name) + expr = e.name + c = BitSetCase(self, override, update_override_bitmask_field, expr) + self.cases.append(c) + + # Default case is expected to be the last one: + self.cases.append(dflt) + + for pattern in xml.findall('pattern'): + match, dontcare, mask = extract_pattern(pattern, self.name, is_defined_bits) + + self.match |= match + self.dontcare |= dontcare + self.mask |= mask + + def get_pattern(self): + if self.extends is not None: + parent = self.isa.bitsets[self.extends] + ppat = parent.get_pattern() + pat = BitSetPattern(self) + + assert ((ppat.defined_bits() & pat.defined_bits()) == 0), "bitset conflict in {}: {:x}".format(self.name, (ppat.defined_bits() & pat.defined_bits())) + + return pat.merge(ppat) + + return BitSetPattern(self) + + def get_size(self): + if self.extends is not None: + parent = self.isa.bitsets[self.extends] + return parent.get_size() + return self.size + + def get_gen_min(self): + if self.extends is not None: + parent = self.isa.bitsets[self.extends] + + assert (self.gen_min == 0) or (self.gen_min >= 
parent.get_gen_min()), "bitset {} should not have min gen lower than the parent's one".format(self.name) + + return max(self.gen_min, parent.get_gen_min()) + return self.gen_min + + def get_gen_max(self): + if self.extends is not None: + parent = self.isa.bitsets[self.extends] + + assert (self.gen_max == (1 << 32 - 1)) or (self.gen_max <= parent.get_gen_max()), "bitset {} should not have max gen higher than the parent's one".format(self.name) + + return min(self.gen_max, parent.get_gen_max()) + return self.gen_max + + def get_c_name(self): + return get_c_name(self.name) + + def get_root(self): + if self.extends is not None: + return self.isa.bitsets[self.extends].get_root() + return self + +class BitSetEnum(object): + """Class that encapsulates an enum declaration + """ + def __init__(self, isa, xml): + self.isa = isa + self.name = xml.attrib['name'] + # Table mapping value to name + # TODO currently just mapping to 'display' name, but if we + # need more attributes then maybe need BitSetEnumValue? + self.values = {} + for value in xml.findall('value'): + self.values[value.attrib['val']] = value.attrib['display'] + + def get_c_name(self): + return 'enum_' + get_c_name(self.name) + +class BitSetExpression(object): + """Class that encapsulates an declaration + """ + def __init__(self, isa, xml): + self.isa = isa + if 'name' in xml.attrib: + self.name = xml.attrib['name'] + else: + self.name = 'anon_' + str(isa.anon_expression_count) + isa.anon_expression_count = isa.anon_expression_count + 1 + expr = xml.text.strip() + self.fieldnames = list(set(re.findall(r"{([a-zA-Z0-9_]+)}", expr))) + self.expr = re.sub(r"{([a-zA-Z0-9_]+)}", r"\1", expr) + dbg("'{}' -> '{}'".format(expr, self.expr)) + + def get_c_name(self): + return 'expr_' + get_c_name(self.name) + +class ISA(object): + """Class that encapsulates all the parsed bitset rules + """ + def __init__(self, xmlpath): + self.base_path = os.path.dirname(xmlpath) + + # Counter used to name inline (anonymous) expressions: + self.anon_expression_count = 0 + + # Table of (globally defined) expressions: + self.expressions = {} + + # Table of enums: + self.enums = {} + + # Table of toplevel bitset hierarchies: + self.roots = {} + + # Table of leaf nodes of bitset hierarchies: + self.leafs = {} + + # Table of all bitsets: + self.bitsets = {} + + # Max needed bitsize for one instruction + self.bitsize = 0 + + root = ElementTree.parse(xmlpath).getroot() + self.parse_file(root) + self.validate_isa() + + def parse_expressions(self, root): + e = None + for expr in root.findall('expr'): + e = BitSetExpression(self, expr) + self.expressions[e.name] = e + return e + + def parse_one_expression(self, root, name): + assert len(root.findall('expr')) == 1, "expected a single expression in: {}".format(name) + return self.parse_expressions(root) + + def parse_file(self, root): + # Handle imports up-front: + for imprt in root.findall('import'): + p = os.path.join(self.base_path, imprt.attrib['file']) + self.parse_file(ElementTree.parse(p)) + + # Extract expressions: + self.parse_expressions(root) + + # Extract enums: + for enum in root.findall('enum'): + e = BitSetEnum(self, enum) + self.enums[e.name] = e + + # Extract bitsets: + for bitset in root.findall('bitset'): + b = BitSet(self, bitset) + if b.size is not None: + dbg("toplevel: " + b.name) + self.roots[b.name] = b + self.bitsize = max(self.bitsize, b.size) + else: + dbg("derived: " + b.name) + self.bitsets[b.name] = b + self.leafs[b.name] = b + + # Remove non-leaf nodes from the leafs table: + for name, bitset 
in self.bitsets.items(): + if bitset.extends is not None: + if bitset.extends in self.leafs: + del self.leafs[bitset.extends] + + def validate_isa(self): + # Validate that all bitset fields have valid types, and in + # the case of bitset type, the sizes match: + builtin_types = ['branch', 'int', 'uint', 'hex', 'offset', 'uoffset', 'float', 'bool', 'enum'] + for bitset_name, bitset in self.bitsets.items(): + if bitset.extends is not None: + assert bitset.extends in self.bitsets, "{} extends invalid type: {}".format( + bitset_name, bitset.extends) + for case in bitset.cases: + for field_name, field in case.fields.items(): + if field.type == 'float': + assert field.get_size() == 32 or field.get_size() == 16 + + if not isinstance(field, BitSetDerivedField): + assert field.high < bitset.get_size(), \ + "{}.{}: invalid bit range: [{}, {}] is not in [{}, {}]".format( + bitset_name, field_name, field.low, field.high, 0, bitset.get_size() - 1) + + if field.type in builtin_types: + continue + if field.type in self.enums: + continue + assert field.type in self.bitsets, "{}.{}: invalid type: {}".format( + bitset_name, field_name, field.type) + bs = self.bitsets[field.type] + assert field.get_size() == bs.get_size(), "{}.{}: invalid size: {} vs {}".format( + bitset_name, field_name, field.get_size(), bs.get_size()) + + # Validate that all the leaf node bitsets have no remaining + # undefined bits + for name, bitset in self.leafs.items(): + pat = bitset.get_pattern() + sz = bitset.get_size() + assert ((pat.mask | pat.field_mask) == (1 << sz) - 1), "leaf bitset {} has undefined bits: {:x}".format( + bitset.name, ~(pat.mask | pat.field_mask) & ((1 << sz) - 1)) + + # TODO somehow validating that only one bitset in a hierarchy + # matches any given bit pattern would be useful. + + # TODO we should probably be able to look at the contexts where + # an expression is evaluated and verify that it doesn't have any + # {VARNAME} references that would be unresolved at evaluation time + + def format(self): + ''' Generate format string used by printf(..) and friends ''' + parts = [] + words = self.bitsize / 32 + + for i in range(int(words)): + parts.append('%08x') + + fmt = ''.join(parts) + + return f"\"{fmt[1:]}\"" + + def value(self): + ''' Generate format values used by printf(..) 
and friends ''' + parts = [] + words = self.bitsize / 32 + + for i in range(int(words) - 1, -1, -1): + parts.append('v[' + str(i) + ']') + + return ', '.join(parts) + + def split_bits(self, value): + ''' Split `value` into a list of 32-bit integers ''' + mask, parts = (1 << 32) - 1, [] + words = self.bitsize / 32 + + while value: + parts.append(hex(value & mask)) + value >>= 32 + + # Add 'missing' words + while len(parts) < words: + parts.append('0x0') + + return parts diff --git a/mesa 3D driver/src/compiler/isaspec/meson.build b/mesa 3D driver/src/compiler/isaspec/meson.build new file mode 100644 index 0000000000..9c8dd0dc4f --- /dev/null +++ b/mesa 3D driver/src/compiler/isaspec/meson.build @@ -0,0 +1,24 @@ +# Copyright © 2020 Google, Inc + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
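The meson glue below only locates the two generator scripts; a driver's own build rules are then expected to run them with an ISA description and output paths. For reference, a manual invocation equivalent to such a rule, mirroring the sys.argv handling at the end of decode.py above (all file names here are illustrative placeholders, not paths this patch creates):

    import subprocess
    import sys

    subprocess.run(
        [sys.executable, 'decode.py',
         'example-isa.xml',    # xml:    instruction description to parse
         'example-isa.h',      # glue_h: wrapper header that includes dst_h
         'example-decode.c',   # dst_c:  generated bitset decode tables
         'example-decode.h'],  # dst_h:  generated declarations and helpers
        check=True)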
+ +prog_isaspec_decode = find_program('decode.py') +idep_isaspec_decode = declare_dependency(sources : files('decode.c'), include_directories : include_directories('.')) + +prog_isaspec_encode = find_program('encode.py') diff --git a/mesa 3D driver/src/compiler/meson.build b/mesa 3D driver/src/compiler/meson.build index a74a630f0f..ce6c6c06c4 100644 --- a/mesa 3D driver/src/compiler/meson.build +++ b/mesa 3D driver/src/compiler/meson.build @@ -71,30 +71,26 @@ spirv2nir = executable( if with_tests test( - 'avail_vis', + 'spirv_tests', executable( - 'avail_vis', - files('spirv/tests/avail_vis.cpp'), - c_args : [c_msvc_compat_args, no_override_init_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'spirv'], - ) - - test( - 'volatile', - executable( - 'volatile', - files('spirv/tests/volatile.cpp'), + 'spirv_tests', + files( + 'spirv/tests/helpers.h', + 'spirv/tests/avail_vis.cpp', + 'spirv/tests/volatile.cpp', + ), c_args : [c_msvc_compat_args, no_override_init_args], gnu_symbol_visibility : 'hidden', include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], ), suite : ['compiler', 'spirv'], + protocol : gtest_test_protocol, ) endif +if with_clc + subdir('clc') +endif subdir('glsl') +subdir('isaspec') diff --git a/mesa 3D driver/src/compiler/nir/meson.build b/mesa 3D driver/src/compiler/nir/meson.build index 339b37200e..26dfc2de51 100644 --- a/mesa 3D driver/src/compiler/nir/meson.build +++ b/mesa 3D driver/src/compiler/nir/meson.build @@ -207,6 +207,7 @@ files_libnir = files( 'nir_lower_bit_size.c', 'nir_lower_ubo_vec4.c', 'nir_lower_uniforms_to_ubo.c', + 'nir_lower_sysvals_to_varyings.c', 'nir_metadata.c', 'nir_move_vec_src_uses_to_dest.c', 'nir_normalize_cubemap_coords.c', @@ -369,55 +370,29 @@ nir_algebraic_py = files('nir_algebraic.py') if with_tests test( - 'nir_builder', + 'nir_tests', executable( - 'nir_builder_test', - files('tests/builder_tests.cpp'), - cpp_args : [cpp_msvc_compat_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'nir_control_flow', - executable( - 'nir_control_flow_test', - files('tests/control_flow_tests.cpp'), - cpp_args : [cpp_msvc_compat_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'nir_core', - executable( - 'nir_core_test', - files('tests/core_tests.cpp'), - cpp_args : [cpp_msvc_compat_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'nir_vars', - executable( - 'nir_vars_test', - files('tests/vars_tests.cpp'), + 'nir_tests', + files( + 'tests/algebraic_tests.cpp', + 'tests/builder_tests.cpp', + 'tests/comparison_pre_tests.cpp', + 'tests/control_flow_tests.cpp', + 'tests/core_tests.cpp', + 'tests/lower_returns_tests.cpp', + 
'tests/negative_equal_tests.cpp', + 'tests/opt_if_tests.cpp', + 'tests/serialize_tests.cpp', + 'tests/ssa_def_bits_used_tests.cpp', + 'tests/vars_tests.cpp', + ), cpp_args : [cpp_msvc_compat_args], gnu_symbol_visibility : 'hidden', include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], ), suite : ['compiler', 'nir'], + protocol : gtest_test_protocol, ) test( @@ -429,32 +404,6 @@ if with_tests suite : ['compiler', 'nir'], ) - test( - 'negative_equal', - executable( - 'negative_equal', - files('tests/negative_equal_tests.cpp'), - c_args : [c_msvc_compat_args, no_override_init_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'comparison_pre', - executable( - 'comparison_pre', - files('tests/comparison_pre_tests.cpp'), - c_args : [c_msvc_compat_args, no_override_init_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - test( 'load_store_vectorizer', executable( @@ -466,71 +415,8 @@ if with_tests dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], ), suite : ['compiler', 'nir'], + # TODO: Use a negative filter for gtest instead of the expect failure here. should_fail : meson.get_cross_property('xfail', '').contains('load_store_vectorizer'), - ) - - test( - 'nir_serialize_test', - executable( - 'nir_serialize_test', - files('tests/serialize_tests.cpp'), - cpp_args : [cpp_msvc_compat_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'nir_opt_if', - executable( - 'nir_opt_if_tests', - files('tests/opt_if_tests.cpp'), - cpp_args : [cpp_msvc_compat_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'nir_lower_returns', - executable( - 'nir_lower_returns_tests', - files('tests/lower_returns_tests.cpp'), - cpp_args : [cpp_msvc_compat_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'ssa_def_bits_used', - executable( - 'ssa_def_bits_used', - files('tests/ssa_def_bits_used_tests.cpp'), - c_args : [c_msvc_compat_args, no_override_init_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], - ) - - test( - 'algebraic', - executable( - 'algebraic', - files('tests/algebraic_tests.cpp'), - c_args : [c_msvc_compat_args, no_override_init_args], - gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], - 
dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil], - ), - suite : ['compiler', 'nir'], + protocol : gtest_test_protocol, ) endif diff --git a/mesa 3D driver/src/compiler/nir/nir.c b/mesa 3D driver/src/compiler/nir/nir.c index 7b1e7a5589..d5e85dfc99 100644 --- a/mesa 3D driver/src/compiler/nir/nir.c +++ b/mesa 3D driver/src/compiler/nir/nir.c @@ -100,6 +100,17 @@ nir_component_mask_reinterpret(nir_component_mask_t mask, return new_mask; } +static void +nir_shader_destructor(void *ptr) +{ + nir_shader *shader = ptr; + + /* Free all instrs from the shader, since they're not ralloced. */ + list_for_each_entry_safe(nir_instr, instr, &shader->gc_list, gc_node) { + nir_instr_free(instr); + } +} + nir_shader * nir_shader_create(void *mem_ctx, gl_shader_stage stage, @@ -107,6 +118,7 @@ nir_shader_create(void *mem_ctx, shader_info *si) { nir_shader *shader = rzalloc(mem_ctx, nir_shader); + ralloc_set_destructor(shader, nir_shader_destructor); exec_list_make_empty(&shader->variables); @@ -121,6 +133,8 @@ nir_shader_create(void *mem_ctx, exec_list_make_empty(&shader->functions); + list_inithead(&shader->gc_list); + shader->num_inputs = 0; shader->num_outputs = 0; shader->num_uniforms = 0; @@ -175,6 +189,7 @@ nir_shader_add_variable(nir_shader *shader, nir_variable *var) case nir_var_uniform: case nir_var_mem_ubo: case nir_var_mem_ssbo: + case nir_var_image: case nir_var_mem_shared: case nir_var_system_value: case nir_var_mem_push_const: @@ -325,11 +340,36 @@ nir_function_create(nir_shader *shader, const char *name) return func; } +static bool src_has_indirect(nir_src *src) +{ + return !src->is_ssa && src->reg.indirect; +} + +static void src_free_indirects(nir_src *src) +{ + if (src_has_indirect(src)) { + assert(src->reg.indirect->is_ssa || !src->reg.indirect->reg.indirect); + free(src->reg.indirect); + src->reg.indirect = NULL; + } +} + +static void dest_free_indirects(nir_dest *dest) +{ + if (!dest->is_ssa && dest->reg.indirect) { + assert(dest->reg.indirect->is_ssa || !dest->reg.indirect->reg.indirect); + free(dest->reg.indirect); + dest->reg.indirect = NULL; + } +} + /* NOTE: if the instruction you are copying a src to is already added * to the IR, use nir_instr_rewrite_src() instead. */ -void nir_src_copy(nir_src *dest, const nir_src *src, void *mem_ctx) +void nir_src_copy(nir_src *dest, const nir_src *src) { + src_free_indirects(dest); + dest->is_ssa = src->is_ssa; if (src->is_ssa) { dest->ssa = src->ssa; @@ -337,36 +377,37 @@ void nir_src_copy(nir_src *dest, const nir_src *src, void *mem_ctx) dest->reg.base_offset = src->reg.base_offset; dest->reg.reg = src->reg.reg; if (src->reg.indirect) { - dest->reg.indirect = ralloc(mem_ctx, nir_src); - nir_src_copy(dest->reg.indirect, src->reg.indirect, mem_ctx); + dest->reg.indirect = calloc(1, sizeof(nir_src)); + nir_src_copy(dest->reg.indirect, src->reg.indirect); } else { dest->reg.indirect = NULL; } } } -void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr) +void nir_dest_copy(nir_dest *dest, const nir_dest *src) { /* Copying an SSA definition makes no sense whatsoever. 
*/ assert(!src->is_ssa); + dest_free_indirects(dest); + dest->is_ssa = false; dest->reg.base_offset = src->reg.base_offset; dest->reg.reg = src->reg.reg; if (src->reg.indirect) { - dest->reg.indirect = ralloc(instr, nir_src); - nir_src_copy(dest->reg.indirect, src->reg.indirect, instr); + dest->reg.indirect = calloc(1, sizeof(nir_src)); + nir_src_copy(dest->reg.indirect, src->reg.indirect); } else { dest->reg.indirect = NULL; } } void -nir_alu_src_copy(nir_alu_src *dest, const nir_alu_src *src, - nir_alu_instr *instr) +nir_alu_src_copy(nir_alu_src *dest, const nir_alu_src *src) { - nir_src_copy(&dest->src, &src->src, &instr->instr); + nir_src_copy(&dest->src, &src->src); dest->abs = src->abs; dest->negate = src->negate; for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) @@ -374,10 +415,9 @@ nir_alu_src_copy(nir_alu_src *dest, const nir_alu_src *src, } void -nir_alu_dest_copy(nir_alu_dest *dest, const nir_alu_dest *src, - nir_alu_instr *instr) +nir_alu_dest_copy(nir_alu_dest *dest, const nir_alu_dest *src) { - nir_dest_copy(&dest->dest, &src->dest, &instr->instr); + nir_dest_copy(&dest->dest, &src->dest); dest->write_mask = src->write_mask; dest->saturate = src->saturate; } @@ -565,10 +605,8 @@ nir_alu_instr * nir_alu_instr_create(nir_shader *shader, nir_op op) { unsigned num_srcs = nir_op_infos[op].num_inputs; - /* TODO: don't use rzalloc */ - nir_alu_instr *instr = - rzalloc_size(shader, - sizeof(nir_alu_instr) + num_srcs * sizeof(nir_alu_src)); + /* TODO: don't use calloc */ + nir_alu_instr *instr = calloc(1, sizeof(nir_alu_instr) + num_srcs * sizeof(nir_alu_src)); instr_init(&instr->instr, nir_instr_type_alu); instr->op = op; @@ -576,14 +614,15 @@ nir_alu_instr_create(nir_shader *shader, nir_op op) for (unsigned i = 0; i < num_srcs; i++) alu_src_init(&instr->src[i]); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } nir_deref_instr * nir_deref_instr_create(nir_shader *shader, nir_deref_type deref_type) { - nir_deref_instr *instr = - rzalloc_size(shader, sizeof(nir_deref_instr)); + nir_deref_instr *instr = calloc(1, sizeof(*instr)); instr_init(&instr->instr, nir_instr_type_deref); @@ -597,18 +636,23 @@ nir_deref_instr_create(nir_shader *shader, nir_deref_type deref_type) dest_init(&instr->dest); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } nir_jump_instr * nir_jump_instr_create(nir_shader *shader, nir_jump_type type) { - nir_jump_instr *instr = ralloc(shader, nir_jump_instr); + nir_jump_instr *instr = malloc(sizeof(*instr)); instr_init(&instr->instr, nir_instr_type_jump); src_init(&instr->condition); instr->type = type; instr->target = NULL; instr->else_target = NULL; + + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -617,11 +661,13 @@ nir_load_const_instr_create(nir_shader *shader, unsigned num_components, unsigned bit_size) { nir_load_const_instr *instr = - rzalloc_size(shader, sizeof(*instr) + num_components * sizeof(*instr->value)); + calloc(1, sizeof(*instr) + num_components * sizeof(*instr->value)); instr_init(&instr->instr, nir_instr_type_load_const); nir_ssa_def_init(&instr->instr, &instr->def, num_components, bit_size); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -629,10 +675,9 @@ nir_intrinsic_instr * nir_intrinsic_instr_create(nir_shader *shader, nir_intrinsic_op op) { unsigned num_srcs = nir_intrinsic_infos[op].num_srcs; - /* TODO: don't use rzalloc */ + /* TODO: don't use calloc */ nir_intrinsic_instr *instr = - rzalloc_size(shader, - sizeof(nir_intrinsic_instr) + 
num_srcs * sizeof(nir_src)); + calloc(1, sizeof(nir_intrinsic_instr) + num_srcs * sizeof(nir_src)); instr_init(&instr->instr, nir_instr_type_intrinsic); instr->intrinsic = op; @@ -643,6 +688,8 @@ nir_intrinsic_instr_create(nir_shader *shader, nir_intrinsic_op op) for (unsigned i = 0; i < num_srcs; i++) src_init(&instr->src[i]); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -651,8 +698,7 @@ nir_call_instr_create(nir_shader *shader, nir_function *callee) { const unsigned num_params = callee->num_params; nir_call_instr *instr = - rzalloc_size(shader, sizeof(*instr) + - num_params * sizeof(instr->params[0])); + calloc(1, sizeof(*instr) + num_params * sizeof(instr->params[0])); instr_init(&instr->instr, nir_instr_type_call); instr->callee = callee; @@ -660,6 +706,8 @@ nir_call_instr_create(nir_shader *shader, nir_function *callee) for (unsigned i = 0; i < num_params; i++) src_init(&instr->params[i]); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -674,13 +722,13 @@ static int8_t default_tg4_offsets[4][2] = nir_tex_instr * nir_tex_instr_create(nir_shader *shader, unsigned num_srcs) { - nir_tex_instr *instr = rzalloc(shader, nir_tex_instr); + nir_tex_instr *instr = calloc(1, sizeof(*instr)); instr_init(&instr->instr, nir_instr_type_tex); dest_init(&instr->dest); instr->num_srcs = num_srcs; - instr->src = ralloc_array(instr, nir_tex_src, num_srcs); + instr->src = malloc(sizeof(nir_tex_src) * num_srcs); for (unsigned i = 0; i < num_srcs; i++) src_init(&instr->src[i].src); @@ -688,6 +736,8 @@ nir_tex_instr_create(nir_shader *shader, unsigned num_srcs) instr->sampler_index = 0; memcpy(instr->tg4_offsets, default_tg4_offsets, sizeof(instr->tg4_offsets)); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -696,7 +746,7 @@ nir_tex_instr_add_src(nir_tex_instr *tex, nir_tex_src_type src_type, nir_src src) { - nir_tex_src *new_srcs = rzalloc_array(tex, nir_tex_src, + nir_tex_src *new_srcs = calloc(sizeof(*new_srcs), tex->num_srcs + 1); for (unsigned i = 0; i < tex->num_srcs; i++) { @@ -705,7 +755,7 @@ nir_tex_instr_add_src(nir_tex_instr *tex, &tex->src[i].src); } - ralloc_free(tex->src); + free(tex->src); tex->src = new_srcs; tex->src[tex->num_srcs].src_type = src_type; @@ -741,11 +791,14 @@ nir_tex_instr_has_explicit_tg4_offsets(nir_tex_instr *tex) nir_phi_instr * nir_phi_instr_create(nir_shader *shader) { - nir_phi_instr *instr = ralloc(shader, nir_phi_instr); + nir_phi_instr *instr = malloc(sizeof(*instr)); instr_init(&instr->instr, nir_instr_type_phi); dest_init(&instr->dest); exec_list_make_empty(&instr->srcs); + + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -762,7 +815,7 @@ nir_phi_instr_add_src(nir_phi_instr *instr, nir_block *pred, nir_src src) { nir_phi_src *phi_src; - phi_src = rzalloc(instr, nir_phi_src); + phi_src = calloc(1, sizeof(nir_phi_src)); phi_src->pred = pred; phi_src->src = src; phi_src->src.parent_instr = &instr->instr; @@ -774,11 +827,13 @@ nir_phi_instr_add_src(nir_phi_instr *instr, nir_block *pred, nir_src src) nir_parallel_copy_instr * nir_parallel_copy_instr_create(nir_shader *shader) { - nir_parallel_copy_instr *instr = ralloc(shader, nir_parallel_copy_instr); + nir_parallel_copy_instr *instr = malloc(sizeof(*instr)); instr_init(&instr->instr, nir_instr_type_parallel_copy); exec_list_make_empty(&instr->entries); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -787,11 +842,13 @@ nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components, 
unsigned bit_size) { - nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr); + nir_ssa_undef_instr *instr = malloc(sizeof(*instr)); instr_init(&instr->instr, nir_instr_type_ssa_undef); nir_ssa_def_init(&instr->instr, &instr->def, num_components, bit_size); + list_add(&instr->instr.gc_node, &shader->gc_list); + return instr; } @@ -1091,6 +1148,54 @@ void nir_instr_remove_v(nir_instr *instr) } } +static bool free_src_indirects_cb(nir_src *src, void *state) +{ + src_free_indirects(src); + return true; +} + +static bool free_dest_indirects_cb(nir_dest *dest, void *state) +{ + dest_free_indirects(dest); + return true; +} + +void nir_instr_free(nir_instr *instr) +{ + nir_foreach_src(instr, free_src_indirects_cb, NULL); + nir_foreach_dest(instr, free_dest_indirects_cb, NULL); + + switch (instr->type) { + case nir_instr_type_tex: + free(nir_instr_as_tex(instr)->src); + break; + + case nir_instr_type_phi: { + nir_phi_instr *phi = nir_instr_as_phi(instr); + nir_foreach_phi_src_safe(phi_src, phi) { + free(phi_src); + } + break; + } + + default: + break; + } + + list_del(&instr->gc_node); + free(instr); +} + +void +nir_instr_free_list(struct exec_list *list) +{ + struct exec_node *node; + while ((node = exec_list_pop_head(list))) { + nir_instr *removed_instr = exec_node_data(nir_instr, node, node); + nir_instr_free(removed_instr); + } +} + static bool nir_instr_free_and_dce_live_cb(nir_ssa_def *def, void *state) { bool *live = state; @@ -1120,6 +1225,29 @@ static bool nir_instr_free_and_dce_is_live(nir_instr *instr) return live; } +static bool +nir_instr_dce_add_dead_srcs_cb(nir_src *src, void *state) +{ + nir_instr_worklist *wl = state; + + if (src->is_ssa) { + list_del(&src->use_link); + if (!nir_instr_free_and_dce_is_live(src->ssa->parent_instr)) + nir_instr_worklist_push_tail(wl, src->ssa->parent_instr); + + /* Stop nir_instr_remove from trying to delete the link again. */ + src->ssa = NULL; + } + + return true; +} + +static void +nir_instr_dce_add_dead_ssa_srcs(nir_instr_worklist *wl, nir_instr *instr) +{ + nir_foreach_src(instr, nir_instr_dce_add_dead_srcs_cb, wl); +} + /** * Frees an instruction and any SSA defs that it used that are now dead, * returning a nir_cursor where the instruction previously was. @@ -1129,7 +1257,7 @@ nir_instr_free_and_dce(nir_instr *instr) { nir_instr_worklist *worklist = nir_instr_worklist_create(); - nir_instr_worklist_add_ssa_srcs(worklist, instr); + nir_instr_dce_add_dead_ssa_srcs(worklist, instr); nir_cursor c = nir_instr_remove(instr); struct exec_list to_free; @@ -1137,27 +1265,21 @@ nir_instr_free_and_dce(nir_instr *instr) nir_instr *dce_instr; while ((dce_instr = nir_instr_worklist_pop_head(worklist))) { - if (!nir_instr_free_and_dce_is_live(dce_instr)) { - nir_instr_worklist_add_ssa_srcs(worklist, dce_instr); + nir_instr_dce_add_dead_ssa_srcs(worklist, dce_instr); - /* If we're removing the instr where our cursor is, then we have to - * point the cursor elsewhere. - */ - if ((c.option == nir_cursor_before_instr || - c.option == nir_cursor_after_instr) && - c.instr == dce_instr) - c = nir_instr_remove(dce_instr); - else - nir_instr_remove(dce_instr); - exec_list_push_tail(&to_free, &dce_instr->node); - } + /* If we're removing the instr where our cursor is, then we have to + * point the cursor elsewhere. 
+ */ + if ((c.option == nir_cursor_before_instr || + c.option == nir_cursor_after_instr) && + c.instr == dce_instr) + c = nir_instr_remove(dce_instr); + else + nir_instr_remove(dce_instr); + exec_list_push_tail(&to_free, &dce_instr->node); } - struct exec_node *node; - while ((node = exec_list_pop_head(&to_free))) { - nir_instr *removed_instr = exec_node_data(nir_instr, node, node); - ralloc_free(removed_instr); - } + nir_instr_free_list(&to_free); nir_instr_worklist_destroy(worklist); @@ -1355,12 +1477,18 @@ nir_src_is_dynamically_uniform(nir_src src) if (src.ssa->parent_instr->type == nir_instr_type_load_const) return true; - /* As are uniform variables */ if (src.ssa->parent_instr->type == nir_instr_type_intrinsic) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src.ssa->parent_instr); + /* As are uniform variables */ if (intr->intrinsic == nir_intrinsic_load_uniform && nir_src_is_dynamically_uniform(intr->src[0])) return true; + /* Push constant loads always use uniform offsets. */ + if (intr->intrinsic == nir_intrinsic_load_push_constant) + return true; + if (intr->intrinsic == nir_intrinsic_load_deref && + nir_deref_mode_is(nir_src_as_deref(intr->src[0]), nir_var_mem_push_const)) + return true; } /* Operating together dynamically uniform expressions produces a @@ -1423,7 +1551,7 @@ nir_instr_rewrite_src(nir_instr *instr, nir_src *src, nir_src new_src) assert(!src_is_valid(src) || src->parent_instr == instr); src_remove_all_uses(src); - *src = new_src; + nir_src_copy(src, &new_src); src_add_all_uses(src, instr, NULL); } @@ -1433,6 +1561,7 @@ nir_instr_move_src(nir_instr *dest_instr, nir_src *dest, nir_src *src) assert(!src_is_valid(dest) || dest->parent_instr == dest_instr); src_remove_all_uses(dest); + src_free_indirects(dest); src_remove_all_uses(src); *dest = *src; *src = NIR_SRC_INIT; @@ -1446,7 +1575,7 @@ nir_if_rewrite_condition(nir_if *if_stmt, nir_src new_src) assert(!src_is_valid(src) || src->parent_if == if_stmt); src_remove_all_uses(src); - *src = new_src; + nir_src_copy(src, &new_src); src_add_all_uses(src, NULL, if_stmt); } @@ -1465,7 +1594,7 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest) /* We can't re-write with an SSA def */ assert(!new_dest.is_ssa); - nir_dest_copy(dest, &new_dest, instr); + nir_dest_copy(dest, &new_dest); dest->reg.parent_instr = instr; list_addtail(&dest->reg.def_link, &new_dest.reg.reg->defs); @@ -1601,32 +1730,42 @@ get_store_value(nir_intrinsic_instr *intrin) return intrin->src[0].ssa; } +nir_component_mask_t +nir_src_components_read(const nir_src *src) +{ + assert(src->is_ssa && src->parent_instr); + + if (src->parent_instr->type == nir_instr_type_alu) { + nir_alu_instr *alu = nir_instr_as_alu(src->parent_instr); + nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src); + int src_idx = alu_src - &alu->src[0]; + assert(src_idx >= 0 && src_idx < nir_op_infos[alu->op].num_inputs); + return nir_alu_instr_src_read_mask(alu, src_idx); + } else if (src->parent_instr->type == nir_instr_type_intrinsic) { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr); + if (nir_intrinsic_has_write_mask(intrin) && src->ssa == get_store_value(intrin)) + return nir_intrinsic_write_mask(intrin); + else + return (1 << src->ssa->num_components) - 1; + } else { + return (1 << src->ssa->num_components) - 1; + } +} + nir_component_mask_t nir_ssa_def_components_read(const nir_ssa_def *def) { nir_component_mask_t read_mask = 0; - nir_foreach_use(use, def) { - if (use->parent_instr->type == nir_instr_type_alu) { - 
nir_alu_instr *alu = nir_instr_as_alu(use->parent_instr); - nir_alu_src *alu_src = exec_node_data(nir_alu_src, use, src); - int src_idx = alu_src - &alu->src[0]; - assert(src_idx >= 0 && src_idx < nir_op_infos[alu->op].num_inputs); - read_mask |= nir_alu_instr_src_read_mask(alu, src_idx); - } else if (use->parent_instr->type == nir_instr_type_intrinsic) { - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(use->parent_instr); - if (nir_intrinsic_has_write_mask(intrin) && use->ssa == get_store_value(intrin)) { - read_mask |= nir_intrinsic_write_mask(intrin); - } else { - return (1 << def->num_components) - 1; - } - } else { - return (1 << def->num_components) - 1; - } - } if (!list_is_empty(&def->if_uses)) read_mask |= 1; + nir_foreach_use(use, def) { + read_mask |= nir_src_components_read(use); + if (read_mask == (1 << def->num_components) - 1) + return read_mask; + } + return read_mask; } @@ -2098,6 +2237,17 @@ nir_shader_lower_instructions(nir_shader *shader, return progress; } +/** + * Returns true if the shader supports quad-based implicit derivatives on + * texture sampling. + */ +bool nir_shader_supports_implicit_lod(nir_shader *shader) +{ + return (shader->info.stage == MESA_SHADER_FRAGMENT || + (shader->info.stage == MESA_SHADER_COMPUTE && + shader->info.cs.derivative_group != DERIVATIVE_GROUP_NONE)); +} + nir_intrinsic_op nir_intrinsic_from_system_value(gl_system_value val) { diff --git a/mesa 3D driver/src/compiler/nir/nir.h b/mesa 3D driver/src/compiler/nir/nir.h index 0ee29855fd..9325f821ba 100644 --- a/mesa 3D driver/src/compiler/nir/nir.h +++ b/mesa 3D driver/src/compiler/nir/nir.h @@ -120,26 +120,35 @@ typedef struct { } nir_state_slot; typedef enum { - nir_var_shader_in = (1 << 0), - nir_var_shader_out = (1 << 1), - nir_var_shader_temp = (1 << 2), - nir_var_function_temp = (1 << 3), - nir_var_uniform = (1 << 4), - nir_var_mem_ubo = (1 << 5), - nir_var_system_value = (1 << 6), - nir_var_mem_ssbo = (1 << 7), - nir_var_mem_shared = (1 << 8), - nir_var_mem_global = (1 << 9), + nir_var_system_value = (1 << 0), + nir_var_uniform = (1 << 1), + nir_var_shader_in = (1 << 2), + nir_var_shader_out = (1 << 3), + nir_var_image = (1 << 4), + /** Incoming call or ray payload data for ray-tracing shaders */ + nir_var_shader_call_data = (1 << 5), + /** Ray hit attributes */ + nir_var_ray_hit_attrib = (1 << 6), + + /* Modes named nir_var_mem_* have explicit data layout */ + nir_var_mem_ubo = (1 << 7), + nir_var_mem_push_const = (1 << 8), + nir_var_mem_ssbo = (1 << 9), + nir_var_mem_constant = (1 << 10), + + /* Generic modes intentionally come last. See encode_deref_modes() in + * nir_serialize.c for more details.
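 *
 * Editor's note (assumed usage, not from the patch): keeping the generic
 * modes in adjacent bits makes nir_var_mem_generic a plain OR of the four,
 * so a whole class of modes can be tested with a single mask, e.g.:
 *
 *    if (nir_deref_mode_is_one_of(deref, nir_var_mem_generic))
 *       ; // deref touches temp, shared, or global memory
 *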
+ */ + nir_var_shader_temp = (1 << 11), + nir_var_function_temp = (1 << 12), + nir_var_mem_shared = (1 << 13), + nir_var_mem_global = (1 << 14), + nir_var_mem_generic = (nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | nir_var_mem_global), - nir_var_mem_push_const = (1 << 10), /* not actually used for variables */ - nir_var_mem_constant = (1 << 11), - /** Incoming call or ray payload data for ray-tracing shaders */ - nir_var_shader_call_data = (1 << 12), - /** Ray hit attributes */ - nir_var_ray_hit_attrib = (1 << 13), + nir_var_read_only_modes = nir_var_shader_in | nir_var_uniform | nir_var_system_value | nir_var_mem_constant | nir_var_mem_ubo, @@ -147,7 +156,7 @@ typedef enum { nir_var_vec_indexable_modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global | nir_var_mem_push_const, - nir_num_variable_modes = 14, + nir_num_variable_modes = 15, nir_var_all = (1 << nir_num_variable_modes) - 1, } nir_variable_mode; MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(nir_variable_mode) @@ -356,7 +365,7 @@ typedef struct nir_variable { * * \sa nir_variable_mode */ - unsigned mode:14; + unsigned mode:15; /** * Is the variable read-only? @@ -494,6 +503,12 @@ typedef struct nir_variable { */ unsigned per_view:1; + /** + * Whether the variable is per-primitive. + * Can be used by Mesh Shader outputs and corresponding Fragment Shader inputs. + */ + unsigned per_primitive:1; + /** * \brief Layout qualifier for gl_FragDepth. See nir_depth_layout. * @@ -546,6 +561,9 @@ typedef struct nir_variable { * - Geometry shader output: one of the values from \c gl_varying_slot. * - Fragment shader input: one of the values from \c gl_varying_slot. * - Fragment shader output: one of the values from \c gl_frag_result. + * - Task shader output: one of the values from \c gl_varying_slot. + * - Mesh shader input: one of the values from \c gl_varying_slot. + * - Mesh shader output: one of the values from \c gl_varying_slot. * - Uniforms: Per-stage uniform slot number for default uniform block. * - Uniforms: Index within the uniform block definition for UBO members. * - Non-UBO Uniforms: uniform slot number. @@ -558,7 +576,8 @@ typedef struct nir_variable { /** * The actual location of the variable in the IR. Only valid for inputs, - * outputs, and uniforms (including samplers and images). + * outputs, uniforms (including samplers and images), and for UBO and SSBO + * variables in GLSL.
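 *
 * Editor's sketch (hypothetical usage, not from the patch) of the new
 * per_primitive bit: a mesh shader would declare a per-primitive output as
 * an arrayed output variable with the flag set (max_prims is assumed):
 *
 *    nir_variable *var =
 *       nir_variable_create(shader, nir_var_shader_out,
 *                           glsl_array_type(glsl_vec4_type(), max_prims, 0),
 *                           "prim_color");
 *    var->data.location = VARYING_SLOT_VAR0;
 *    var->data.per_primitive = 1;
 *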
*/ unsigned driver_location; @@ -704,6 +723,12 @@ _nir_shader_variable_has_mode(nir_variable *var, unsigned modes) #define nir_foreach_uniform_variable_safe(var, shader) \ nir_foreach_variable_with_modes_safe(var, shader, nir_var_uniform) +#define nir_foreach_image_variable(var, shader) \ + nir_foreach_variable_with_modes(var, shader, nir_var_image) + +#define nir_foreach_image_variable_safe(var, shader) \ + nir_foreach_variable_with_modes_safe(var, shader, nir_var_image) + static inline bool nir_variable_is_global(const nir_variable *var) { @@ -758,6 +783,7 @@ typedef enum PACKED { typedef struct nir_instr { struct exec_node node; + struct list_head gc_node; struct nir_block *block; nir_instr_type type; @@ -1016,8 +1042,8 @@ nir_is_sequential_comp_swizzle(uint8_t *swiz, unsigned nr_comp) return true; } -void nir_src_copy(nir_src *dest, const nir_src *src, void *instr_or_if); -void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr); +void nir_src_copy(nir_src *dest, const nir_src *src); +void nir_dest_copy(nir_dest *dest, const nir_dest *src); typedef struct { /** Base source */ @@ -1159,6 +1185,7 @@ nir_get_nir_type_for_glsl_base_type(enum glsl_base_type base_type) break; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: case GLSL_TYPE_ATOMIC_UINT: case GLSL_TYPE_STRUCT: @@ -1446,10 +1473,8 @@ typedef struct nir_alu_instr { nir_alu_src src[]; } nir_alu_instr; -void nir_alu_src_copy(nir_alu_src *dest, const nir_alu_src *src, - nir_alu_instr *instr); -void nir_alu_dest_copy(nir_alu_dest *dest, const nir_alu_dest *src, - nir_alu_instr *instr); +void nir_alu_src_copy(nir_alu_src *dest, const nir_alu_src *src); +void nir_alu_dest_copy(nir_alu_dest *dest, const nir_alu_dest *src); bool nir_alu_instr_is_copy(nir_alu_instr *instr); @@ -2172,8 +2197,8 @@ typedef enum { * identical. 
*/ nir_texop_tex_prefetch, /**< Regular texture look-up, eligible for pre-dispatch */ - nir_texop_fragment_fetch, /**< Multisample fragment color texture fetch */ - nir_texop_fragment_mask_fetch,/**< Multisample fragment mask texture fetch */ + nir_texop_fragment_fetch_amd, /**< Multisample fragment color texture fetch */ + nir_texop_fragment_mask_fetch_amd, /**< Multisample fragment mask texture fetch */ } nir_texop; /** Represents a texture instruction */ @@ -2350,7 +2375,7 @@ nir_tex_instr_result_size(const nir_tex_instr *instr) case nir_texop_texture_samples: case nir_texop_query_levels: case nir_texop_samples_identical: - case nir_texop_fragment_mask_fetch: + case nir_texop_fragment_mask_fetch_amd: return 1; default: @@ -2938,6 +2963,21 @@ nir_block_ends_in_jump(nir_block *block) nir_block_last_instr(block)->type == nir_instr_type_jump; } +static inline bool +nir_block_ends_in_return_or_halt(nir_block *block) +{ + if (exec_list_is_empty(&block->instr_list)) + return false; + + nir_instr *instr = nir_block_last_instr(block); + if (instr->type != nir_instr_type_jump) + return false; + + nir_jump_instr *jump_instr = nir_instr_as_jump(instr); + return jump_instr->type == nir_jump_return || + jump_instr->type == nir_jump_halt; +} + static inline bool nir_block_ends_in_break(nir_block *block) { @@ -3391,6 +3431,16 @@ typedef enum { nir_divergence_multiple_workgroup_per_compute_subgroup = (1 << 5), } nir_divergence_options; +typedef enum { + nir_pack_varying_interp_mode_none = (1 << 0), + nir_pack_varying_interp_mode_smooth = (1 << 1), + nir_pack_varying_interp_mode_flat = (1 << 2), + nir_pack_varying_interp_mode_noperspective = (1 << 3), + nir_pack_varying_interp_loc_sample = (1 << 16), + nir_pack_varying_interp_loc_centroid = (1 << 17), + nir_pack_varying_interp_loc_center = (1 << 18), +} nir_pack_varying_options; + /** An instruction filtering callback * * Returns true if the instruction should be processed and false otherwise. @@ -3590,7 +3640,7 @@ typedef struct nir_shader_compiler_options { bool lower_hadd64; /** - * Set if nir_op_add_sat and nir_op_usub_sat should be lowered to simple + * Set if nir_op_uadd_sat and nir_op_usub_sat should be lowered to simple * arithmetic. * * If this flag is set, the lowering will be applied to all bit-sizes of @@ -3598,7 +3648,7 @@ typedef struct nir_shader_compiler_options { * * \sa ::lower_usub_sat64 */ - bool lower_add_sat; + bool lower_uadd_sat; /** * Set if only 64-bit nir_op_usub_sat should be lowered to simple @@ -3608,6 +3658,15 @@ typedef struct nir_shader_compiler_options { */ bool lower_usub_sat64; + /** + * Set if nir_op_iadd_sat and nir_op_isub_sat should be lowered to simple + * arithmetic. + * + * If this flag is set, the lowering will be applied to all bit-sizes of + * these instructions. + */ + bool lower_iadd_sat; + /** * Should IO be re-vectorized? Some scalar ISAs still operate on vec4's * for IO purposes and would prefer loads/stores be vectorized. @@ -3688,6 +3747,15 @@ typedef struct nir_shader_compiler_options { * for rect texture lowering. */ bool has_txs; + /** Backend supports sdot_4x8 and udot_4x8 opcodes. */ + bool has_dot_4x8; + + /** Backend supports sudot_4x8 opcodes. */ + bool has_sudot_4x8; + + /** Backend supports sdot_2x16 and udot_2x16 opcodes. */ + bool has_dot_2x16; + /* Whether to generate only scoped_barrier intrinsics instead of the set of * memory and control barrier intrinsics based on GLSL. 
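 *
 * Editor's sketch (assumed shape, not the actual lowering code) of the
 * arithmetic the renamed lower_uadd_sat flag requests from the compiler:
 *
 *    nir_ssa_def *sum = nir_iadd(b, x, y);
 *    // unsigned overflow happened iff the wrapped sum is below an operand
 *    sum = nir_bcsel(b, nir_ult(b, sum, x),
 *                    nir_imm_intN_t(b, -1, sum->bit_size),  // UINT_MAX
 *                    sum);
 *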
*/ @@ -3734,6 +3802,13 @@ typedef struct nir_shader_compiler_options { nir_lower_int64_options lower_int64_options; nir_lower_doubles_options lower_doubles_options; nir_divergence_options divergence_analysis_options; + + /** + * Support packing varyings with different interpolation locations + * (center, centroid, sample) and modes (flat, noperspective, smooth) + * into the same slot. + */ + nir_pack_varying_options pack_varying_options; } nir_shader_compiler_options; typedef struct nir_shader { @@ -3752,6 +3827,8 @@ typedef struct nir_shader { struct exec_list functions; /** < list of nir_function */ + struct list_head gc_list; /** < list of all nir_instrs allocated on the shader but not yet freed. */ + /** * The size of the variable space for load_input_*, load_uniform_*, etc. * intrinsics. This is in back-end specific units which is likely one of @@ -4165,6 +4242,8 @@ nir_instr_insert_after_cf_list(struct exec_list *list, nir_instr *after) } void nir_instr_remove_v(nir_instr *instr); +void nir_instr_free(nir_instr *instr); +void nir_instr_free_list(struct exec_list *list); static inline nir_cursor nir_instr_remove(nir_instr *instr) @@ -4264,6 +4343,7 @@ void nir_ssa_def_rewrite_uses_src(nir_ssa_def *def, nir_src new_src); void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_ssa_def *new_ssa, nir_instr *after_me); +nir_component_mask_t nir_src_components_read(const nir_src *src); nir_component_mask_t nir_ssa_def_components_read(const nir_ssa_def *def); static inline bool @@ -4984,6 +5064,16 @@ typedef struct nir_lower_compute_system_values_options { bool nir_lower_compute_system_values(nir_shader *shader, const nir_lower_compute_system_values_options *options); +struct nir_lower_sysvals_to_varyings_options { + bool frag_coord:1; + bool front_face:1; + bool point_coord:1; +}; + +bool +nir_lower_sysvals_to_varyings(nir_shader *shader, + const struct nir_lower_sysvals_to_varyings_options *options); + enum PACKED nir_lower_tex_packing { /** No packing */ nir_lower_tex_packing_none = 0, @@ -5158,6 +5248,12 @@ typedef struct nir_lower_tex_options { */ bool lower_tg4_offsets; + /** + * Lower txf_ms to fragment_mask_fetch and fragment_fetch, and + * samples_identical to fragment_mask_fetch. + */ + bool lower_to_fragment_fetch_amd; + /** * To lower packed sampler return formats.
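 *
 * Editor's note (hypothetical driver opt-in, not from the patch): a backend
 * that can mix interpolation qualifiers within one slot would set e.g.
 *
 *    options->pack_varying_options = nir_pack_varying_interp_mode_smooth |
 *                                    nir_pack_varying_interp_mode_flat |
 *                                    nir_pack_varying_interp_loc_center;
 *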
* @@ -5379,9 +5475,12 @@ bool nir_shader_uses_view_index(nir_shader *shader); bool nir_can_lower_multiview(nir_shader *shader); bool nir_lower_multiview(nir_shader *shader, uint32_t view_mask); + bool nir_lower_fp16_casts(nir_shader *shader); bool nir_normalize_cubemap_coords(nir_shader *shader); +bool nir_shader_supports_implicit_lod(nir_shader *shader); + void nir_live_ssa_defs_impl(nir_function_impl *impl); const BITSET_WORD *nir_get_live_ssa_defs(nir_cursor cursor, void *mem_ctx); diff --git a/mesa 3D driver/src/compiler/nir/nir_builder.h b/mesa 3D driver/src/compiler/nir/nir_builder.h index 962fdf0faf..dc7ccd477e 100644 --- a/mesa 3D driver/src/compiler/nir/nir_builder.h +++ b/mesa 3D driver/src/compiler/nir/nir_builder.h @@ -1232,6 +1232,8 @@ nir_ssa_for_src(nir_builder *build, nir_src src, int num_components) if (src.is_ssa && src.ssa->num_components == num_components) return src.ssa; + assert((unsigned)num_components <= nir_src_num_components(src)); + nir_alu_src alu = { NIR_SRC_INIT }; alu.src = src; for (int j = 0; j < NIR_MAX_VEC_COMPONENTS; j++) diff --git a/mesa 3D driver/src/compiler/nir/nir_builtin_builder.c b/mesa 3D driver/src/compiler/nir/nir_builtin_builder.c index d2fa24a457..da332e72de 100644 --- a/mesa 3D driver/src/compiler/nir/nir_builtin_builder.c +++ b/mesa 3D driver/src/compiler/nir/nir_builtin_builder.c @@ -345,7 +345,7 @@ nir_get_texture_size(nir_builder *b, nir_tex_instr *tex) tex->src[i].src_type == nir_tex_src_sampler_offset || tex->src[i].src_type == nir_tex_src_texture_handle || tex->src[i].src_type == nir_tex_src_sampler_handle) { - nir_src_copy(&txs->src[idx].src, &tex->src[i].src, txs); + nir_src_copy(&txs->src[idx].src, &tex->src[i].src); txs->src[idx].src_type = tex->src[i].src_type; idx++; } @@ -400,7 +400,7 @@ nir_get_texture_lod(nir_builder *b, nir_tex_instr *tex) tex->src[i].src_type == nir_tex_src_sampler_offset || tex->src[i].src_type == nir_tex_src_texture_handle || tex->src[i].src_type == nir_tex_src_sampler_handle) { - nir_src_copy(&tql->src[idx].src, &tex->src[i].src, tql); + nir_src_copy(&tql->src[idx].src, &tex->src[i].src); tql->src[idx].src_type = tex->src[i].src_type; idx++; } diff --git a/mesa 3D driver/src/compiler/nir/nir_clone.c b/mesa 3D driver/src/compiler/nir/nir_clone.c index db46d40502..fd7184fddc 100644 --- a/mesa 3D driver/src/compiler/nir/nir_clone.c +++ b/mesa 3D driver/src/compiler/nir/nir_clone.c @@ -243,7 +243,7 @@ __clone_src(clone_state *state, void *ninstr_or_if, } else { nsrc->reg.reg = remap_reg(state, src->reg.reg); if (src->reg.indirect) { - nsrc->reg.indirect = ralloc(ninstr_or_if, nir_src); + nsrc->reg.indirect = malloc(sizeof(nir_src)); __clone_src(state, ninstr_or_if, nsrc->reg.indirect, src->reg.indirect); } nsrc->reg.base_offset = src->reg.base_offset; @@ -263,7 +263,7 @@ __clone_dst(clone_state *state, nir_instr *ninstr, } else { ndst->reg.reg = remap_reg(state, dst->reg.reg); if (dst->reg.indirect) { - ndst->reg.indirect = ralloc(ninstr, nir_src); + ndst->reg.indirect = malloc(sizeof(nir_src)); __clone_src(state, ninstr, ndst->reg.indirect, dst->reg.indirect); } ndst->reg.base_offset = dst->reg.base_offset; @@ -790,6 +790,10 @@ nir_shader_replace(nir_shader *dst, nir_shader *src) ralloc_adopt(dead_ctx, dst); ralloc_free(dead_ctx); + list_for_each_entry_safe(nir_instr, instr, &dst->gc_list, gc_node) { + nir_instr_free(instr); + } + /* Re-parent all of src's ralloc children to dst */ ralloc_adopt(dst, src); @@ -798,6 +802,8 @@ nir_shader_replace(nir_shader *dst, nir_shader *src) /* We have to move all the 
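 *
 * Editor's note (simplified restatement, assumptions marked): the accepted
 * casts now amount to
 *
 *    glsl_type_is_sampler(parent_type) &&
 *    (cast_type == glsl_bare_sampler_type() ||
 *     cast_type == glsl_sampler_type_to_texture(parent_type))
 *
 * i.e. sampler2D -> sampler, or sampler2D -> texture2D.
 *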
linked lists over separately because we need the * pointers in the list elements to point to the lists in dst and not src. */ + list_replace(&src->gc_list, &dst->gc_list); + list_inithead(&src->gc_list); exec_list_move_nodes_to(&src->variables, &dst->variables); /* Now move the functions over. This takes a tiny bit more work */ diff --git a/mesa 3D driver/src/compiler/nir/nir_control_flow.c b/mesa 3D driver/src/compiler/nir/nir_control_flow.c index 6cf0f1add5..dd5e7982a2 100644 --- a/mesa 3D driver/src/compiler/nir/nir_control_flow.c +++ b/mesa 3D driver/src/compiler/nir/nir_control_flow.c @@ -236,7 +236,7 @@ nir_insert_phi_undef(nir_block *block, nir_block *pred) nir_phi_instr *phi = nir_instr_as_phi(instr); nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(ralloc_parent(phi), + nir_ssa_undef_instr_create(impl->function->shader, phi->dest.ssa.num_components, phi->dest.ssa.bit_size); nir_instr_insert_before_cf_list(&impl->body, &undef->instr); @@ -440,6 +440,7 @@ remove_phi_src(nir_block *block, nir_block *pred) if (src->pred == pred) { list_del(&src->src.use_link); exec_node_remove(&src->node); + free(src); } } } @@ -607,10 +608,10 @@ static bool replace_ssa_def_uses(nir_ssa_def *def, void *void_impl) { nir_function_impl *impl = void_impl; - void *mem_ctx = ralloc_parent(impl); nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(mem_ctx, def->num_components, + nir_ssa_undef_instr_create(impl->function->shader, + def->num_components, def->bit_size); nir_instr_insert_before_cf_list(&impl->body, &undef->instr); nir_ssa_def_rewrite_uses(def, &undef->def); diff --git a/mesa 3D driver/src/compiler/nir/nir_deref.c b/mesa 3D driver/src/compiler/nir/nir_deref.c index dd41608f32..39d7b71ca4 100644 --- a/mesa 3D driver/src/compiler/nir/nir_deref.c +++ b/mesa 3D driver/src/compiler/nir/nir_deref.c @@ -709,7 +709,7 @@ rematerialize_deref_in_block(nir_deref_instr *deref, parent = rematerialize_deref_in_block(parent, state); new_deref->parent = nir_src_for_ssa(&parent->dest.ssa); } else { - nir_src_copy(&new_deref->parent, &deref->parent, new_deref); + nir_src_copy(&new_deref->parent, &deref->parent); } } @@ -726,7 +726,7 @@ rematerialize_deref_in_block(nir_deref_instr *deref, case nir_deref_type_array: case nir_deref_type_ptr_as_array: assert(!nir_src_as_deref(deref->arr.index)); - nir_src_copy(&new_deref->arr.index, &deref->arr.index, new_deref); + nir_src_copy(&new_deref->arr.index, &deref->arr.index); break; case nir_deref_type_struct: @@ -1012,14 +1012,17 @@ opt_remove_sampler_cast(nir_deref_instr *cast) cast_type = glsl_get_array_element(cast_type); } - if (glsl_type_is_array(parent_type) || glsl_type_is_array(cast_type)) + if (!glsl_type_is_sampler(parent_type)) return false; - if (!glsl_type_is_sampler(parent_type) || - cast_type != glsl_bare_sampler_type()) + if (cast_type != glsl_bare_sampler_type() && + (glsl_type_is_bare_sampler(parent_type) || + cast_type != glsl_sampler_type_to_texture(parent_type))) return false; - /* We're a cast from a more detailed sampler type to a bare sampler */ + /* We're a cast from a more detailed sampler type to a bare sampler or a + * texture type with the same dimensionality. 
+ */ nir_ssa_def_rewrite_uses(&cast->dest.ssa, &parent->dest.ssa); nir_instr_remove(&cast->instr); diff --git a/mesa 3D driver/src/compiler/nir/nir_divergence_analysis.c b/mesa 3D driver/src/compiler/nir/nir_divergence_analysis.c index 9aecf8aee1..9cda1cd6c3 100644 --- a/mesa 3D driver/src/compiler/nir/nir_divergence_analysis.c +++ b/mesa 3D driver/src/compiler/nir/nir_divergence_analysis.c @@ -105,6 +105,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) case nir_intrinsic_load_workgroup_size: case nir_intrinsic_load_subgroup_id: case nir_intrinsic_load_num_subgroups: + case nir_intrinsic_load_ray_launch_size: case nir_intrinsic_load_subgroup_size: case nir_intrinsic_load_subgroup_eq_mask: case nir_intrinsic_load_subgroup_ge_mask: @@ -167,7 +168,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) is_divergent |= !(options & nir_divergence_single_prim_per_subgroup); else if (stage == MESA_SHADER_TESS_EVAL) is_divergent |= !(options & nir_divergence_single_patch_per_tes_subgroup); - else + else if (stage != MESA_SHADER_MESH) is_divergent = true; break; case nir_intrinsic_load_per_vertex_input: @@ -186,18 +187,33 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) is_divergent |= !(options & nir_divergence_single_prim_per_subgroup); break; case nir_intrinsic_load_output: - assert(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT); is_divergent = instr->src[0].ssa->divergent; - if (stage == MESA_SHADER_TESS_CTRL) + switch (stage) { + case MESA_SHADER_TESS_CTRL: is_divergent |= !(options & nir_divergence_single_patch_per_tcs_subgroup); - else + break; + case MESA_SHADER_FRAGMENT: is_divergent = true; + break; + case MESA_SHADER_TASK: + case MESA_SHADER_MESH: + /* Divergent if src[0] is, so nothing else to do. 
*/ + break; + default: + unreachable("Invalid stage for load_output"); + } break; case nir_intrinsic_load_per_vertex_output: - assert(stage == MESA_SHADER_TESS_CTRL); + assert(stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_MESH); is_divergent = instr->src[0].ssa->divergent || instr->src[1].ssa->divergent || - !(options & nir_divergence_single_patch_per_tcs_subgroup); + (stage == MESA_SHADER_TESS_CTRL && + !(options & nir_divergence_single_patch_per_tcs_subgroup)); + break; + case nir_intrinsic_load_per_primitive_output: + assert(stage == MESA_SHADER_MESH); + is_divergent = instr->src[0].ssa->divergent || + instr->src[1].ssa->divergent; break; case nir_intrinsic_load_layer_id: case nir_intrinsic_load_front_face: @@ -245,8 +261,9 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_workgroup_id: - assert(stage == MESA_SHADER_COMPUTE); - is_divergent |= (options & nir_divergence_multiple_workgroup_per_compute_subgroup); + assert(gl_shader_stage_uses_workgroup(stage)); + if (stage == MESA_SHADER_COMPUTE) + is_divergent |= (options & nir_divergence_multiple_workgroup_per_compute_subgroup); break; /* Clustered reductions are uniform if cluster_size == subgroup_size or @@ -515,6 +532,8 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr) case nir_intrinsic_load_packed_passthrough_primitive_amd: case nir_intrinsic_load_initial_edgeflags_amd: case nir_intrinsic_gds_atomic_add_amd: + case nir_intrinsic_load_rt_arg_scratch_offset_amd: + case nir_intrinsic_load_intersection_opaque_amd: is_divergent = true; break; @@ -584,6 +603,7 @@ nir_variable_mode_is_uniform(nir_variable_mode mode) { case nir_var_mem_ssbo: case nir_var_mem_shared: case nir_var_mem_global: + case nir_var_image: return true; default: return false; diff --git a/mesa 3D driver/src/compiler/nir/nir_from_ssa.c b/mesa 3D driver/src/compiler/nir/nir_from_ssa.c index 6bfeed23b3..7e69487bd2 100644 --- a/mesa 3D driver/src/compiler/nir/nir_from_ssa.c +++ b/mesa 3D driver/src/compiler/nir/nir_from_ssa.c @@ -38,6 +38,7 @@ struct from_ssa_state { nir_builder builder; void *dead_ctx; + struct exec_list dead_instrs; bool phi_webs_only; struct hash_table *merge_node_table; nir_instr *instr; @@ -300,9 +301,8 @@ merge_sets_interfere(merge_set *a, merge_set *b) } static bool -add_parallel_copy_to_end_of_block(nir_block *block, void *dead_ctx) +add_parallel_copy_to_end_of_block(nir_shader *shader, nir_block *block, void *dead_ctx) { - bool need_end_copy = false; if (block->successors[0]) { nir_instr *instr = nir_block_first_instr(block->successors[0]); @@ -322,7 +322,7 @@ add_parallel_copy_to_end_of_block(nir_block *block, void *dead_ctx) * (if there is one). */ nir_parallel_copy_instr *pcopy = - nir_parallel_copy_instr_create(dead_ctx); + nir_parallel_copy_instr_create(shader); nir_instr_insert(nir_after_block_before_jump(block), &pcopy->instr); } @@ -378,7 +378,7 @@ get_parallel_copy_at_end_of_block(nir_block *block) * time because of potential back-edges in the CFG. */ static bool -isolate_phi_nodes_block(nir_block *block, void *dead_ctx) +isolate_phi_nodes_block(nir_shader *shader, nir_block *block, void *dead_ctx) { nir_instr *last_phi_instr = NULL; nir_foreach_instr(instr, block) { @@ -397,7 +397,7 @@ isolate_phi_nodes_block(nir_block *block, void *dead_ctx) * start of this block but after the phi nodes. 
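 *
 * Editor's note (assumed minimal usage, not from the patch): the new
 * dead_instrs exec_list above replaces ralloc_steal for instructions this
 * pass unlinks; the recurring pattern is
 *
 *    nir_instr_remove(instr);                                 // out of the block
 *    exec_list_push_tail(&state->dead_instrs, &instr->node);  // park it
 *    ...
 *    nir_instr_free_list(&state->dead_instrs);                // one sweep at the end
 *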
*/ nir_parallel_copy_instr *block_pcopy = - nir_parallel_copy_instr_create(dead_ctx); + nir_parallel_copy_instr_create(shader); nir_instr_insert_after(last_phi_instr, &block_pcopy->instr); nir_foreach_instr(instr, block) { @@ -587,7 +587,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state) */ nir_instr *parent_instr = def->parent_instr; nir_instr_remove(parent_instr); - ralloc_steal(state->dead_ctx, parent_instr); + exec_list_push_tail(&state->dead_instrs, &parent_instr->node); state->progress = true; return true; } @@ -616,7 +616,7 @@ resolve_registers_block(nir_block *block, struct from_ssa_state *state) if (instr->type == nir_instr_type_phi) { nir_instr_remove(instr); - ralloc_steal(state->dead_ctx, instr); + exec_list_push_tail(&state->dead_instrs, &instr->node); state->progress = true; } } @@ -638,7 +638,7 @@ emit_copy(nir_builder *b, nir_src src, nir_src dest_src) assert(src.reg.reg->num_components >= dest_src.reg.reg->num_components); nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_mov); - nir_src_copy(&mov->src[0].src, &src, mov); + nir_src_copy(&mov->src[0].src, &src); mov->dest.dest = nir_dest_for_reg(dest_src.reg.reg); mov->dest.write_mask = (1 << dest_src.reg.reg->num_components) - 1; @@ -683,6 +683,7 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy, if (num_copies == 0) { /* Hooray, we don't need any copies! */ nir_instr_remove(&pcopy->instr); + exec_list_push_tail(&state->dead_instrs, &pcopy->instr.node); return; } @@ -825,6 +826,7 @@ resolve_parallel_copy(nir_parallel_copy_instr *pcopy, } nir_instr_remove(&pcopy->instr); + exec_list_push_tail(&state->dead_instrs, &pcopy->instr.node); } /* Resolves the parallel copies in a block. Each block can have at most @@ -864,6 +866,8 @@ resolve_parallel_copies_block(nir_block *block, struct from_ssa_state *state) static bool nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only) { + nir_shader *shader = impl->function->shader; + struct from_ssa_state state; nir_builder_init(&state.builder, impl); @@ -871,13 +875,14 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only) state.phi_webs_only = phi_webs_only; state.merge_node_table = _mesa_pointer_hash_table_create(NULL); state.progress = false; + exec_list_make_empty(&state.dead_instrs); nir_foreach_block(block, impl) { - add_parallel_copy_to_end_of_block(block, state.dead_ctx); + add_parallel_copy_to_end_of_block(shader, block, state.dead_ctx); } nir_foreach_block(block, impl) { - isolate_phi_nodes_block(block, state.dead_ctx); + isolate_phi_nodes_block(shader, block, state.dead_ctx); } /* Mark metadata as dirty before we ask for liveness analysis */ @@ -908,6 +913,7 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only) nir_metadata_dominance); /* Clean up dead instructions and the hash tables */ + nir_instr_free_list(&state.dead_instrs); _mesa_hash_table_destroy(state.merge_node_table, NULL); ralloc_free(state.dead_ctx); return state.progress; diff --git a/mesa 3D driver/src/compiler/nir/nir_gather_info.c b/mesa 3D driver/src/compiler/nir/nir_gather_info.c index e4f858f46d..4da3a11be0 100644 --- a/mesa 3D driver/src/compiler/nir/nir_gather_info.c +++ b/mesa 3D driver/src/compiler/nir/nir_gather_info.c @@ -533,6 +533,7 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader, case nir_intrinsic_load_output: case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: if (shader->info.stage == MESA_SHADER_TESS_CTRL && instr->intrinsic == nir_intrinsic_load_output) { 
shader->info.patch_outputs_read |= slot_mask; @@ -559,6 +560,7 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader, case nir_intrinsic_store_output: case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_primitive_output: if (shader->info.stage == MESA_SHADER_TESS_CTRL && instr->intrinsic == nir_intrinsic_store_output) { shader->info.patch_outputs_written |= slot_mask; @@ -832,7 +834,7 @@ nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint) shader->info.bit_sizes_float = 0; shader->info.bit_sizes_int = 0; - nir_foreach_uniform_variable(var, shader) { + nir_foreach_variable_with_modes(var, shader, nir_var_image | nir_var_uniform) { /* Bindless textures and images don't use non-bindless slots. * Interface blocks imply inputs, outputs, UBO, or SSBO, which can only * mean bindless. @@ -910,4 +912,27 @@ nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint) */ shader->info.fs.uses_sample_shading = true; } + + shader->info.per_primitive_outputs = 0; + if (shader->info.stage == MESA_SHADER_MESH) { + nir_foreach_shader_out_variable(var, shader) { + if (var->data.per_primitive) { + assert(nir_is_arrayed_io(var, shader->info.stage)); + const unsigned slots = + glsl_count_attribute_slots(glsl_get_array_element(var->type), false); + shader->info.per_primitive_outputs |= BITFIELD64_RANGE(var->data.location, slots); + } + } + } + + shader->info.per_primitive_inputs = 0; + if (shader->info.stage == MESA_SHADER_FRAGMENT) { + nir_foreach_shader_in_variable(var, shader) { + if (var->data.per_primitive) { + const unsigned slots = + glsl_count_attribute_slots(var->type, false); + shader->info.per_primitive_inputs |= BITFIELD64_RANGE(var->data.location, slots); + } + } + } } diff --git a/mesa 3D driver/src/compiler/nir/nir_intrinsics.py b/mesa 3D driver/src/compiler/nir/nir_intrinsics.py index f3f62ad452..00a72d6308 100644 --- a/mesa 3D driver/src/compiler/nir/nir_intrinsics.py +++ b/mesa 3D driver/src/compiler/nir/nir_intrinsics.py @@ -921,6 +921,8 @@ load("ssbo_address", [1], [], [CAN_ELIMINATE, CAN_REORDER]) load("output", [1], [BASE, COMPONENT, DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE]) # src[] = { vertex, offset }. load("per_vertex_output", [1, 1], [BASE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE]) +# src[] = { primitive, offset }. +load("per_primitive_output", [1, 1], [BASE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE]) # src[] = { offset }. load("shared", [1], [BASE, ALIGN_MUL, ALIGN_OFFSET], [CAN_ELIMINATE]) # src[] = { offset }. @@ -956,6 +958,8 @@ def store(name, srcs, indices=[], flags=[]): store("output", [1], [BASE, WRITE_MASK, COMPONENT, SRC_TYPE, IO_SEMANTICS]) # src[] = { value, vertex, offset }. store("per_vertex_output", [1, 1], [BASE, WRITE_MASK, COMPONENT, SRC_TYPE, IO_SEMANTICS]) +# src[] = { value, primitive, offset }. +store("per_primitive_output", [1, 1], [BASE, WRITE_MASK, COMPONENT, SRC_TYPE, IO_SEMANTICS]) # src[] = { value, block_index, offset } store("ssbo", [-1, 1], [WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) # src[] = { value, offset }. 
@@ -1075,7 +1079,7 @@ load("scratch_dxil", [1], [], [CAN_ELIMINATE]) # src[] = { deref_var, offset } load("ptr_dxil", [1, 1], [], []) # src[] = { index, 16-byte-based-offset } -load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE]) +load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE, CAN_REORDER]) # DXIL Shared atomic intrinsics # @@ -1210,8 +1214,9 @@ intrinsic("overwrite_vs_arguments_amd", src_comp=[1, 1], indices=[]) # Overwrites TES input registers, for use with vertex compaction after culling. src = {tes_u, tes_v, rel_patch_id, patch_id}. intrinsic("overwrite_tes_arguments_amd", src_comp=[1, 1, 1, 1], indices=[]) -# src = [index] BINDING = which table BASE = offset within handle -intrinsic("load_sbt_amd", src_comp=[-1], dest_comp=0, indices=[BINDING, BASE], +# Loads a descriptor for an SBT. +# src = [index] BINDING = which table +intrinsic("load_sbt_amd", dest_comp=4, bit_sizes=[32], indices=[BINDING], flags=[CAN_ELIMINATE, CAN_REORDER]) # 1. HW descriptor # 2. ray origin # 3. ray TMin # 4. ray direction # 5. ray TMax # 6. inverse ray direction (componentwise 1.0/ray direction) intrinsic("bvh64_intersect_ray_amd", [4, 2, 1, 3, 3, 3], 4, flags=[CAN_ELIMINATE, CAN_REORDER]) +# Return of a callable in raytracing pipelines +intrinsic("rt_return_amd") + +# Offset into scratch for the input callable data in a raytracing pipeline. +system_value("rt_arg_scratch_offset_amd", 1) + +# Whether to call the anyhit shader for an intersection in an intersection shader. +system_value("intersection_opaque_amd", 1, bit_sizes=[1]) + # V3D-specific intrinsic for tile buffer color reads. # # The hardware requires that we read the samples and components of a pixel diff --git a/mesa 3D driver/src/compiler/nir/nir_linking_helpers.c b/mesa 3D driver/src/compiler/nir/nir_linking_helpers.c index be4144ade8..bc1ee4dd54 100644 --- a/mesa 3D driver/src/compiler/nir/nir_linking_helpers.c +++ b/mesa 3D driver/src/compiler/nir/nir_linking_helpers.c @@ -58,6 +58,15 @@ get_variable_io_mask(nir_variable *var, gl_shader_stage stage) return ((1ull << slots) - 1) << location; } +static bool +is_non_generic_patch_var(nir_variable *var) +{ + return var->data.location == VARYING_SLOT_TESS_LEVEL_INNER || + var->data.location == VARYING_SLOT_TESS_LEVEL_OUTER || + var->data.location == VARYING_SLOT_BOUNDING_BOX0 || + var->data.location == VARYING_SLOT_BOUNDING_BOX1; +} + static uint8_t get_num_components(nir_variable *var) { @@ -90,6 +99,9 @@ tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read) nir_variable *var = nir_deref_instr_get_variable(deref); for (unsigned i = 0; i < get_num_components(var); i++) { if (var->data.patch) { + if (is_non_generic_patch_var(var)) + continue; + patches_read[var->data.location_frac + i] |= get_variable_io_mask(var, shader->info.stage); } else { @@ -172,6 +184,9 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer) nir_foreach_shader_out_variable(var, producer) { for (unsigned i = 0; i < get_num_components(var); i++) { if (var->data.patch) { + if (is_non_generic_patch_var(var)) + continue; + patches_written[var->data.location_frac + i] |= get_variable_io_mask(var, producer->info.stage); } else { @@ -184,6 +199,9 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer) nir_foreach_shader_in_variable(var, consumer) { for (unsigned i = 0; i < get_num_components(var); i++) { if (var->data.patch) { + if (is_non_generic_patch_var(var)) + continue; + patches_read[var->data.location_frac + i] |=
get_variable_io_mask(var, consumer->info.stage); } else { @@ -670,12 +688,60 @@ gather_varying_component_info(nir_shader *producer, nir_shader *consumer, } } +static bool +allow_pack_interp_type(nir_pack_varying_options options, int type) +{ + int sel; + + switch (type) { + case INTERP_MODE_NONE: + sel = nir_pack_varying_interp_mode_none; + break; + case INTERP_MODE_SMOOTH: + sel = nir_pack_varying_interp_mode_smooth; + break; + case INTERP_MODE_FLAT: + sel = nir_pack_varying_interp_mode_flat; + break; + case INTERP_MODE_NOPERSPECTIVE: + sel = nir_pack_varying_interp_mode_noperspective; + break; + default: + return false; + } + + return options & sel; +} + +static bool +allow_pack_interp_loc(nir_pack_varying_options options, int loc) +{ + int sel; + + switch (loc) { + case INTERPOLATE_LOC_SAMPLE: + sel = nir_pack_varying_interp_loc_sample; + break; + case INTERPOLATE_LOC_CENTROID: + sel = nir_pack_varying_interp_loc_centroid; + break; + case INTERPOLATE_LOC_CENTER: + sel = nir_pack_varying_interp_loc_center; + break; + default: + return false; + } + + return options & sel; +} + static void assign_remap_locations(struct varying_loc (*remap)[4], struct assigned_comps *assigned_comps, struct varying_component *info, unsigned *cursor, unsigned *comp, - unsigned max_location) + unsigned max_location, + nir_pack_varying_options options) { unsigned tmp_cursor = *cursor; unsigned tmp_comp = *comp; @@ -683,21 +749,28 @@ assign_remap_locations(struct varying_loc (*remap)[4], for (; tmp_cursor < max_location; tmp_cursor++) { if (assigned_comps[tmp_cursor].comps) { - /* We can only pack varyings with matching interpolation types, - * interpolation loc must match also. - * TODO: i965 can handle interpolation locations that don't match, - * but the radeonsi nir backend handles everything as vec4s and so - * expects this to be the same for all components. We could make this - * check driver specfific or drop it if NIR ever become the only - * radeonsi backend. - * TODO2: The radeonsi comment above is not true. Only "flat" is per - * vec4 (128-bit granularity), all other interpolation qualifiers are - * per component (16-bit granularity for float16, 32-bit granularity - * otherwise). Each vec4 (128 bits) must be either vec4 or f16vec8. + /* We can only pack varyings with matching precision. */ + if (assigned_comps[tmp_cursor].is_mediump != info->is_mediump) { + tmp_comp = 0; + continue; + } + + /* We can only pack varyings with matching interpolation types + * unless the driver allows packing mismatched types. */ - if (assigned_comps[tmp_cursor].interp_type != info->interp_type || - assigned_comps[tmp_cursor].interp_loc != info->interp_loc || - assigned_comps[tmp_cursor].is_mediump != info->is_mediump) { + if (assigned_comps[tmp_cursor].interp_type != info->interp_type && + (!allow_pack_interp_type(options, assigned_comps[tmp_cursor].interp_type) || + !allow_pack_interp_type(options, info->interp_type))) { + tmp_comp = 0; + continue; + } + + /* We can only pack varyings with matching interpolation locations + * unless the driver allows packing mismatched locations.
+ */ + if (assigned_comps[tmp_cursor].interp_loc != info->interp_loc && + (!allow_pack_interp_loc(options, assigned_comps[tmp_cursor].interp_loc) || + !allow_pack_interp_loc(options, info->interp_loc))) { tmp_comp = 0; continue; } @@ -764,6 +837,8 @@ compact_components(nir_shader *producer, nir_shader *consumer, qsort(varying_comp_info, varying_comp_info_size, sizeof(struct varying_component), cmp_varying_component); + nir_pack_varying_options options = consumer->options->pack_varying_options; + unsigned cursor = 0; unsigned comp = 0; @@ -783,10 +858,12 @@ compact_components(nir_shader *producer, nir_shader *consumer, } assign_remap_locations(remap, assigned_comps, info, - &cursor, &comp, MAX_VARYINGS_INCL_PATCH); + &cursor, &comp, MAX_VARYINGS_INCL_PATCH, + options); } else { assign_remap_locations(remap, assigned_comps, info, - &cursor, &comp, MAX_VARYING); + &cursor, &comp, MAX_VARYING, + options); /* Check if we failed to assign a remap location. This can happen if * for example there are a bunch of unmovable components with @@ -799,7 +876,8 @@ compact_components(nir_shader *producer, nir_shader *consumer, cursor = 0; comp = 0; assign_remap_locations(remap, assigned_comps, info, - &cursor, &comp, MAX_VARYING); + &cursor, &comp, MAX_VARYING, + options); } } } @@ -927,7 +1005,8 @@ can_replace_varying(nir_variable *out_var) } static bool -replace_constant_input(nir_shader *shader, nir_intrinsic_instr *store_intr) +replace_varying_input_by_constant_load(nir_shader *shader, + nir_intrinsic_instr *store_intr) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); @@ -1022,6 +1101,156 @@ replace_duplicate_input(nir_shader *shader, nir_variable *input_var, return progress; } +static bool +is_direct_uniform_load(nir_ssa_def *def, nir_ssa_scalar *s) +{ + /* def is known to be scalar, as can_replace_varying() filters out the vector case. */ + assert(def->num_components == 1); + + /* The uniform load may hide behind a move instruction that converts + * the vector to a scalar: + * + * vec1 32 ssa_1 = deref_var &color (uniform vec3) + * vec3 32 ssa_2 = intrinsic load_deref (ssa_1) (0) + * vec1 32 ssa_3 = mov ssa_2.x + * vec1 32 ssa_4 = deref_var &color_out (shader_out float) + * intrinsic store_deref (ssa_4, ssa_3) (1, 0) + */ + *s = nir_ssa_scalar_resolved(def, 0); + + nir_ssa_def *ssa = s->def; + if (ssa->parent_instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(ssa->parent_instr); + if (intr->intrinsic != nir_intrinsic_load_deref) + return false; + + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + /* TODO: support nir_var_mem_ubo. */ + if (!nir_deref_mode_is(deref, nir_var_uniform)) + return false; + + /* Indirect uniform loads are not supported. */ + return !nir_deref_instr_has_indirect(deref); +} + +static nir_variable * +get_uniform_var_in_consumer(nir_shader *consumer, + nir_variable *var_in_producer) +{ + /* Check whether the uniform already exists in the consumer. */ + nir_variable *new_var = NULL; + nir_foreach_uniform_variable(v, consumer) { + if (!strcmp(var_in_producer->name, v->name)) { + new_var = v; + break; + } + } + + /* Create the variable if it does not already exist.
*/ + if (!new_var) { + new_var = nir_variable_clone(var_in_producer, consumer); + nir_shader_add_variable(consumer, new_var); + } + + return new_var; +} + +static nir_deref_instr * +clone_deref_instr(nir_builder *b, nir_variable *var, nir_deref_instr *deref) +{ + if (deref->deref_type == nir_deref_type_var) + return nir_build_deref_var(b, var); + + nir_deref_instr *parent_deref = nir_deref_instr_parent(deref); + nir_deref_instr *parent = clone_deref_instr(b, var, parent_deref); + + /* Build the array and struct deref instructions. + * The "deref" instr is known to be direct (see is_direct_uniform_load()). + */ + switch (deref->deref_type) { + case nir_deref_type_array: { + nir_load_const_instr *index = + nir_instr_as_load_const(deref->arr.index.ssa->parent_instr); + return nir_build_deref_array_imm(b, parent, index->value->i64); + } + case nir_deref_type_ptr_as_array: { + nir_load_const_instr *index = + nir_instr_as_load_const(deref->arr.index.ssa->parent_instr); + nir_ssa_def *ssa = nir_imm_intN_t(b, index->value->i64, + parent->dest.ssa.bit_size); + return nir_build_deref_ptr_as_array(b, parent, ssa); + } + case nir_deref_type_struct: + return nir_build_deref_struct(b, parent, deref->strct.index); + default: + unreachable("invalid type"); + return NULL; + } +} + +static bool +replace_varying_input_by_uniform_load(nir_shader *shader, + nir_intrinsic_instr *store_intr, + nir_ssa_scalar *scalar) +{ + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + + nir_builder b; + nir_builder_init(&b, impl); + + nir_variable *out_var = + nir_deref_instr_get_variable(nir_src_as_deref(store_intr->src[0])); + + nir_intrinsic_instr *load = nir_instr_as_intrinsic(scalar->def->parent_instr); + nir_deref_instr *deref = nir_src_as_deref(load->src[0]); + nir_variable *uni_var = nir_deref_instr_get_variable(deref); + uni_var = get_uniform_var_in_consumer(shader, uni_var); + + bool progress = false; + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_deref) + continue; + + nir_deref_instr *in_deref = nir_src_as_deref(intr->src[0]); + if (!nir_deref_mode_is(in_deref, nir_var_shader_in)) + continue; + + nir_variable *in_var = nir_deref_instr_get_variable(in_deref); + + if (!does_varying_match(out_var, in_var)) + continue; + + b.cursor = nir_before_instr(instr); + + /* Clone the deref chain, from the variable deref down to the deref used by the load. */ + nir_deref_instr *uni_deref = clone_deref_instr(&b, uni_var, deref); + nir_ssa_def *uni_def = nir_load_deref(&b, uni_deref); + + /* Add a vector-to-scalar move if the uniform is a vector. */ + if (uni_def->num_components > 1) { + nir_alu_src src = {0}; + src.src = nir_src_for_ssa(uni_def); + src.swizzle[0] = scalar->comp; + uni_def = nir_mov_alu(&b, src, 1); + } + + /* Replace the input load with a uniform load.
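 *
 * Editor's illustration (hypothetical GLSL, not from the patch) of what
 * replace_varying_input_by_uniform_load() achieves at link time:
 *
 *    // producer (VS)            // consumer (FS), after the pass
 *    uniform vec3 color;         uniform vec3 color;   // cloned in
 *    out float c;                in float c;           // now unused
 *    void main() {               void main() {
 *       c = color.x;                out_color = vec4(color.x);
 *    }                           }
 *
 * The dead varying can then be dropped by nir_remove_unused_varyings().
 *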
*/ + nir_ssa_def_rewrite_uses(&intr->dest.ssa, uni_def); + + progress = true; + } + } + + return progress; +} + /* The GLSL ES 3.20 spec says: * * "The precision of a vertex output does not need to match the precision of @@ -1123,11 +1352,16 @@ nir_link_opt_varyings(nir_shader *producer, nir_shader *consumer) if (!can_replace_varying(out_var)) continue; - if (intr->src[1].ssa->parent_instr->type == nir_instr_type_load_const) { - progress |= replace_constant_input(consumer, intr); + nir_ssa_scalar uni_scalar; + nir_ssa_def *ssa = intr->src[1].ssa; + if (ssa->parent_instr->type == nir_instr_type_load_const) { + progress |= replace_varying_input_by_constant_load(consumer, intr); + } else if (is_direct_uniform_load(ssa, &uni_scalar)) { + progress |= replace_varying_input_by_uniform_load(consumer, intr, + &uni_scalar); } else { struct hash_entry *entry = - _mesa_hash_table_search(varying_values, intr->src[1].ssa); + _mesa_hash_table_search(varying_values, ssa); if (entry) { progress |= replace_duplicate_input(consumer, (nir_variable *) entry->data, @@ -1135,8 +1369,7 @@ nir_link_opt_varyings(nir_shader *producer, nir_shader *consumer) } else { nir_variable *in_var = get_matching_input_var(consumer, out_var); if (in_var) { - _mesa_hash_table_insert(varying_values, intr->src[1].ssa, - in_var); + _mesa_hash_table_insert(varying_values, ssa, in_var); } } } diff --git a/mesa 3D driver/src/compiler/nir/nir_loop_analyze.c b/mesa 3D driver/src/compiler/nir/nir_loop_analyze.c index 65291a70a9..2c6c179082 100644 --- a/mesa 3D driver/src/compiler/nir/nir_loop_analyze.c +++ b/mesa 3D driver/src/compiler/nir/nir_loop_analyze.c @@ -151,6 +151,14 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options) nir_alu_instr *alu = nir_instr_as_alu(instr); const nir_op_info *info = &nir_op_infos[alu->op]; + unsigned cost = 1; + + if (alu->op == nir_op_flrp) { + if ((options->lower_flrp16 && nir_dest_bit_size(alu->dest.dest) == 16) || + (options->lower_flrp32 && nir_dest_bit_size(alu->dest.dest) == 32) || + (options->lower_flrp64 && nir_dest_bit_size(alu->dest.dest) == 64)) + cost *= 3; + } /* Assume everything 16 or 32-bit is cheap. * @@ -159,7 +167,7 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options) */ if (nir_dest_bit_size(alu->dest.dest) < 64 && nir_src_bit_size(alu->src[0].src) < 64) - return 1; + return cost; bool is_fp64 = nir_dest_bit_size(alu->dest.dest) == 64 && nir_alu_type_get_base_type(info->output_type) == nir_type_float; @@ -171,7 +179,6 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options) if (is_fp64) { /* If it's something lowered normally, it's expensive. 
*/ - unsigned cost = 1; if (options->lower_doubles_options & nir_lower_doubles_op_to_options_mask(alu->op)) cost *= 20; @@ -188,13 +195,13 @@ instr_cost(nir_instr *instr, const nir_shader_compiler_options *options) if (alu->op == nir_op_idiv || alu->op == nir_op_udiv || alu->op == nir_op_imod || alu->op == nir_op_umod || alu->op == nir_op_irem) - return 100; + return cost * 100; /* Other int64 lowering isn't usually all that expensive */ - return 5; + return cost * 5; } - return 1; + return cost; } } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_alu_to_scalar.c b/mesa 3D driver/src/compiler/nir/nir_lower_alu_to_scalar.c index 1203e53a33..a769f0dfd2 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_alu_to_scalar.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_alu_to_scalar.c @@ -71,11 +71,11 @@ lower_reduction(nir_alu_instr *alu, nir_op chan_op, nir_op merge_op, for (int i = num_components - 1; i >= 0; i--) { nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op); nir_alu_ssa_dest_init(chan, 1, alu->dest.dest.ssa.bit_size); - nir_alu_src_copy(&chan->src[0], &alu->src[0], chan); + nir_alu_src_copy(&chan->src[0], &alu->src[0]); chan->src[0].swizzle[0] = chan->src[0].swizzle[i]; if (nir_op_infos[chan_op].num_inputs > 1) { assert(nir_op_infos[chan_op].num_inputs == 2); - nir_alu_src_copy(&chan->src[1], &alu->src[1], chan); + nir_alu_src_copy(&chan->src[1], &alu->src[1]); chan->src[1].swizzle[0] = chan->src[1].swizzle[i]; } chan->exact = alu->exact; @@ -124,7 +124,7 @@ lower_fdot(nir_alu_instr *alu, nir_builder *builder) builder->shader, prev ? nir_op_ffma : nir_op_fmul); nir_alu_ssa_dest_init(instr, 1, alu->dest.dest.ssa.bit_size); for (unsigned j = 0; j < 2; j++) { - nir_alu_src_copy(&instr->src[j], &alu->src[j], instr); + nir_alu_src_copy(&instr->src[j], &alu->src[j]); instr->src[j].swizzle[0] = alu->src[j].swizzle[i]; } if (i != num_components - 1) @@ -336,7 +336,7 @@ lower_alu_instr_scalar(nir_builder *b, nir_instr *instr, void *_data) unsigned src_chan = (nir_op_infos[alu->op].input_sizes[i] == 1 ? 0 : chan); - nir_alu_src_copy(&lower->src[i], &alu->src[i], lower); + nir_alu_src_copy(&lower->src[i], &alu->src[i]); for (int j = 0; j < NIR_MAX_VEC_COMPONENTS; j++) lower->src[i].swizzle[j] = alu->dest.write_mask & (1 << chan) ? alu->src[i].swizzle[src_chan] : 0; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_amul.c b/mesa 3D driver/src/compiler/nir/nir_lower_amul.c index 7b2108cbc7..cbea520397 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_amul.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_amul.c @@ -65,6 +65,8 @@ typedef struct { bool has_large_ssbo; unsigned max_slot; + + bool progress; } lower_state; /* Lower 'amul's in offset src of large variables to 'imul': */ @@ -83,19 +85,19 @@ lower_large_src(nir_src *src, void *s) if (parent->pass_flags) return false; - bool progress = nir_foreach_src(parent, lower_large_src, state); + nir_foreach_src(parent, lower_large_src, state); if (parent->type == nir_instr_type_alu) { nir_alu_instr *alu = nir_instr_as_alu(parent); if (alu->op == nir_op_amul) { alu->op = nir_op_imul; - progress = true; + state->progress = true; } } parent->pass_flags = 1; - return progress; + return true; } static bool @@ -118,27 +120,27 @@ large_ssbo(lower_state *state, nir_src src) return state->large_ssbos[idx]; } -static bool +static void lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr) { switch (intr->intrinsic) { case nir_intrinsic_load_ubo: //# src[] = { buffer_index, offset }. 
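/* Editor's note (assumed summary; mirrors the case handling below and the
 * 64-bit guard later in this file): an 'amul' is a multiply feeding an
 * address computation, and it resolves roughly as
 *
 *    if (variable may need >24-bit offsets)   amul -> imul
 *    else if (nir_dest_bit_size(...) <= 32)   amul -> imul24
 *    else                                     amul -> imul   // no 64-bit imul24
 */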
if (large_ubo(state, intr->src[0])) - return lower_large_src(&intr->src[1], state); - return false; + lower_large_src(&intr->src[1], state); + return; case nir_intrinsic_load_ssbo: //# src[] = { buffer_index, offset }. if (large_ssbo(state, intr->src[0])) - return lower_large_src(&intr->src[1], state); - return false; + lower_large_src(&intr->src[1], state); + return; case nir_intrinsic_store_ssbo: //# src[] = { value, block_index, offset } if (large_ssbo(state, intr->src[1])) - return lower_large_src(&intr->src[2], state); - return false; + lower_large_src(&intr->src[2], state); + return; case nir_intrinsic_ssbo_atomic_add: case nir_intrinsic_ssbo_atomic_imin: @@ -158,8 +160,8 @@ lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr) * 1: offset */ if (large_ssbo(state, intr->src[0])) - return lower_large_src(&intr->src[1], state); - return false; + lower_large_src(&intr->src[1], state); + return; case nir_intrinsic_global_atomic_add: case nir_intrinsic_global_atomic_imin: @@ -175,8 +177,16 @@ lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr) case nir_intrinsic_global_atomic_fmin: case nir_intrinsic_global_atomic_fmax: case nir_intrinsic_global_atomic_fcomp_swap: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_global: /* just assume that 24b is not sufficient: */ - return lower_large_src(&intr->src[0], state); + lower_large_src(&intr->src[0], state); + return; + + case nir_intrinsic_store_global: + /* just assume that 24b is not sufficient: */ + lower_large_src(&intr->src[1], state); + return; /* These should all be small enough to unconditionally use imul24: */ case nir_intrinsic_shared_atomic_add: @@ -198,20 +208,16 @@ lower_intrinsic(lower_state *state, nir_intrinsic_instr *intr) case nir_intrinsic_load_output: case nir_intrinsic_store_output: default: - return false; + return; } } -static bool +static void lower_instr(lower_state *state, nir_instr *instr) { - bool progress = false; - if (instr->type == nir_instr_type_intrinsic) { - progress |= lower_intrinsic(state, nir_instr_as_intrinsic(instr)); + lower_intrinsic(state, nir_instr_as_intrinsic(instr)); } - - return progress; } static bool @@ -278,7 +284,6 @@ nir_lower_amul(nir_shader *shader, } } - bool progress = false; nir_foreach_function(function, shader) { nir_function_impl *impl = function->impl; @@ -287,7 +292,7 @@ nir_lower_amul(nir_shader *shader, nir_foreach_block(block, impl) { nir_foreach_instr(instr, block) { - progress |= lower_instr(&state, instr); + lower_instr(&state, instr); } } } @@ -295,6 +300,9 @@ nir_lower_amul(nir_shader *shader, /* At this point, all 'amul's used in calculating an offset into * a large variable have been replaced with 'imul'.
So remaining * 'amul's can be replaced with 'imul24': + * + * Note the exception for 64b (such as load/store_global where + * address size is 64b) as imul24 cannot have 64b bitsize */ nir_foreach_function(function, shader) { nir_function_impl *impl = function->impl; @@ -311,8 +319,12 @@ nir_lower_amul(nir_shader *shader, if (alu->op != nir_op_amul) continue; - alu->op = nir_op_imul24; - progress |= true; + if (nir_dest_bit_size(alu->dest.dest) <= 32) + alu->op = nir_op_imul24; + else + alu->op = nir_op_imul; + + state.progress |= true; } } @@ -321,5 +333,5 @@ nir_lower_amul(nir_shader *shader, } - return progress; + return state.progress; } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_atomics_to_ssbo.c b/mesa 3D driver/src/compiler/nir/nir_lower_atomics_to_ssbo.c index 99b6612e00..c1799d0b9d 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_atomics_to_ssbo.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_atomics_to_ssbo.c @@ -89,7 +89,7 @@ lower_instr(nir_intrinsic_instr *instr, unsigned ssbo_offset, nir_builder *b) nir_ssa_def *buffer = nir_imm_int(b, ssbo_offset + nir_intrinsic_base(instr)); nir_ssa_def *temp = NULL; nir_intrinsic_instr *new_instr = - nir_intrinsic_instr_create(ralloc_parent(instr), op); + nir_intrinsic_instr_create(b->shader, op); /* a couple instructions need special handling since they don't map * 1:1 with ssbo atomics @@ -99,7 +99,7 @@ lower_instr(nir_intrinsic_instr *instr, unsigned ssbo_offset, nir_builder *b) /* remapped to ssbo_atomic_add: { buffer_idx, offset, +1 } */ temp = nir_imm_int(b, +1); new_instr->src[0] = nir_src_for_ssa(buffer); - nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr); + nir_src_copy(&new_instr->src[1], &instr->src[0]); new_instr->src[2] = nir_src_for_ssa(temp); break; case nir_intrinsic_atomic_counter_pre_dec: @@ -108,22 +108,22 @@ lower_instr(nir_intrinsic_instr *instr, unsigned ssbo_offset, nir_builder *b) /* NOTE semantic difference so we adjust the return value below */ temp = nir_imm_int(b, -1); new_instr->src[0] = nir_src_for_ssa(buffer); - nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr); + nir_src_copy(&new_instr->src[1], &instr->src[0]); new_instr->src[2] = nir_src_for_ssa(temp); break; case nir_intrinsic_atomic_counter_read: /* remapped to load_ssbo: { buffer_idx, offset } */ new_instr->src[0] = nir_src_for_ssa(buffer); - nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr); + nir_src_copy(&new_instr->src[1], &instr->src[0]); break; default: /* remapped to ssbo_atomic_x: { buffer_idx, offset, data, (compare)? 
} */ new_instr->src[0] = nir_src_for_ssa(buffer); - nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr); - nir_src_copy(&new_instr->src[2], &instr->src[1], new_instr); + nir_src_copy(&new_instr->src[1], &instr->src[0]); + nir_src_copy(&new_instr->src[2], &instr->src[1]); if (op == nir_intrinsic_ssbo_atomic_comp_swap || op == nir_intrinsic_ssbo_atomic_fcomp_swap) - nir_src_copy(&new_instr->src[3], &instr->src[2], new_instr); + nir_src_copy(&new_instr->src[3], &instr->src[2]); break; } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_bit_size.c b/mesa 3D driver/src/compiler/nir/nir_lower_bit_size.c index e31d9c8600..4c76d8d56c 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_bit_size.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_bit_size.c @@ -38,7 +38,7 @@ static nir_ssa_def *convert_to_bit_size(nir_builder *bld, nir_ssa_def *src, if ((type & (nir_type_uint | nir_type_int)) && bit_size == 32 && alu && (alu->op == nir_op_b2i8 || alu->op == nir_op_b2i16)) { nir_alu_instr *instr = nir_alu_instr_create(bld->shader, nir_op_b2i32); - nir_alu_src_copy(&instr->src[0], &alu->src[0], instr); + nir_alu_src_copy(&instr->src[0], &alu->src[0]); return nir_builder_alu_instr_finish_and_insert(bld, instr); } @@ -74,13 +74,30 @@ lower_alu_instr(nir_builder *bld, nir_alu_instr *alu, unsigned bit_size) nir_ssa_def *lowered_dst = NULL; if (op == nir_op_imul_high || op == nir_op_umul_high) { assert(dst_bit_size * 2 <= bit_size); - nir_ssa_def *lowered_dst = nir_imul(bld, srcs[0], srcs[1]); + lowered_dst = nir_imul(bld, srcs[0], srcs[1]); if (nir_op_infos[op].output_type & nir_type_uint) lowered_dst = nir_ushr_imm(bld, lowered_dst, dst_bit_size); else lowered_dst = nir_ishr_imm(bld, lowered_dst, dst_bit_size); } else { lowered_dst = nir_build_alu_src_arr(bld, op, srcs); + + /* The add_sat and sub_sat instructions need to clamp the result to the + * range of the original type. + */ + if (op == nir_op_iadd_sat || op == nir_op_isub_sat) { + const int64_t int_max = u_intN_max(dst_bit_size); + const int64_t int_min = u_intN_min(dst_bit_size); + + lowered_dst = nir_iclamp(bld, lowered_dst, + nir_imm_intN_t(bld, int_min, bit_size), + nir_imm_intN_t(bld, int_max, bit_size)); + } else if (op == nir_op_uadd_sat || op == nir_op_usub_sat) { + const uint64_t uint_max = u_uintN_max(dst_bit_size); + + lowered_dst = nir_umin(bld, lowered_dst, + nir_imm_intN_t(bld, uint_max, bit_size)); + } } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_blend.c b/mesa 3D driver/src/compiler/nir/nir_lower_blend.c index 0524533724..5e6a04f0ec 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_blend.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_blend.c @@ -344,10 +344,16 @@ nir_lower_blend_instr(nir_builder *b, nir_instr *instr, void *data) (var->data.location == FRAG_RESULT_COLOR) ? 
0 : (var->data.location - FRAG_RESULT_DATA0); + /* No blend lowering requested on this RT */ + if (options->format[rt] == PIPE_FORMAT_NONE) + return false; + b->cursor = nir_before_instr(instr); /* Grab the input color */ - nir_ssa_def *src = nir_ssa_for_src(b, intr->src[1], 4); + unsigned src_num_comps = nir_src_num_components(intr->src[1]); + nir_ssa_def *src = + nir_pad_vector(b, nir_ssa_for_src(b, intr->src[1], src_num_comps), 4); /* Grab the previous fragment color */ var->data.fb_fetch_output = true; @@ -358,14 +364,19 @@ nir_lower_blend_instr(nir_builder *b, nir_instr *instr, void *data) /* Blend the two colors per the passed options */ nir_ssa_def *blended = src; - if (options->logicop_enable) + if (options->logicop_enable) { blended = nir_blend_logicop(b, *options, rt, src, dst); - else if (!util_format_is_pure_integer(options->format[rt])) + } else if (!util_format_is_pure_integer(options->format[rt])) { + assert(!util_format_is_scaled(options->format[rt])); blended = nir_blend(b, *options, rt, src, options->src1, dst); + } /* Apply a colormask */ blended = nir_color_mask(b, options->rt[rt].colormask, blended, dst); + if (src_num_comps != 4) + blended = nir_channels(b, blended, BITFIELD_MASK(src_num_comps)); + /* Write out the final color instead of the input */ nir_instr_rewrite_src_ssa(instr, &intr->src[1], blended); return true; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_discard_or_demote.c b/mesa 3D driver/src/compiler/nir/nir_lower_discard_or_demote.c index 5857dcaa76..c3c3c3e17e 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_discard_or_demote.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_discard_or_demote.c @@ -164,7 +164,10 @@ nir_lower_discard_or_demote(nir_shader *shader, */ progress = nir_shader_instructions_pass(shader, nir_lower_discard_to_demote_instr, - nir_metadata_all, + nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs | + nir_metadata_instr_index, NULL); shader->info.fs.uses_demote = true; } else if (!shader->info.fs.needs_quad_helper_invocations && @@ -173,7 +176,8 @@ nir_lower_discard_or_demote(nir_shader *shader, /* If we don't need any helper invocations, convert demote to discard. 
*/ progress = nir_shader_instructions_pass(shader, nir_lower_demote_to_discard_instr, - nir_metadata_all, + nir_metadata_block_index | + nir_metadata_dominance, NULL); shader->info.fs.uses_demote = false; } else if (shader->info.fs.uses_demote && @@ -184,7 +188,8 @@ nir_lower_discard_or_demote(nir_shader *shader, nir_ssa_def *is_helper = NULL; progress = nir_shader_instructions_pass(shader, nir_lower_load_helper_to_is_helper, - nir_metadata_all, + nir_metadata_block_index | + nir_metadata_dominance, &is_helper); BITSET_CLEAR(shader->info.system_values_read, nir_system_value_from_intrinsic(nir_intrinsic_load_helper_invocation)); diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_flrp.c b/mesa 3D driver/src/compiler/nir/nir_lower_flrp.c index e23c3c189a..9c13619462 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_flrp.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_flrp.c @@ -635,7 +635,7 @@ nir_lower_flrp(nir_shader *shader, { struct u_vector dead_flrp; - if (!u_vector_init(&dead_flrp, sizeof(struct nir_alu_instr *), 64)) + if (!u_vector_init_pow2(&dead_flrp, 8, sizeof(struct nir_alu_instr *))) return false; nir_foreach_function(function, shader) { diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_gs_intrinsics.c b/mesa 3D driver/src/compiler/nir/nir_lower_gs_intrinsics.c index 33fff69ae7..171b37e6eb 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_gs_intrinsics.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_gs_intrinsics.c @@ -246,9 +246,11 @@ rewrite_intrinsics(nir_block *block, struct state *state) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_emit_vertex: + case nir_intrinsic_emit_vertex_with_counter: rewrite_emit_vertex(intrin, state); break; case nir_intrinsic_end_primitive: + case nir_intrinsic_end_primitive_with_counter: rewrite_end_primitive(intrin, state); break; default: @@ -302,10 +304,53 @@ append_set_vertex_and_primitive_count(nir_block *end_block, struct state *state) } nir_set_vertex_and_primitive_count(b, vtx_cnt, prim_cnt, stream); + state->progress = true; } } } +/** + * Check to see if there are any blocks that need set_vertex_and_primitive_count + * + * If every block that could need the set_vertex_and_primitive_count intrinsic + * already has one, there is nothing for this pass to do. + */ +static bool +a_block_needs_set_vertex_and_primitive_count(nir_block *end_block, bool per_stream) +{ + set_foreach(end_block->predecessors, entry) { + nir_block *pred = (nir_block *) entry->key; + + + for (unsigned stream = 0; stream < NIR_MAX_XFB_STREAMS; ++stream) { + /* When it's not per-stream, we only need to write one variable. 
*/ + if (!per_stream && stream != 0) + continue; + + bool found = false; + + nir_foreach_instr_reverse(instr, pred) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + const nir_intrinsic_instr *const intrin = + nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count && + intrin->const_index[0] == stream) { + found = true; + break; + } + } + + if (!found) + return true; + } + } + + return false; +} + bool nir_lower_gs_intrinsics(nir_shader *shader, nir_lower_gs_intrinsics_flags options) { @@ -326,6 +371,9 @@ nir_lower_gs_intrinsics(nir_shader *shader, nir_lower_gs_intrinsics_flags option nir_function_impl *impl = nir_shader_get_entrypoint(shader); assert(impl); + if (!a_block_needs_set_vertex_and_primitive_count(impl->end_block, per_stream)) + return false; + nir_builder b; nir_builder_init(&b, impl); state.builder = &b; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_image.c b/mesa 3D driver/src/compiler/nir/nir_lower_image.c index 2a53c1972b..946ddc6cd9 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_image.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_image.c @@ -58,7 +58,7 @@ lower_cube_size(nir_builder *b, nir_intrinsic_instr *intrin) nir_ssa_def *vec = nir_vec(b, comps, intrin->dest.ssa.num_components); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, vec); nir_instr_remove(&intrin->instr); - ralloc_free(&intrin->instr); + nir_instr_free(&intrin->instr); } static bool diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_indirect_derefs.c b/mesa 3D driver/src/compiler/nir/nir_lower_indirect_derefs.c index a432537753..cf2e61bc7d 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_indirect_derefs.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_indirect_derefs.c @@ -98,7 +98,7 @@ emit_load_store_deref(nir_builder *b, nir_intrinsic_instr *orig_instr, /* Copy over any other sources. 
This is needed for interp_deref_at */ for (unsigned i = 1; i < nir_intrinsic_infos[orig_instr->intrinsic].num_srcs; i++) - nir_src_copy(&load->src[i], &orig_instr->src[i], load); + nir_src_copy(&load->src[i], &orig_instr->src[i]); nir_ssa_dest_init(&load->instr, &load->dest, orig_instr->dest.ssa.num_components, diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_input_attachments.c b/mesa 3D driver/src/compiler/nir/nir_lower_input_attachments.c index 057a716b87..f449d4a151 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_input_attachments.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_input_attachments.c @@ -197,8 +197,8 @@ nir_lower_input_attachments(nir_shader *shader, case nir_instr_type_tex: { nir_tex_instr *tex = nir_instr_as_tex(instr); - if (tex->op == nir_texop_fragment_mask_fetch || - tex->op == nir_texop_fragment_fetch) { + if (tex->op == nir_texop_fragment_mask_fetch_amd || + tex->op == nir_texop_fragment_fetch_amd) { progress |= try_lower_input_texop(function->impl, tex, options); } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_int_to_float.c b/mesa 3D driver/src/compiler/nir/nir_lower_int_to_float.c index 5e20c71a31..91199708db 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_int_to_float.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_int_to_float.c @@ -165,6 +165,8 @@ nir_lower_int_to_float_impl(nir_function_impl *impl) if (progress) { nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); } free(float_types); diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_io.c b/mesa 3D driver/src/compiler/nir/nir_lower_io.c index 4391949639..faaa1d717c 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_io.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_io.c @@ -158,7 +158,8 @@ nir_is_arrayed_io(const nir_variable *var, gl_shader_stage stage) stage == MESA_SHADER_TESS_EVAL; if (var->data.mode == nir_var_shader_out) - return stage == MESA_SHADER_TESS_CTRL; + return stage == MESA_SHADER_TESS_CTRL || + stage == MESA_SHADER_MESH; return false; } @@ -255,7 +256,8 @@ emit_load(struct lower_io_state *state, case nir_var_shader_in: if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->options->use_interpolated_input_intrinsics && - var->data.interpolation != INTERP_MODE_FLAT) { + var->data.interpolation != INTERP_MODE_FLAT && + !var->data.per_primitive) { if (var->data.interpolation == INTERP_MODE_EXPLICIT) { assert(array_index != NULL); op = nir_intrinsic_load_input_vertex; @@ -281,8 +283,9 @@ emit_load(struct lower_io_state *state, } break; case nir_var_shader_out: - op = array_index ? nir_intrinsic_load_per_vertex_output : - nir_intrinsic_load_output; + op = !array_index ? nir_intrinsic_load_output : + var->data.per_primitive ? nir_intrinsic_load_per_primitive_output : + nir_intrinsic_load_per_vertex_output; break; case nir_var_uniform: op = nir_intrinsic_load_uniform; @@ -393,8 +396,9 @@ emit_store(struct lower_io_state *state, nir_ssa_def *data, assert(var->data.mode == nir_var_shader_out); nir_intrinsic_op op = - array_index ? nir_intrinsic_store_per_vertex_output : - nir_intrinsic_store_output; + !array_index ? nir_intrinsic_store_output : + var->data.per_primitive ? 
nir_intrinsic_store_per_primitive_output : + nir_intrinsic_store_per_vertex_output; nir_intrinsic_instr *store = nir_intrinsic_instr_create(state->builder.shader, op); @@ -550,7 +554,7 @@ lower_interpolate_at(nir_intrinsic_instr *intrin, struct lower_io_state *state, if (intrin->intrinsic == nir_intrinsic_interp_deref_at_sample || intrin->intrinsic == nir_intrinsic_interp_deref_at_offset || intrin->intrinsic == nir_intrinsic_interp_deref_at_vertex) - nir_src_copy(&bary_setup->src[0], &intrin->src[1], bary_setup); + nir_src_copy(&bary_setup->src[0], &intrin->src[1]); nir_builder_instr_insert(b, &bary_setup->instr); @@ -786,16 +790,12 @@ build_addr_iadd(nir_builder *b, nir_ssa_def *addr, case nir_address_format_64bit_bounded_global: assert(addr->num_components == 4); assert(addr->bit_size == offset->bit_size); - return nir_vec4(b, nir_channel(b, addr, 0), - nir_channel(b, addr, 1), - nir_channel(b, addr, 2), - nir_iadd(b, nir_channel(b, addr, 3), offset)); + return nir_vector_insert_imm(b, addr, nir_iadd(b, nir_channel(b, addr, 3), offset), 3); case nir_address_format_32bit_index_offset: assert(addr->num_components == 2); assert(addr->bit_size == offset->bit_size); - return nir_vec2(b, nir_channel(b, addr, 0), - nir_iadd(b, nir_channel(b, addr, 1), offset)); + return nir_vector_insert_imm(b, addr, nir_iadd(b, nir_channel(b, addr, 1), offset), 1); case nir_address_format_32bit_index_offset_pack64: assert(addr->num_components == 1); @@ -807,8 +807,7 @@ build_addr_iadd(nir_builder *b, nir_ssa_def *addr, case nir_address_format_vec2_index_32bit_offset: assert(addr->num_components == 3); assert(offset->bit_size == 32); - return nir_vec3(b, nir_channel(b, addr, 0), nir_channel(b, addr, 1), - nir_iadd(b, nir_channel(b, addr, 2), offset)); + return nir_vector_insert_imm(b, addr, nir_iadd(b, nir_channel(b, addr, 2), offset), 2); case nir_address_format_62bit_generic: assert(addr->num_components == 1); @@ -2130,6 +2129,8 @@ nir_lower_explicit_io_impl(nir_function_impl *impl, nir_variable_mode modes, if (progress) { nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); } return progress; @@ -2223,6 +2224,8 @@ nir_lower_vars_to_explicit_types_impl(nir_function_impl *impl, nir_metadata_dominance | nir_metadata_live_ssa_defs | nir_metadata_loop_analysis); + } else { + nir_metadata_preserve(impl, nir_metadata_all); } return progress; @@ -2476,6 +2479,7 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr) case nir_intrinsic_load_input_vertex: case nir_intrinsic_load_per_vertex_input: case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: case nir_intrinsic_load_interpolated_input: case nir_intrinsic_store_output: case nir_intrinsic_store_shared: @@ -2498,6 +2502,7 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr) return &instr->src[1]; case nir_intrinsic_store_ssbo: case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_primitive_output: return &instr->src[2]; default: return NULL; @@ -2636,8 +2641,10 @@ is_output(nir_intrinsic_instr *intrin) { return intrin->intrinsic == nir_intrinsic_load_output || intrin->intrinsic == nir_intrinsic_load_per_vertex_output || + intrin->intrinsic == nir_intrinsic_load_per_primitive_output || intrin->intrinsic == nir_intrinsic_store_output || - intrin->intrinsic == nir_intrinsic_store_per_vertex_output; + intrin->intrinsic == nir_intrinsic_store_per_vertex_output || + intrin->intrinsic == nir_intrinsic_store_per_primitive_output; } 
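The nir_lower_io hunks above thread the new mesh-shader per-primitive intrinsics through load/store emission and the offset-source table. A minimal sketch of the resulting op selection, using only names that appear in the diff; the helper itself (select_output_op) is illustrative and not part of the patch:

```c
/* Illustrative sketch only: mirrors the ternary chains added to emit_load
 * and emit_store above. Outputs addressed with an array index are
 * per-vertex unless the variable is marked per-primitive (mesh shaders). */
static nir_intrinsic_op
select_output_op(const nir_variable *var, bool has_array_index, bool is_store)
{
   if (!has_array_index)
      return is_store ? nir_intrinsic_store_output : nir_intrinsic_load_output;
   if (var->data.per_primitive)
      return is_store ? nir_intrinsic_store_per_primitive_output
                      : nir_intrinsic_load_per_primitive_output;
   return is_store ? nir_intrinsic_store_per_vertex_output
                   : nir_intrinsic_load_per_vertex_output;
}
```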
static bool is_dual_slot(nir_intrinsic_instr *intrin) diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_io_arrays_to_elements.c b/mesa 3D driver/src/compiler/nir/nir_lower_io_arrays_to_elements.c index 901da66962..6383b1d002 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_io_arrays_to_elements.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_io_arrays_to_elements.c @@ -181,8 +181,7 @@ lower_array(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var, if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset || intr->intrinsic == nir_intrinsic_interp_deref_at_sample || intr->intrinsic == nir_intrinsic_interp_deref_at_vertex) { - nir_src_copy(&element_intr->src[1], &intr->src[1], - &element_intr->instr); + nir_src_copy(&element_intr->src[1], &intr->src[1]); } nir_ssa_def_rewrite_uses(&intr->dest.ssa, @@ -190,8 +189,7 @@ lower_array(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var, } else { nir_intrinsic_set_write_mask(element_intr, nir_intrinsic_write_mask(intr)); - nir_src_copy(&element_intr->src[1], &intr->src[1], - &element_intr->instr); + nir_src_copy(&element_intr->src[1], &intr->src[1]); } nir_builder_instr_insert(b, &element_intr->instr); diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_io_to_scalar.c b/mesa 3D driver/src/compiler/nir/nir_lower_io_to_scalar.c index f07733aa73..bad3a91215 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_io_to_scalar.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_io_to_scalar.c @@ -52,7 +52,7 @@ lower_load_input_to_scalar(nir_builder *b, nir_intrinsic_instr *intr) nir_intrinsic_set_dest_type(chan_intr, nir_intrinsic_dest_type(intr)); nir_intrinsic_set_io_semantics(chan_intr, nir_intrinsic_io_semantics(intr)); /* offset */ - nir_src_copy(&chan_intr->src[0], &intr->src[0], chan_intr); + nir_src_copy(&chan_intr->src[0], &intr->src[0]); nir_builder_instr_insert(b, &chan_intr->instr); @@ -88,7 +88,7 @@ lower_store_output_to_scalar(nir_builder *b, nir_intrinsic_instr *intr) /* value */ chan_intr->src[0] = nir_src_for_ssa(nir_channel(b, value, i)); /* offset */ - nir_src_copy(&chan_intr->src[1], &intr->src[1], chan_intr); + nir_src_copy(&chan_intr->src[1], &intr->src[1]); nir_builder_instr_insert(b, &chan_intr->instr); } @@ -222,7 +222,7 @@ lower_load_to_scalar_early(nir_builder *b, nir_intrinsic_instr *intr, if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset || intr->intrinsic == nir_intrinsic_interp_deref_at_sample || intr->intrinsic == nir_intrinsic_interp_deref_at_vertex) - nir_src_copy(&chan_intr->src[1], &intr->src[1], &chan_intr->instr); + nir_src_copy(&chan_intr->src[1], &intr->src[1]); nir_builder_instr_insert(b, &chan_intr->instr); diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_io_to_temporaries.c b/mesa 3D driver/src/compiler/nir/nir_lower_io_to_temporaries.c index 2cb8235359..f767fab0ef 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_io_to_temporaries.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_io_to_temporaries.c @@ -328,7 +328,9 @@ nir_lower_io_to_temporaries(nir_shader *shader, nir_function_impl *entrypoint, { struct lower_io_state state; - if (shader->info.stage == MESA_SHADER_TESS_CTRL) + if (shader->info.stage == MESA_SHADER_TESS_CTRL || + shader->info.stage == MESA_SHADER_TASK || + shader->info.stage == MESA_SHADER_MESH) return; state.shader = shader; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_io_to_vector.c b/mesa 3D driver/src/compiler/nir/nir_lower_io_to_vector.c index 13d692e72b..c2224f8f40 100644 --- a/mesa 3D 
driver/src/compiler/nir/nir_lower_io_to_vector.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_io_to_vector.c @@ -127,7 +127,9 @@ variables_can_merge(const nir_shader *shader, assert(a->data.mode == b->data.mode); if (shader->info.stage == MESA_SHADER_FRAGMENT && a->data.mode == nir_var_shader_in && - a->data.interpolation != b->data.interpolation) + (a->data.interpolation != b->data.interpolation || + a->data.centroid != b->data.centroid || + a->data.sample != b->data.sample)) return false; if (shader->info.stage == MESA_SHADER_FRAGMENT && @@ -380,6 +382,24 @@ build_array_deref_of_new_var_flat(nir_shader *shader, build_array_index(b, leader, nir_imm_int(b, base), vs_in, per_vertex)); } +ASSERTED static bool +nir_shader_can_read_output(const shader_info *info) +{ + switch (info->stage) { + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_FRAGMENT: + return true; + + case MESA_SHADER_TASK: + case MESA_SHADER_MESH: + /* TODO(mesh): This will not be allowed on EXT. */ + return true; + + default: + return false; + } +} + static bool nir_lower_io_to_vector_impl(nir_function_impl *impl, nir_variable_mode modes) { @@ -446,8 +466,7 @@ nir_lower_io_to_vector_impl(nir_function_impl *impl, nir_variable_mode modes) break; if (nir_deref_mode_is(old_deref, nir_var_shader_out)) - assert(b.shader->info.stage == MESA_SHADER_TESS_CTRL || - b.shader->info.stage == MESA_SHADER_FRAGMENT); + assert(nir_shader_can_read_output(&b.shader->info)); nir_variable *old_var = nir_deref_instr_get_variable(old_deref); diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_locals_to_regs.c b/mesa 3D driver/src/compiler/nir/nir_lower_locals_to_regs.c index cddc49bc37..aabd998000 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_locals_to_regs.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_locals_to_regs.c @@ -159,7 +159,7 @@ get_deref_reg_src(nir_deref_instr *deref, struct locals_to_regs_state *state) if (src.reg.indirect) { assert(src.reg.base_offset == 0); } else { - src.reg.indirect = ralloc(b->shader, nir_src); + src.reg.indirect = malloc(sizeof(nir_src)); *src.reg.indirect = nir_src_for_ssa(nir_imm_int(b, src.reg.base_offset)); src.reg.base_offset = 0; @@ -208,7 +208,7 @@ lower_locals_to_regs_block(nir_block *block, nir_ssa_def_rewrite_uses(&intrin->dest.ssa, &mov->dest.dest.ssa); } else { - nir_dest_copy(&mov->dest.dest, &intrin->dest, &mov->instr); + nir_dest_copy(&mov->dest.dest, &intrin->dest); } nir_builder_instr_insert(b, &mov->instr); @@ -227,7 +227,7 @@ lower_locals_to_regs_block(nir_block *block, nir_src reg_src = get_deref_reg_src(deref, state); nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_mov); - nir_src_copy(&mov->src[0].src, &intrin->src[1], mov); + nir_src_copy(&mov->src[0].src, &intrin->src[1]); mov->dest.write_mask = nir_intrinsic_write_mask(intrin); mov->dest.dest.is_ssa = false; mov->dest.dest.reg.reg = reg_src.reg.reg; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_passthrough_edgeflags.c b/mesa 3D driver/src/compiler/nir/nir_lower_passthrough_edgeflags.c index 1fd2cd2001..e6d0b14f26 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_passthrough_edgeflags.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_passthrough_edgeflags.c @@ -96,5 +96,9 @@ lower_impl(nir_function_impl *impl) void nir_lower_passthrough_edgeflags(nir_shader *shader) { + assert(shader->info.stage == MESA_SHADER_VERTEX); + + shader->info.vs.needs_edge_flag = true; + lower_impl(nir_shader_get_entrypoint(shader)); } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_phis_to_scalar.c b/mesa 3D 
driver/src/compiler/nir/nir_lower_phis_to_scalar.c index 2fcd71d70c..9abaf24bae 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_phis_to_scalar.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_phis_to_scalar.c @@ -33,8 +33,9 @@ */ struct lower_phis_to_scalar_state { + nir_shader *shader; void *mem_ctx; - void *dead_ctx; + struct exec_list dead_instrs; bool lower_all; @@ -219,14 +220,14 @@ lower_phis_to_scalar_block(nir_block *block, */ nir_op vec_op = nir_op_vec(phi->dest.ssa.num_components); - nir_alu_instr *vec = nir_alu_instr_create(state->mem_ctx, vec_op); + nir_alu_instr *vec = nir_alu_instr_create(state->shader, vec_op); nir_ssa_dest_init(&vec->instr, &vec->dest.dest, phi->dest.ssa.num_components, bit_size, NULL); vec->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1; for (unsigned i = 0; i < phi->dest.ssa.num_components; i++) { - nir_phi_instr *new_phi = nir_phi_instr_create(state->mem_ctx); + nir_phi_instr *new_phi = nir_phi_instr_create(state->shader); nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, phi->dest.ssa.bit_size, NULL); @@ -234,11 +235,11 @@ lower_phis_to_scalar_block(nir_block *block, nir_foreach_phi_src(src, phi) { /* We need to insert a mov to grab the i'th component of src */ - nir_alu_instr *mov = nir_alu_instr_create(state->mem_ctx, + nir_alu_instr *mov = nir_alu_instr_create(state->shader, nir_op_mov); nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, bit_size, NULL); mov->dest.write_mask = 1; - nir_src_copy(&mov->src[0].src, &src->src, state->mem_ctx); + nir_src_copy(&mov->src[0].src, &src->src); mov->src[0].swizzle[0] = i; /* Insert at the end of the predecessor but before the jump */ @@ -259,8 +260,8 @@ lower_phis_to_scalar_block(nir_block *block, nir_ssa_def_rewrite_uses(&phi->dest.ssa, &vec->dest.dest.ssa); - ralloc_steal(state->dead_ctx, phi); nir_instr_remove(&phi->instr); + exec_list_push_tail(&state->dead_instrs, &phi->instr.node); progress = true; @@ -283,9 +284,10 @@ lower_phis_to_scalar_impl(nir_function_impl *impl, bool lower_all) struct lower_phis_to_scalar_state state; bool progress = false; + state.shader = impl->function->shader; state.mem_ctx = ralloc_parent(impl); - state.dead_ctx = ralloc_context(NULL); - state.phi_table = _mesa_pointer_hash_table_create(state.dead_ctx); + exec_list_make_empty(&state.dead_instrs); + state.phi_table = _mesa_pointer_hash_table_create(NULL); state.lower_all = lower_all; nir_foreach_block(block, impl) { @@ -295,7 +297,10 @@ lower_phis_to_scalar_impl(nir_function_impl *impl, bool lower_all) nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); - ralloc_free(state.dead_ctx); + nir_instr_free_list(&state.dead_instrs); + + ralloc_free(state.phi_table); + return progress; } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_readonly_images_to_tex.c b/mesa 3D driver/src/compiler/nir/nir_lower_readonly_images_to_tex.c index 3dfa42e3af..25ecc70793 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_readonly_images_to_tex.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_readonly_images_to_tex.c @@ -25,16 +25,16 @@ #include "nir_builder.h" static const struct glsl_type * -get_sampler_type_for_image(const struct glsl_type *type) +get_texture_type_for_image(const struct glsl_type *type) { if (glsl_type_is_array(type)) { const struct glsl_type *elem_type = - get_sampler_type_for_image(glsl_get_array_element(type)); + get_texture_type_for_image(glsl_get_array_element(type)); return glsl_array_type(elem_type, glsl_get_length(type), 0 /*explicit size*/); } 
assert((glsl_type_is_image(type))); - return glsl_sampler_type(glsl_get_sampler_dim(type), false, + return glsl_texture_type(glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type), glsl_get_sampler_result_type(type)); } @@ -45,14 +45,16 @@ replace_image_type_with_sampler(nir_deref_instr *deref) const struct glsl_type *type = deref->type; /* If we've already chased up the deref chain this far from a different intrinsic, we're done */ - if (glsl_type_is_sampler(glsl_without_array(type))) + if (glsl_type_is_texture(glsl_without_array(type))) return; - deref->type = get_sampler_type_for_image(type); + deref->type = get_texture_type_for_image(type); + deref->modes = nir_var_uniform; if (deref->deref_type == nir_deref_type_var) { type = deref->var->type; - if (!glsl_type_is_sampler(glsl_without_array(type))) { - deref->var->type = get_sampler_type_for_image(type); + if (!glsl_type_is_texture(glsl_without_array(type))) { + deref->var->type = get_texture_type_for_image(type); + deref->var->data.mode = nir_var_uniform; memset(&deref->var->data.sampler, 0, sizeof(deref->var->data.sampler)); } } else { diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_ssbo.c b/mesa 3D driver/src/compiler/nir/nir_lower_ssbo.c index 19a040a68d..408b03b350 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_ssbo.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_ssbo.c @@ -90,7 +90,7 @@ nir_load_ssbo_prop(nir_builder *b, nir_intrinsic_op op, { nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op); load->num_components = 1; - nir_src_copy(&load->src[0], idx, load); + nir_src_copy(&load->src[0], idx); nir_ssa_dest_init(&load->instr, &load->dest, 1, bitsize, NULL); nir_builder_instr_insert(b, &load->instr); return &load->dest.ssa; @@ -134,7 +134,7 @@ lower_ssbo_instr(nir_builder *b, nir_intrinsic_instr *intr) } if (is_store) { - nir_src_copy(&global->src[0], &intr->src[0], global); + nir_src_copy(&global->src[0], &intr->src[0]); nir_intrinsic_set_write_mask(global, nir_intrinsic_write_mask(intr)); } else { nir_ssa_dest_init(&global->instr, &global->dest, @@ -142,9 +142,9 @@ lower_ssbo_instr(nir_builder *b, nir_intrinsic_instr *intr) intr->dest.ssa.bit_size, NULL); if (is_atomic) { - nir_src_copy(&global->src[1], &intr->src[2], global); + nir_src_copy(&global->src[1], &intr->src[2]); if (nir_intrinsic_infos[op].num_srcs > 2) - nir_src_copy(&global->src[2], &intr->src[3], global); + nir_src_copy(&global->src[2], &intr->src[3]); } } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_subgroups.c b/mesa 3D driver/src/compiler/nir/nir_lower_subgroups.c index ecac878325..3619fd1fd0 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_subgroups.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_subgroups.c @@ -45,7 +45,7 @@ lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intri intr->const_index[1] = intrin->const_index[1]; intr->src[0] = nir_src_for_ssa(comp); if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2) - nir_src_copy(&intr->src[1], &intrin->src[1], intr); + nir_src_copy(&intr->src[1], &intrin->src[1]); intr->num_components = 1; nir_builder_instr_insert(b, &intr->instr); @@ -126,7 +126,7 @@ lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin, /* invocation */ if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) { assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2); - nir_src_copy(&chan_intrin->src[1], &intrin->src[1], chan_intrin); + nir_src_copy(&chan_intrin->src[1], &intrin->src[1]); } chan_intrin->const_index[0] = 
intrin->const_index[0]; @@ -209,7 +209,7 @@ lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin, nir_intrinsic_instr *swizzle = nir_intrinsic_instr_create( b->shader, nir_intrinsic_masked_swizzle_amd); swizzle->num_components = intrin->num_components; - nir_src_copy(&swizzle->src[0], &intrin->src[0], swizzle); + nir_src_copy(&swizzle->src[0], &intrin->src[0]); nir_intrinsic_set_swizzle_mask(swizzle, (mask << 10) | 0x1f); nir_ssa_dest_init(&swizzle->instr, &swizzle->dest, intrin->dest.ssa.num_components, @@ -286,7 +286,7 @@ lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin, nir_intrinsic_instr *shuffle = nir_intrinsic_instr_create(b->shader, nir_intrinsic_shuffle); shuffle->num_components = intrin->num_components; - nir_src_copy(&shuffle->src[0], &intrin->src[0], shuffle); + nir_src_copy(&shuffle->src[0], &intrin->src[0]); shuffle->src[1] = nir_src_for_ssa(index); nir_ssa_dest_init(&shuffle->instr, &shuffle->dest, intrin->dest.ssa.num_components, @@ -489,7 +489,7 @@ lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin, qbcst->num_components = intrin->num_components; qbcst->src[1] = nir_src_for_ssa(nir_imm_int(b, i)); - nir_src_copy(&qbcst->src[0], &intrin->src[0], qbcst); + nir_src_copy(&qbcst->src[0], &intrin->src[0]); nir_ssa_dest_init(&qbcst->instr, &qbcst->dest, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, NULL); diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_system_values.c b/mesa 3D driver/src/compiler/nir/nir_lower_system_values.c index 6044305040..4d270578da 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_system_values.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_system_values.c @@ -502,8 +502,7 @@ bool nir_lower_compute_system_values(nir_shader *shader, const nir_lower_compute_system_values_options *options) { - if (shader->info.stage != MESA_SHADER_COMPUTE && - shader->info.stage != MESA_SHADER_KERNEL) + if (!gl_shader_stage_uses_workgroup(shader->info.stage)) return false; struct lower_sysval_state state; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_sysvals_to_varyings.c b/mesa 3D driver/src/compiler/nir/nir_lower_sysvals_to_varyings.c new file mode 100644 index 0000000000..6422e9cb92 --- /dev/null +++ b/mesa 3D driver/src/compiler/nir/nir_lower_sysvals_to_varyings.c @@ -0,0 +1,72 @@ +/* + * Copyright © 2021 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "nir.h" +#include "nir_builder.h" + +/* + * spirv_to_nir() creates system values for some builtin inputs, but + * backends might want to have those inputs exposed as varyings. This + * lowering pass allows backends to convert system values to input + * varyings and should be called just after spirv_to_nir() when needed. + */ + +bool +nir_lower_sysvals_to_varyings(nir_shader *shader, + const struct nir_lower_sysvals_to_varyings_options *options) +{ + bool progress = false; + + nir_foreach_variable_with_modes(var, shader, nir_var_system_value) { + switch (var->data.location) { +#define SYSVAL_TO_VARYING(opt, sysval, varying) \ + case SYSTEM_VALUE_ ## sysval: \ + if (options->opt) { \ + var->data.mode = nir_var_shader_in; \ + var->data.location = VARYING_SLOT_ ## varying; \ + progress = true; \ + } \ + break + + SYSVAL_TO_VARYING(frag_coord, FRAG_COORD, POS); + SYSVAL_TO_VARYING(point_coord, POINT_COORD, PNTC); + SYSVAL_TO_VARYING(front_face, FRONT_FACE, FACE); + +#undef SYSVAL_TO_VARYING + + default: + break; + } + } + + if (progress) + nir_fixup_deref_modes(shader); + + /* Nothing this does actually changes anything tracked by metadata. + * If we ever made this pass more complicated, we might need to care + * more about metadata. + */ + nir_shader_preserve_all_metadata(shader); + + return progress; +} diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_tex.c b/mesa 3D driver/src/compiler/nir/nir_lower_tex.c index d9e6f7e0ad..30aa1e7524 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_tex.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_tex.c @@ -289,7 +289,7 @@ sample_plane(nir_builder *b, nir_tex_instr *tex, int plane, nir_tex_instr *plane_tex = nir_tex_instr_create(b->shader, tex->num_srcs + 1); for (unsigned i = 0; i < tex->num_srcs; i++) { - nir_src_copy(&plane_tex->src[i].src, &tex->src[i].src, plane_tex); + nir_src_copy(&plane_tex->src[i].src, &tex->src[i].src); plane_tex->src[i].src_type = tex->src[i].src_type; } plane_tex->src[tex->num_srcs].src = nir_src_for_ssa(nir_imm_int(b, plane)); @@ -770,7 +770,7 @@ lower_tex_to_txd(nir_builder *b, nir_tex_instr *tex) /* reuse existing srcs */ for (unsigned i = 0; i < tex->num_srcs; i++) { - nir_src_copy(&txd->src[i].src, &tex->src[i].src, txd); + nir_src_copy(&txd->src[i].src, &tex->src[i].src); txd->src[i].src_type = tex->src[i].src_type; } int coord = nir_tex_instr_src_index(tex, nir_tex_src_coord); @@ -807,7 +807,7 @@ lower_txb_to_txl(nir_builder *b, nir_tex_instr *tex) /* reuse all but bias src */ for (int i = 0; i < 2; i++) { if (tex->src[i].src_type != nir_tex_src_bias) { - nir_src_copy(&txl->src[i].src, &tex->src[i].src, txl); + nir_src_copy(&txl->src[i].src, &tex->src[i].src); txl->src[i].src_type = tex->src[i].src_type; } } @@ -1097,7 +1097,7 @@ lower_tg4_offsets(nir_builder *b, nir_tex_instr *tex) tex_copy->dest_type = tex->dest_type; for (unsigned j = 0; j < tex->num_srcs; ++j) { - nir_src_copy(&tex_copy->src[j].src, &tex->src[j].src, tex_copy); + nir_src_copy(&tex_copy->src[j].src, &tex->src[j].src); tex_copy->src[j].src_type = tex->src[j].src_type; } @@ -1191,6 +1191,70 @@ nir_lower_txs_cube_array(nir_builder *b, nir_tex_instr *tex) nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, size, size->parent_instr); } +static void +nir_lower_ms_txf_to_fragment_fetch(nir_builder *b, nir_tex_instr *tex) +{ + lower_offset(b, tex); + + b->cursor = nir_before_instr(&tex->instr); + + /* Create FMASK fetch. 
*/ + assert(tex->texture_index == 0); + nir_tex_instr *fmask_fetch = nir_tex_instr_create(b->shader, tex->num_srcs - 1); + fmask_fetch->op = nir_texop_fragment_mask_fetch_amd; + fmask_fetch->coord_components = tex->coord_components; + fmask_fetch->sampler_dim = tex->sampler_dim; + fmask_fetch->is_array = tex->is_array; + fmask_fetch->texture_non_uniform = tex->texture_non_uniform; + fmask_fetch->dest_type = nir_type_uint32; + nir_ssa_dest_init(&fmask_fetch->instr, &fmask_fetch->dest, 1, 32, NULL); + + fmask_fetch->num_srcs = 0; + for (unsigned i = 0; i < tex->num_srcs; i++) { + if (tex->src[i].src_type == nir_tex_src_ms_index) + continue; + nir_tex_src *src = &fmask_fetch->src[fmask_fetch->num_srcs++]; + src->src = nir_src_for_ssa(tex->src[i].src.ssa); + src->src_type = tex->src[i].src_type; + } + + nir_builder_instr_insert(b, &fmask_fetch->instr); + + /* Obtain new sample index. */ + int ms_index = nir_tex_instr_src_index(tex, nir_tex_src_ms_index); + assert(ms_index >= 0); + nir_src sample = tex->src[ms_index].src; + nir_ssa_def *new_sample = NULL; + if (nir_src_is_const(sample) && (nir_src_as_uint(sample) == 0 || nir_src_as_uint(sample) == 7)) { + if (nir_src_as_uint(sample) == 7) + new_sample = nir_ushr(b, &fmask_fetch->dest.ssa, nir_imm_int(b, 28)); + else + new_sample = nir_iand_imm(b, &fmask_fetch->dest.ssa, 0xf); + } else { + new_sample = nir_ubitfield_extract(b, &fmask_fetch->dest.ssa, + nir_imul_imm(b, sample.ssa, 4), nir_imm_int(b, 4)); + } + + /* Update instruction. */ + tex->op = nir_texop_fragment_fetch_amd; + nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[ms_index].src, new_sample); +} + +static void +nir_lower_samples_identical_to_fragment_fetch(nir_builder *b, nir_tex_instr *tex) +{ + b->cursor = nir_after_instr(&tex->instr); + + nir_tex_instr *fmask_fetch = nir_instr_as_tex(nir_instr_clone(b->shader, &tex->instr)); + fmask_fetch->op = nir_texop_fragment_mask_fetch_amd; + fmask_fetch->dest_type = nir_type_uint32; + nir_ssa_dest_init(&fmask_fetch->instr, &fmask_fetch->dest, 1, 32, NULL); + nir_builder_instr_insert(b, &fmask_fetch->instr); + + nir_ssa_def_rewrite_uses(&tex->dest.ssa, nir_ieq_imm(b, &fmask_fetch->dest.ssa, 0)); + nir_instr_remove_v(&tex->instr); +} + static bool nir_lower_tex_block(nir_block *block, nir_builder *b, const nir_lower_tex_options *options, @@ -1373,13 +1437,8 @@ nir_lower_tex_block(nir_block *block, nir_builder *b, * derivatives. Lower those opcodes which use implicit derivatives to * use an explicit LOD of 0. 
*/ - bool shader_supports_implicit_lod = - b->shader->info.stage == MESA_SHADER_FRAGMENT || - (b->shader->info.stage == MESA_SHADER_COMPUTE && - b->shader->info.cs.derivative_group != DERIVATIVE_GROUP_NONE); - if (nir_tex_instr_has_implicit_derivative(tex) && - !shader_supports_implicit_lod) { + !nir_shader_supports_implicit_lod(b->shader)) { lower_zero_lod(b, tex); progress = true; } @@ -1405,6 +1464,18 @@ nir_lower_tex_block(nir_block *block, nir_builder *b, progress |= lower_tg4_offsets(b, tex); continue; } + + if (options->lower_to_fragment_fetch_amd && tex->op == nir_texop_txf_ms) { + nir_lower_ms_txf_to_fragment_fetch(b, tex); + progress = true; + continue; + } + + if (options->lower_to_fragment_fetch_amd && tex->op == nir_texop_samples_identical) { + nir_lower_samples_identical_to_fragment_fetch(b, tex); + progress = true; + continue; + } } return progress; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_ubo_vec4.c b/mesa 3D driver/src/compiler/nir/nir_lower_ubo_vec4.c index c0f5b1ca4f..90885b133b 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_ubo_vec4.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_ubo_vec4.c @@ -134,7 +134,7 @@ nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data) result = nir_vector_extract(b, result, component); } else if (align_mul == 8 && - align_offset + chan_size_bytes * intr->num_components <= 16) { + align_offset + chan_size_bytes * intr->num_components <= 8) { /* Special case: Loading small vectors from offset % 8 == 0 can be done * with just one load and one bcsel. */ diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_uniforms_to_ubo.c b/mesa 3D driver/src/compiler/nir/nir_lower_uniforms_to_ubo.c index 0b37b705ef..12f835c3f0 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_uniforms_to_ubo.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_uniforms_to_ubo.c @@ -135,6 +135,8 @@ nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4) if (!shader->info.first_ubo_is_default_ubo) { nir_foreach_variable_with_modes(var, shader, nir_var_mem_ubo) { var->data.binding++; + if (var->data.driver_location != -1) + var->data.driver_location++; /* only increment location for ubo arrays */ if (glsl_without_array(var->type) == var->interface_type && glsl_type_is_array(var->type)) @@ -145,7 +147,7 @@ nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4) if (shader->num_uniforms > 0) { const struct glsl_type *type = glsl_array_type(glsl_vec4_type(), - shader->num_uniforms, 0); + shader->num_uniforms, 16); nir_variable *ubo = nir_variable_create(shader, nir_var_mem_ubo, type, "uniform_0"); ubo->data.binding = 0; diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_var_copies.c b/mesa 3D driver/src/compiler/nir/nir_lower_var_copies.c index f9df4446a1..8a74898243 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_var_copies.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_var_copies.c @@ -142,7 +142,7 @@ lower_var_copies_impl(nir_function_impl *impl) nir_deref_instr_remove_if_unused(nir_src_as_deref(copy->src[1])); progress = true; - ralloc_free(copy); + nir_instr_free(©->instr); } } diff --git a/mesa 3D driver/src/compiler/nir/nir_lower_vec_to_movs.c b/mesa 3D driver/src/compiler/nir/nir_lower_vec_to_movs.c index bcad081b11..dd38276887 100644 --- a/mesa 3D driver/src/compiler/nir/nir_lower_vec_to_movs.c +++ b/mesa 3D driver/src/compiler/nir/nir_lower_vec_to_movs.c @@ -68,8 +68,8 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader) return 1 << start_idx; 
nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov); - nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov); - nir_alu_dest_copy(&mov->dest, &vec->dest, mov); + nir_alu_src_copy(&mov->src[0], &vec->src[start_idx]); + nir_alu_dest_copy(&mov->dest, &vec->dest); mov->dest.write_mask = (1u << start_idx); mov->src[0].swizzle[start_idx] = vec->src[start_idx].swizzle[0]; @@ -107,7 +107,7 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader) if (mov->dest.write_mask) { nir_instr_insert_before(&vec->instr, &mov->instr); } else { - ralloc_free(mov); + nir_instr_free(&mov->instr); } return channels_handled; @@ -296,7 +296,7 @@ nir_lower_vec_to_movs_instr(nir_builder *b, nir_instr *instr, void *data) } nir_instr_remove(&vec->instr); - ralloc_free(vec); + nir_instr_free(&vec->instr); return true; } diff --git a/mesa 3D driver/src/compiler/nir/nir_opcodes.py b/mesa 3D driver/src/compiler/nir/nir_opcodes.py index 6b2fc24300..c16923b0ee 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opcodes.py +++ b/mesa 3D driver/src/compiler/nir/nir_opcodes.py @@ -487,7 +487,7 @@ for (int bit = 31; bit >= 0; bit--) { } """) -unop_convert("ifind_msb_rev", tint32, tuint, """ +unop_convert("ifind_msb_rev", tint32, tint, """ dst = -1; if (src0 != 0 && src0 != -1) { for (int bit = 0; bit < 31; bit++) { @@ -1060,6 +1060,11 @@ if (bits == 0) { } """) +# Sum of absolute differences with accumulation. +# (Equivalent to AMD's v_sad_u8 instruction.) +# The first two sources contain packed 8-bit unsigned integers; the instruction +# calculates the absolute differences of the corresponding components and then +# adds them together. A third 32-bit unsigned integer source is added to the result. triop_horiz("sad_u8x4", 1, 1, 1, 1, """ uint8_t s0_b0 = (src0.x & 0x000000ff) >> 0; uint8_t s0_b1 = (src0.x & 0x0000ff00) >> 8; @@ -1303,6 +1308,10 @@ for (int i = 0; i < 32; i += 8) { unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)")) unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)")) +# Magnitude equal to fddx/y, sign undefined. Derivative of a constant is zero. +unop("fddx_must_abs_mali", tfloat, "0.0") +unop("fddy_must_abs_mali", tfloat, "0.0") + # DXIL specific double [un]pack # DXIL doesn't support generic [un]pack instructions, so we want those # lowered to bit ops. HLSL doesn't support 64bit bitcasts to/from @@ -1314,3 +1323,160 @@ unop_horiz("pack_double_2x32_dxil", 1, tuint64, 2, tuint32, "dst.x = src0.x | ((uint64_t)src0.y << 32);") unop_horiz("unpack_double_2x32_dxil", 2, tuint32, 1, tuint64, "dst.x = src0.x; dst.y = src0.x >> 32;") + +# src0 and src1 are i8vec4 packed in an int32, and src2 is an int32. The int8 +# components are sign-extended to 32-bits, and a dot-product is performed on +# the resulting vectors. src2 is added to the result of the dot-product. +opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, _2src_commutative, """ + const int32_t v0x = (int8_t)(src0 ); + const int32_t v0y = (int8_t)(src0 >> 8); + const int32_t v0z = (int8_t)(src0 >> 16); + const int32_t v0w = (int8_t)(src0 >> 24); + const int32_t v1x = (int8_t)(src1 ); + const int32_t v1y = (int8_t)(src1 >> 8); + const int32_t v1z = (int8_t)(src1 >> 16); + const int32_t v1w = (int8_t)(src1 >> 24); + + dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; +""") + +# Like sdot_4x8_iadd, but unsigned.
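+# For example (illustrative note, derived from the opcode body below):
+# udot_4x8_uadd(0x01020304, 0x01010101, 10) sums the byte products
+# 4*1 + 3*1 + 2*1 + 1*1 = 10 and adds src2 = 10, giving dst = 20.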
+opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], + False, _2src_commutative, """ + const uint32_t v0x = (uint8_t)(src0 ); + const uint32_t v0y = (uint8_t)(src0 >> 8); + const uint32_t v0z = (uint8_t)(src0 >> 16); + const uint32_t v0w = (uint8_t)(src0 >> 24); + const uint32_t v1x = (uint8_t)(src1 ); + const uint32_t v1y = (uint8_t)(src1 >> 8); + const uint32_t v1z = (uint8_t)(src1 >> 16); + const uint32_t v1w = (uint8_t)(src1 >> 24); + + dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; +""") + +# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and +# src2 is an int32. The 8-bit components are extended to 32-bits, and a +# dot-product is performed on the resulting vectors. src2 is added to the +# result of the dot-product. +# +# NOTE: Unlike many of the other dp4a opcodes, this mixed signs of source 0 +# and source 1 mean that this opcode is not 2-source commutative +opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, "", """ + const int32_t v0x = (int8_t)(src0 ); + const int32_t v0y = (int8_t)(src0 >> 8); + const int32_t v0z = (int8_t)(src0 >> 16); + const int32_t v0w = (int8_t)(src0 >> 24); + const uint32_t v1x = (uint8_t)(src1 ); + const uint32_t v1y = (uint8_t)(src1 >> 8); + const uint32_t v1z = (uint8_t)(src1 >> 16); + const uint32_t v1w = (uint8_t)(src1 >> 24); + + dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; +""") + +# Like sdot_4x8_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff]. +opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, _2src_commutative, """ + const int64_t v0x = (int8_t)(src0 ); + const int64_t v0y = (int8_t)(src0 >> 8); + const int64_t v0z = (int8_t)(src0 >> 16); + const int64_t v0w = (int8_t)(src0 >> 24); + const int64_t v1x = (int8_t)(src1 ); + const int64_t v1y = (int8_t)(src1 >> 8); + const int64_t v1z = (int8_t)(src1 >> 16); + const int64_t v1w = (int8_t)(src1 >> 24); + + const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; + + dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp); +""") + +# Like udot_4x8_uadd, but the result is clampled to the range [0, 0xfffffffff]. +opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, _2src_commutative, """ + const uint64_t v0x = (uint8_t)(src0 ); + const uint64_t v0y = (uint8_t)(src0 >> 8); + const uint64_t v0z = (uint8_t)(src0 >> 16); + const uint64_t v0w = (uint8_t)(src0 >> 24); + const uint64_t v1x = (uint8_t)(src1 ); + const uint64_t v1y = (uint8_t)(src1 >> 8); + const uint64_t v1z = (uint8_t)(src1 >> 16); + const uint64_t v1w = (uint8_t)(src1 >> 24); + + const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; + + dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp; +""") + +# Like sudot_4x8_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff]. 
+# +# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0 +# and source 1 mean that this opcode is not 2-source commutative. +opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, "", """ + const int64_t v0x = (int8_t)(src0 ); + const int64_t v0y = (int8_t)(src0 >> 8); + const int64_t v0z = (int8_t)(src0 >> 16); + const int64_t v0w = (int8_t)(src0 >> 24); + const uint64_t v1x = (uint8_t)(src1 ); + const uint64_t v1y = (uint8_t)(src1 >> 8); + const uint64_t v1z = (uint8_t)(src1 >> 16); + const uint64_t v1w = (uint8_t)(src1 >> 24); + + const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; + + dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp); +""") + +# src0 and src1 are i16vec2 packed in an int32, and src2 is an int32. The int16 +# components are sign-extended to 32-bits, and a dot-product is performed on +# the resulting vectors. src2 is added to the result of the dot-product. +opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, _2src_commutative, """ + const int32_t v0x = (int16_t)(src0 ); + const int32_t v0y = (int16_t)(src0 >> 16); + const int32_t v1x = (int16_t)(src1 ); + const int32_t v1y = (int16_t)(src1 >> 16); + + dst = (v0x * v1x) + (v0y * v1y) + src2; +""") + +# Like sdot_2x16_iadd, but unsigned. +opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], + False, _2src_commutative, """ + const uint32_t v0x = (uint16_t)(src0 ); + const uint32_t v0y = (uint16_t)(src0 >> 16); + const uint32_t v1x = (uint16_t)(src1 ); + const uint32_t v1y = (uint16_t)(src1 >> 16); + + dst = (v0x * v1x) + (v0y * v1y) + src2; +""") + +# Like sdot_2x16_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff]. +opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, _2src_commutative, """ + const int64_t v0x = (int16_t)(src0 ); + const int64_t v0y = (int16_t)(src0 >> 16); + const int64_t v1x = (int16_t)(src1 ); + const int64_t v1y = (int16_t)(src1 >> 16); + + const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2; + + dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp); +""") + +# Like udot_2x16_uadd, but the result is clamped to the range [0, 0xffffffff]. +opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], + False, _2src_commutative, """ + const uint64_t v0x = (uint16_t)(src0 ); + const uint64_t v0y = (uint16_t)(src0 >> 16); + const uint64_t v1x = (uint16_t)(src1 ); + const uint64_t v1y = (uint16_t)(src1 >> 16); + + const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + src2; + + dst = tmp >=
UINT32_MAX ? UINT32_MAX : tmp; +""") diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_access.c b/mesa 3D driver/src/compiler/nir/nir_opt_access.c index cc0e8025b2..4e5f0c4194 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_access.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_access.c @@ -115,9 +115,11 @@ gather_intrinsic(struct access_state *state, nir_intrinsic_instr *instr) state->images_written |= write; } - if (var->data.mode == nir_var_uniform && read) + if ((var->data.mode == nir_var_uniform || + var->data.mode == nir_var_image) && read) _mesa_set_add(state->vars_read, var); - if (var->data.mode == nir_var_uniform && write) + if ((var->data.mode == nir_var_uniform || + var->data.mode == nir_var_image) && write) _mesa_set_add(state->vars_written, var); break; @@ -187,7 +189,8 @@ process_variable(struct access_state *state, nir_variable *var) { const struct glsl_type *type = glsl_without_array(var->type); if (var->data.mode != nir_var_mem_ssbo && - !(var->data.mode == nir_var_uniform && glsl_type_is_image(type))) + !(var->data.mode == nir_var_uniform && glsl_type_is_image(type)) && + var->data.mode != nir_var_image) return false; /* Ignore variables we've already marked */ @@ -343,7 +346,8 @@ nir_opt_access(nir_shader *shader, const nir_opt_access_options *options) nir_foreach_variable_with_modes(var, shader, nir_var_uniform | nir_var_mem_ubo | - nir_var_mem_ssbo) + nir_var_mem_ssbo | + nir_var_image) var_progress |= process_variable(&state, var); nir_foreach_function(func, shader) { diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_algebraic.py b/mesa 3D driver/src/compiler/nir/nir_opt_algebraic.py index 449f114da6..c690b72a4e 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_algebraic.py +++ b/mesa 3D driver/src/compiler/nir/nir_opt_algebraic.py @@ -192,15 +192,101 @@ optimizations = [ # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c) (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)), + + (('sdot_4x8_iadd', a, 0, b), b), + (('udot_4x8_uadd', a, 0, b), b), + (('sdot_4x8_iadd_sat', a, 0, b), b), + (('udot_4x8_uadd_sat', a, 0, b), b), + (('sdot_2x16_iadd', a, 0, b), b), + (('udot_2x16_uadd', a, 0, b), b), + (('sdot_2x16_iadd_sat', a, 0, b), b), + (('udot_2x16_uadd_sat', a, 0, b), b), + + # sudot_4x8_iadd is not commutative at all, so the patterns must be + # duplicated with a zero in each of the first two source positions. + (('sudot_4x8_iadd', a, 0, b), b), + (('sudot_4x8_iadd', 0, a, b), b), + (('sudot_4x8_iadd_sat', a, 0, b), b), + (('sudot_4x8_iadd_sat', 0, a, b), b), + + (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))), + (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))), + (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))), + (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))), + (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))), + + # Try to let constant folding eliminate the dot-product part. These are + # safe because the dot product cannot overflow 32 bits.
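+ # (Worked bound, for illustration: the dot part is at most 4*(255*255) =
+ # 260100 for the unsigned opcodes, at most 4*(128*128) = 65536 and at least
+ # 4*(-128*255) = -130560 for the signed ones, all well inside 32 bits.)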
+ (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)), + (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)), + (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)), + (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)), + (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)), + (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)), + (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)), + (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)), + (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)), + (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)), + (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)), + (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'), + (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'), + (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'), + (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'), + (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'), ] +# Shorthand for the expansion of just the dot product part of the [iu]dp4a +# instructions. +sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)), + ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))), + ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)), + ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3)))) +udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)), + ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))), + ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)), + ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3)))) +sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)), + ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))), + ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)), + ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3)))) +sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)), + ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1))) +udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)), + ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1))) + +optimizations.extend([ + (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_dot_4x8'), + (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_dot_4x8'), + (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'), + (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'), + (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'), + + # For the unsigned dot-product, the largest possible value is 4*(255*255) = + # 0x3f804, so we don't have to worry about that intermediate result + # overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc.
If c is a constant + # that is less than 0xfffc07fc, then the result cannot overflow ever. + (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)), + (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_dot_4x8'), + + # For the signed dot-product, the largest positive value is 4*(-128*-128) = + # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We + # don't have to worry about that intermediate result overflowing or + # underflowing. + (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_dot_4x8'), + + (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'), + + (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'), + (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'), +]) + # Float sizes for s in [16, 32, 64]: optimizations.extend([ (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)), (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)), - (('~flrp@{}'.format(s), ('fadd', a, b), ('fadd', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)), + (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)), (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)), (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)), @@ -359,6 +445,8 @@ optimizations.extend([ # (a + #b) * #c => (a * #c) + (#b * #c) (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))), + (('~fmul', ('fadd(is_used_once)', a, '#b'), '#c'), ('fadd', ('fmul', a, c), ('fmul', b, c)), + '!options->avoid_ternary_with_two_constants'), # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d) (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'), @@ -588,6 +676,20 @@ optimizations.extend([ (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)), ]) +for N in [8, 16, 32, 64]: + b2iN = 'b2i{0}'.format(N) + optimizations.extend([ + (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)), + (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)), + ]) + +for N in [16, 32, 64]: + b2fN = 'b2f{0}'.format(N) + optimizations.extend([ + (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)), + (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)), + ]) + # Integer sizes for s in [8, 16, 32, 64]: optimizations.extend([ @@ -652,6 +754,11 @@ optimizations.extend([ # fmin(0.0, b)) while the right one is "b", so this optimization is inexact. (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))), + # max(-min(b, a), b) -> max(abs(b), -a) + # min(-max(b, a), b) -> min(-abs(b), -a) + (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))), + (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))), + # If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) = # fsat(b-a). 
# @@ -711,6 +818,10 @@ optimizations.extend([ (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)), (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), + + # This is how SpvOpFOrdNotEqual might be implemented. If both values are + # numbers, then it can be replaced with fneu. + (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)), ]) # Float sizes @@ -1144,9 +1255,22 @@ optimizations.extend([ (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), (('ine', ('ineg', ('b2i', 'a@1')), 0), a), (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), + (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), + (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), + (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), + # With D3D booleans, imax is AND and umax is OR + (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), + ('ineg', ('b2i', ('iand', a, b)))), + (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), + ('ineg', ('b2i', ('ior', a, b)))), + (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), + ('ineg', ('b2i', ('ior', a, b)))), + (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), + ('ineg', ('b2i', ('iand', a, b)))), + # Conversions (('i2b16', ('b2i', 'a@16')), a), (('i2b32', ('b2i', 'a@32')), a), @@ -1357,6 +1481,15 @@ optimizations.extend([ (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), + + (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), + (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), + (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), + (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), + (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), + (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), + (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), + (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), ]) # After the ('extract_u8', a, 0) pattern, above, triggers, there will be @@ -1459,9 +1592,9 @@ optimizations.extend([ (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), - (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), - (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_add_sat'), - (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_add_sat'), + (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), + (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), + (('usub_sat', a, 
b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_uadd_sat'), (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), # int64_t sum = a + b; @@ -1781,10 +1914,10 @@ for bit_size in [8, 16, 32, 64]: optimizations += [ (('iadd_sat@' + str(bit_size), a, b), ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)), - ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_add_sat'), + ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'), (('isub_sat@' + str(bit_size), a, b), ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)), - ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_add_sat'), + ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'), ] invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')]) @@ -2312,6 +2445,16 @@ late_optimizations = [ (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), + # This is how SpvOpFOrdNotEqual might be implemented. Replace it with + # SpvOpLessOrGreater. + (('iand', ('fneu', a, b), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))), + (('iand', ('fneu', a, 0.0), ('feq', a, a) ), ('!flt', 0.0, ('fabs', a))), + + # This is how SpvOpFUnordEqual might be implemented. Replace it with + # !SpvOpLessOrGreater. + (('ior', ('feq', a, b), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))), + (('ior', ('feq', a, 0.0), ('fneu', a, a), ), ('inot', ('!flt', 0.0, ('fabs', a)))), + # nir_lower_to_source_mods will collapse this, but its existence during the # optimization loop can prevent other optimizations. (('fneg', ('fneg', a)), a), @@ -2466,6 +2609,11 @@ late_optimizations = [ (('ishr', a, 0), a), (('ishr', a, -32), a), (('ushr', a, 0), a), + + (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)), + (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)), + (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), + (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), ] # A few more extract cases we'd rather leave late @@ -2570,6 +2718,7 @@ late_optimizations += [ (('i2fmp', a), ('i2f16', a)), (('i2imp', a), ('u2u16', a)), (('u2fmp', a), ('u2f16', a)), + (('fisfinite', a), ('flt', ('fabs', a), float("inf"))), ] distribute_src_mods = [ diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_comparison_pre.c b/mesa 3D driver/src/compiler/nir/nir_opt_comparison_pre.c index ae35e2c8d0..19516a5061 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_comparison_pre.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_comparison_pre.c @@ -105,9 +105,7 @@ push_block(struct block_queue *bq) return NULL; } - if (!u_vector_init(&bi->instructions, - sizeof(nir_alu_instr *), - 8 * sizeof(nir_alu_instr *))) { + if (!u_vector_init_pow2(&bi->instructions, 8, sizeof(nir_alu_instr *))) { free(bi); return NULL; } @@ -139,8 +137,6 @@ static void rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp, nir_alu_instr *orig_add, bool zero_on_left) { - void *const mem_ctx = ralloc_parent(orig_cmp); - bld->cursor = nir_before_instr(&orig_cmp->instr); /* This is somewhat tricky. 
The compare instruction may be something like @@ -174,7 +170,7 @@ rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp, * will clean these up. This is similar to nir_replace_instr (in * nir_search.c). */ - nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_mov); + nir_alu_instr *mov_add = nir_alu_instr_create(bld->shader, nir_op_mov); mov_add->dest.write_mask = orig_add->dest.write_mask; nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest, orig_add->dest.dest.ssa.num_components, @@ -183,7 +179,7 @@ rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp, nir_builder_instr_insert(bld, &mov_add->instr); - nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_mov); + nir_alu_instr *mov_cmp = nir_alu_instr_create(bld->shader, nir_op_mov); mov_cmp->dest.write_mask = orig_cmp->dest.write_mask; nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest, orig_cmp->dest.dest.ssa.num_components, diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_constant_folding.c b/mesa 3D driver/src/compiler/nir/nir_opt_constant_folding.c index 5695a09e2e..b46239794c 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_constant_folding.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_constant_folding.c @@ -105,8 +105,7 @@ try_fold_alu(nir_builder *b, nir_alu_instr *alu) dest); nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, imm); nir_instr_remove(&alu->instr); - - ralloc_free(alu); + nir_instr_free(&alu->instr); return true; } diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_gcm.c b/mesa 3D driver/src/compiler/nir/nir_opt_gcm.c index 853b630b22..48f1aed2a7 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_gcm.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_gcm.c @@ -222,6 +222,74 @@ is_src_scalarizable(nir_src *src) } } +static bool +is_binding_dynamically_uniform(nir_src src) +{ + nir_binding binding = nir_chase_binding(src); + if (!binding.success) + return false; + + for (unsigned i = 0; i < binding.num_indices; i++) { + if (!nir_src_is_dynamically_uniform(binding.indices[i])) + return false; + } + + return true; +} + +static void +pin_intrinsic(nir_intrinsic_instr *intrin) +{ + nir_instr *instr = &intrin->instr; + + if (!nir_intrinsic_can_reorder(intrin)) { + instr->pass_flags = GCM_INSTR_PINNED; + return; + } + + instr->pass_flags = 0; + + /* If the intrinsic requires a uniform source, we can't safely move it across non-uniform + * control flow if it's not uniform at the point it's defined. + * Stores and atomics can never be re-ordered, so we don't have to consider them here. 
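+    *
+    * For example, a load_ubo whose binding index is not dynamically uniform
+    * at the point it is defined gets pinned below, so GCM will not hoist it
+    * across non-uniform control flow.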
+ */ + bool non_uniform = nir_intrinsic_has_access(intrin) && + (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM); + if (!non_uniform && + (intrin->intrinsic == nir_intrinsic_load_ubo || + intrin->intrinsic == nir_intrinsic_load_ssbo || + intrin->intrinsic == nir_intrinsic_get_ubo_size || + intrin->intrinsic == nir_intrinsic_get_ssbo_size || + nir_intrinsic_has_image_dim(intrin) || + ((intrin->intrinsic == nir_intrinsic_load_deref || + intrin->intrinsic == nir_intrinsic_deref_buffer_array_length) && + nir_deref_mode_may_be(nir_src_as_deref(intrin->src[0]), + nir_var_mem_ubo | nir_var_mem_ssbo)))) { + if (!is_binding_dynamically_uniform(intrin->src[0])) + instr->pass_flags = GCM_INSTR_PINNED; + } else if (intrin->intrinsic == nir_intrinsic_load_push_constant) { + if (!nir_src_is_dynamically_uniform(intrin->src[0])) + instr->pass_flags = GCM_INSTR_PINNED; + } else if (intrin->intrinsic == nir_intrinsic_load_deref && + nir_deref_mode_is(nir_src_as_deref(intrin->src[0]), + nir_var_mem_push_const)) { + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + while (deref->deref_type != nir_deref_type_var) { + if ((deref->deref_type == nir_deref_type_array || + deref->deref_type == nir_deref_type_ptr_as_array) && + !nir_src_is_dynamically_uniform(deref->arr.index)) { + instr->pass_flags = GCM_INSTR_PINNED; + return; + } + deref = nir_deref_instr_parent(deref); + if (!deref) { + instr->pass_flags = GCM_INSTR_PINNED; + return; + } + } + } +} + /* Walks the instruction list and marks immovable instructions as pinned or * placed. * @@ -265,24 +333,47 @@ gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state) } break; - case nir_instr_type_tex: - if (nir_tex_instr_has_implicit_derivative(nir_instr_as_tex(instr))) + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + if (nir_tex_instr_has_implicit_derivative(tex)) instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + nir_tex_src *src = &tex->src[i]; + switch (src->src_type) { + case nir_tex_src_texture_deref: + if (!tex->texture_non_uniform && !is_binding_dynamically_uniform(src->src)) + instr->pass_flags = GCM_INSTR_PINNED; + break; + case nir_tex_src_sampler_deref: + if (!tex->sampler_non_uniform && !is_binding_dynamically_uniform(src->src)) + instr->pass_flags = GCM_INSTR_PINNED; + break; + case nir_tex_src_texture_offset: + case nir_tex_src_texture_handle: + if (!tex->texture_non_uniform && !nir_src_is_dynamically_uniform(src->src)) + instr->pass_flags = GCM_INSTR_PINNED; + break; + case nir_tex_src_sampler_offset: + case nir_tex_src_sampler_handle: + if (!tex->sampler_non_uniform && !nir_src_is_dynamically_uniform(src->src)) + instr->pass_flags = GCM_INSTR_PINNED; + break; + default: + break; + } + } break; + } case nir_instr_type_deref: case nir_instr_type_load_const: instr->pass_flags = 0; break; - case nir_instr_type_intrinsic: { - if (nir_intrinsic_can_reorder(nir_instr_as_intrinsic(instr))) { - instr->pass_flags = 0; - } else { - instr->pass_flags = GCM_INSTR_PINNED; - } + case nir_instr_type_intrinsic: + pin_intrinsic(nir_instr_as_intrinsic(instr)); break; - } case nir_instr_type_jump: case nir_instr_type_ssa_undef: diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_if.c b/mesa 3D driver/src/compiler/nir/nir_opt_if.c index 20a72d647b..1033606997 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_if.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_if.c @@ -540,7 +540,7 @@ opt_split_alu_of_phi(nir_builder *b, nir_loop *loop) * remove it. 
*/ nir_instr_remove_v(&alu->instr); - ralloc_free(alu); + nir_instr_free(&alu->instr); progress = true; } @@ -705,7 +705,7 @@ opt_simplify_bcsel_of_phi(nir_builder *b, nir_loop *loop) * just remove it. */ nir_instr_remove_v(&bcsel->instr); - ralloc_free(bcsel); + nir_instr_free(&bcsel->instr); progress = true; } @@ -1200,6 +1200,111 @@ opt_if_evaluate_condition_use(nir_builder *b, nir_if *nif) return progress; } +static bool +rewrite_comp_uses_within_if(nir_builder *b, nir_if *nif, bool invert, + nir_ssa_scalar scalar, nir_ssa_scalar new_scalar) +{ + bool progress = false; + + nir_block *first = invert ? nir_if_first_else_block(nif) : nir_if_first_then_block(nif); + nir_block *last = invert ? nir_if_last_else_block(nif) : nir_if_last_then_block(nif); + + nir_ssa_def *new_ssa = NULL; + nir_foreach_use_safe(use, scalar.def) { + if (use->parent_instr->block->index < first->index || + use->parent_instr->block->index > last->index) + continue; + + /* Only rewrite users which use only the new component. This is to avoid a + * situation where copy propagation will undo the rewrite and we risk an infinite + * loop. + * + * We could rewrite users which use a mix of the old and new components, but if + * nir_src_components_read() is incomplete, then we risk the new component actually being + * unused and some optimization later undoing the rewrite. + */ + if (nir_src_components_read(use) != BITFIELD64_BIT(scalar.comp)) + continue; + + if (!new_ssa) { + b->cursor = nir_before_cf_node(&nif->cf_node); + new_ssa = nir_channel(b, new_scalar.def, new_scalar.comp); + if (scalar.def->num_components > 1) { + nir_ssa_def *vec = nir_ssa_undef(b, scalar.def->num_components, scalar.def->bit_size); + new_ssa = nir_vector_insert_imm(b, vec, new_ssa, scalar.comp); + } + } + + nir_instr_rewrite_src_ssa(use->parent_instr, use, new_ssa); + progress = true; + } + + return progress; +} + +/* + * This optimization turns: + * + * if (a == (b=readfirstlane(a))) + * use(a) + * if (c == (d=load_const)) + * use(c) + * + * into: + * + * if (a == (b=readfirstlane(a))) + * use(b) + * if (c == (d=load_const)) + * use(d) +*/ +static bool +opt_if_rewrite_uniform_uses(nir_builder *b, nir_if *nif, nir_ssa_scalar cond, bool accept_ine) +{ + bool progress = false; + + if (!nir_ssa_scalar_is_alu(cond)) + return false; + + nir_op op = nir_ssa_scalar_alu_op(cond); + if (op == nir_op_iand) { + progress |= opt_if_rewrite_uniform_uses(b, nif, nir_ssa_scalar_chase_alu_src(cond, 0), false); + progress |= opt_if_rewrite_uniform_uses(b, nif, nir_ssa_scalar_chase_alu_src(cond, 1), false); + return progress; + } + + if (op != nir_op_ieq && (op != nir_op_ine || !accept_ine)) + return false; + + for (unsigned i = 0; i < 2; i++) { + nir_ssa_scalar src_uni = nir_ssa_scalar_chase_alu_src(cond, i); + nir_ssa_scalar src_div = nir_ssa_scalar_chase_alu_src(cond, !i); + + if (src_uni.def->parent_instr->type == nir_instr_type_load_const && src_div.def != src_uni.def) + return rewrite_comp_uses_within_if(b, nif, op == nir_op_ine, src_div, src_uni); + + if (src_uni.def->parent_instr->type != nir_instr_type_intrinsic) + continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src_uni.def->parent_instr); + if (intrin->intrinsic != nir_intrinsic_read_first_invocation && + (intrin->intrinsic != nir_intrinsic_reduce || nir_intrinsic_cluster_size(intrin))) + continue; + + nir_ssa_scalar intrin_src = {intrin->src[0].ssa, src_uni.comp}; + nir_ssa_scalar resolved_intrin_src = nir_ssa_scalar_resolved(intrin_src.def, intrin_src.comp); + + if 
(resolved_intrin_src.comp != src_div.comp || resolved_intrin_src.def != src_div.def) + continue; + + progress |= rewrite_comp_uses_within_if(b, nif, op == nir_op_ine, resolved_intrin_src, src_uni); + if (intrin_src.comp != resolved_intrin_src.comp || intrin_src.def != resolved_intrin_src.def) + progress |= rewrite_comp_uses_within_if(b, nif, op == nir_op_ine, intrin_src, src_uni); + + return progress; + } + + return false; +} + static void simple_merge_if(nir_if *dest_if, nir_if *src_if, bool dest_if_then, bool src_if_then) @@ -1387,6 +1492,8 @@ opt_if_safe_cf_list(nir_builder *b, struct exec_list *cf_list) progress |= opt_if_safe_cf_list(b, &nif->then_list); progress |= opt_if_safe_cf_list(b, &nif->else_list); progress |= opt_if_evaluate_condition_use(b, nif); + nir_ssa_scalar cond = nir_ssa_scalar_resolved(nif->condition.ssa, 0); + progress |= opt_if_rewrite_uniform_uses(b, nif, cond, true); break; } diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_loop_unroll.c b/mesa 3D driver/src/compiler/nir/nir_opt_loop_unroll.c index 3dec5e15e5..e81cebbb33 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_loop_unroll.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_loop_unroll.c @@ -976,7 +976,27 @@ process_loops(nir_shader *sh, nir_cf_node *cf_node, bool *has_nested_loop_out, } } - if (has_nested_loop || !loop->info->limiting_terminator) + /* Intentionally don't consider exact_trip_count_known here. When + * max_trip_count is non-zero, it is the upper bound on the number of + * times the loop will iterate, but the loop may iterate fewer times. For + * example, the following loop will iterate 0 or 1 time: + * + * for (i = 0; i < min(x, 1); i++) { ... } + * + * Trivial single-iteration loops (e.g., do { ... } while (false)) and + * trivial zero-iteration loops (e.g., while (false) { ... }) will have + * already been handled. + * + * If the loop is known to execute at most once and meets the other + * unrolling criteria, unroll it even if it has nested loops. + * + * It is unlikely that such loops exist in real shaders. GraphicsFuzz is + * known to generate spurious loops that iterate exactly once. It is + * plausible that it could eventually start generating loops like the + * example above, so it seems logical to defend against it now.
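+    *
+    * For example, an outer loop such as
+    *
+    *    for (i = 0; i < min(x, 1); i++) {
+    *       for (j = 0; j < 4; j++) { ... }
+    *    }
+    *
+    * has max_trip_count == 1, so the check below no longer rejects it merely
+    * because it contains a nested loop.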
+ */ + if (!loop->info->limiting_terminator || + (loop->info->max_trip_count != 1 && has_nested_loop)) goto exit; if (!check_unrolling_restrictions(sh, loop)) diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_memcpy.c b/mesa 3D driver/src/compiler/nir/nir_opt_memcpy.c index fe2a3039be..be3f19de1f 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_memcpy.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_memcpy.c @@ -152,7 +152,7 @@ try_lower_memcpy(nir_builder *b, nir_intrinsic_instr *cpy) nir_load_deref_with_access(b, src, nir_intrinsic_src_access(cpy)); data = nir_bitcast_vector(b, data, glsl_get_bit_size(dst->type)); assert(data->num_components == glsl_get_vector_elements(dst->type)); - nir_store_deref_with_access(b, src, data, ~0 /* write mask */, + nir_store_deref_with_access(b, dst, data, ~0 /* write mask */, nir_intrinsic_dst_access(cpy)); return true; } diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_peephole_select.c b/mesa 3D driver/src/compiler/nir/nir_opt_peephole_select.c index 5eeb5f66b9..9d7e4e4b3a 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_peephole_select.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_peephole_select.c @@ -104,6 +104,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, switch (deref->modes) { case nir_var_shader_in: case nir_var_uniform: + case nir_var_image: /* Don't try to remove flow control around an indirect load * because that flow control may be trying to avoid invalid * loads. @@ -381,6 +382,17 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, if (prev_node->type != nir_cf_node_if) return false; + nir_block *prev_block = nir_cf_node_as_block(nir_cf_node_prev(prev_node)); + + /* If the last instruction before this if/else block is a jump, we can't + * append stuff after it because it would break a bunch of assumptions about + * control flow (nir_validate expects the successor of a return/halt jump + * to be the end of the function, which might not match the successor of + * the if/else blocks). + */ + if (nir_block_ends_in_return_or_halt(prev_block)) + return false; + nir_if *if_stmt = nir_cf_node_as_if(prev_node); /* first, try to collapse the if */ @@ -422,8 +434,6 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, * selects. */ - nir_block *prev_block = nir_cf_node_as_block(nir_cf_node_prev(prev_node)); - /* First, we move the remaining instructions from the blocks to the * block before. We have already guaranteed that this is safe by * calling block_check_for_allowed_instrs() @@ -446,7 +456,7 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, nir_phi_instr *phi = nir_instr_as_phi(instr); nir_alu_instr *sel = nir_alu_instr_create(shader, nir_op_bcsel); - nir_src_copy(&sel->src[0].src, &if_stmt->condition, sel); + nir_src_copy(&sel->src[0].src, &if_stmt->condition); /* Splat the condition to all channels */ memset(sel->src[0].swizzle, 0, sizeof sel->src[0].swizzle); @@ -456,7 +466,7 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, assert(src->src.is_ssa); unsigned idx = src->pred == then_block ?
1 : 2; - nir_src_copy(&sel->src[idx].src, &src->src, sel); + nir_src_copy(&sel->src[idx].src, &src->src); } nir_ssa_dest_init(&sel->instr, &sel->dest.dest, diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_undef.c b/mesa 3D driver/src/compiler/nir/nir_opt_undef.c index 20b31a4ada..5b19393c8a 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_undef.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_undef.c @@ -56,8 +56,7 @@ opt_undef_csel(nir_alu_instr *instr) */ nir_instr_rewrite_src(&instr->instr, &instr->src[0].src, instr->src[i == 1 ? 2 : 1].src); - nir_alu_src_copy(&instr->src[0], &instr->src[i == 1 ? 2 : 1], - ralloc_parent(instr)); + nir_alu_src_copy(&instr->src[0], &instr->src[i == 1 ? 2 : 1]); nir_src empty_src; memset(&empty_src, 0, sizeof(empty_src)); @@ -137,6 +136,7 @@ opt_undef_store(nir_intrinsic_instr *intrin) break; case nir_intrinsic_store_output: case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_primitive_output: case nir_intrinsic_store_ssbo: case nir_intrinsic_store_shared: case nir_intrinsic_store_global: diff --git a/mesa 3D driver/src/compiler/nir/nir_opt_vectorize.c b/mesa 3D driver/src/compiler/nir/nir_opt_vectorize.c index 2f4acafa3c..83c841ee63 100644 --- a/mesa 3D driver/src/compiler/nir/nir_opt_vectorize.c +++ b/mesa 3D driver/src/compiler/nir/nir_opt_vectorize.c @@ -420,9 +420,12 @@ nir_opt_vectorize_impl(struct nir_shader *nir, nir_function_impl *impl, bool progress = vectorize_block(nir, nir_start_block(impl), instr_set, filter, data); - if (progress) + if (progress) { nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); + } else { + nir_metadata_preserve(impl, nir_metadata_all); + } vec_instr_set_destroy(instr_set); return progress; diff --git a/mesa 3D driver/src/compiler/nir/nir_print.c b/mesa 3D driver/src/compiler/nir/nir_print.c index b42a21b09b..56ce1f9be4 100644 --- a/mesa 3D driver/src/compiler/nir/nir_print.c +++ b/mesa 3D driver/src/compiler/nir/nir_print.c @@ -464,6 +464,8 @@ get_variable_mode_str(nir_variable_mode mode, bool want_local_global_mode) return "push_const"; case nir_var_mem_constant: return "constant"; + case nir_var_image: + return "image"; case nir_var_shader_temp: return want_local_global_mode ? "shader_temp" : ""; case nir_var_function_temp: @@ -484,13 +486,15 @@ print_var_decl(nir_variable *var, print_state *state) fprintf(fp, "decl_var "); + const char *const bindless = (var->data.bindless) ? "bindless " : ""; const char *const cent = (var->data.centroid) ? "centroid " : ""; const char *const samp = (var->data.sample) ? "sample " : ""; const char *const patch = (var->data.patch) ? "patch " : ""; const char *const inv = (var->data.invariant) ? "invariant " : ""; const char *const per_view = (var->data.per_view) ? "per_view " : ""; - fprintf(fp, "%s%s%s%s%s%s %s ", - cent, samp, patch, inv, per_view, + const char *const per_primitive = (var->data.per_primitive) ? 
"per_primitive " : ""; + fprintf(fp, "%s%s%s%s%s%s%s%s %s ", + bindless, cent, samp, patch, inv, per_view, per_primitive, get_variable_mode_str(var->data.mode, false), glsl_interp_mode_name(var->data.interpolation)); @@ -520,11 +524,12 @@ print_var_decl(nir_variable *var, print_state *state) fprintf(fp, "%s %s", glsl_get_type_name(var->type), get_var_name(var, state)); - if (var->data.mode == nir_var_shader_in || - var->data.mode == nir_var_shader_out || - var->data.mode == nir_var_uniform || - var->data.mode == nir_var_mem_ubo || - var->data.mode == nir_var_mem_ssbo) { + if (var->data.mode & (nir_var_shader_in | + nir_var_shader_out | + nir_var_uniform | + nir_var_mem_ubo | + nir_var_mem_ssbo | + nir_var_image)) { const char *loc = NULL; char buf[4]; @@ -1102,11 +1107,11 @@ print_tex_instr(nir_tex_instr *instr, print_state *state) case nir_texop_tex_prefetch: fprintf(fp, "tex (pre-dispatchable) "); break; - case nir_texop_fragment_fetch: - fprintf(fp, "fragment_fetch "); + case nir_texop_fragment_fetch_amd: + fprintf(fp, "fragment_fetch_amd "); break; - case nir_texop_fragment_mask_fetch: - fprintf(fp, "fragment_mask_fetch "); + case nir_texop_fragment_mask_fetch_amd: + fprintf(fp, "fragment_mask_fetch_amd "); break; default: unreachable("Invalid texture operation"); @@ -1263,7 +1268,7 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state) switch (instr->def.bit_size) { case 64: - fprintf(fp, "0x%16" PRIx64 " /* %f */", instr->value[i].u64, + fprintf(fp, "0x%016" PRIx64 " /* %f */", instr->value[i].u64, instr->value[i].f64); break; case 32: diff --git a/mesa 3D driver/src/compiler/nir/nir_propagate_invariant.c b/mesa 3D driver/src/compiler/nir/nir_propagate_invariant.c index c5664bf701..8cffce0ac4 100644 --- a/mesa 3D driver/src/compiler/nir/nir_propagate_invariant.c +++ b/mesa 3D driver/src/compiler/nir/nir_propagate_invariant.c @@ -183,6 +183,8 @@ propagate_invariant_impl(nir_function_impl *impl, struct set *invariants) nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance | nir_metadata_live_ssa_defs); + } else { + nir_metadata_preserve(impl, nir_metadata_all); } return progress; diff --git a/mesa 3D driver/src/compiler/nir/nir_range_analysis.c b/mesa 3D driver/src/compiler/nir/nir_range_analysis.c index 4e37881526..a50e43651b 100644 --- a/mesa 3D driver/src/compiler/nir/nir_range_analysis.c +++ b/mesa 3D driver/src/compiler/nir/nir_range_analysis.c @@ -1292,7 +1292,15 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht, nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(scalar.def->parent_instr); switch (intrin->intrinsic) { case nir_intrinsic_load_local_invocation_index: - if (shader->info.workgroup_size_variable) { + /* The local invocation index is used under the hood by RADV for + * some non-compute-like shaders (eg. LS and NGG). These technically + * run in workgroups on the HW, even though this fact is not exposed + * by the API. + * They can safely use the same code path here as variable sized + * compute-like shader stages. 
+ */ + if (!gl_shader_stage_uses_workgroup(shader->info.stage) || + shader->info.workgroup_size_variable) { res = config->max_workgroup_invocations - 1; } else { res = (shader->info.workgroup_size[0] * @@ -1459,6 +1467,10 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht, case nir_op_bfm: case nir_op_f2u32: case nir_op_fmul: + case nir_op_extract_u8: + case nir_op_extract_i8: + case nir_op_extract_u16: + case nir_op_extract_i16: break; case nir_op_u2u1: case nir_op_u2u8: @@ -1591,6 +1603,18 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht, case nir_op_sad_u8x4: res = src2 + 4 * 255; break; + case nir_op_extract_u8: + res = MIN2(src0, UINT8_MAX); + break; + case nir_op_extract_i8: + res = (src0 >= 0x80) ? max : MIN2(src0, INT8_MAX); + break; + case nir_op_extract_u16: + res = MIN2(src0, UINT16_MAX); + break; + case nir_op_extract_i16: + res = (src0 >= 0x8000) ? max : MIN2(src0, INT16_MAX); + break; default: res = max; break; diff --git a/mesa 3D driver/src/compiler/nir/nir_schedule.c b/mesa 3D driver/src/compiler/nir/nir_schedule.c index 083c3b134f..d13623217d 100644 --- a/mesa 3D driver/src/compiler/nir/nir_schedule.c +++ b/mesa 3D driver/src/compiler/nir/nir_schedule.c @@ -409,6 +409,18 @@ nir_schedule_intrinsic_deps(nir_deps_state *state, add_write_dep(state, &state->unknown_intrinsic, n); break; + case nir_intrinsic_scoped_barrier: { + const nir_variable_mode modes = nir_intrinsic_memory_modes(instr); + + if (modes & nir_var_mem_shared) + add_write_dep(state, &state->store_shared, n); + + /* Serialize against other categories. */ + add_write_dep(state, &state->unknown_intrinsic, n); + + break; + } + default: /* Attempt to handle other intrinsics that we haven't individually * categorized by serializing them in the same order relative to each diff --git a/mesa 3D driver/src/compiler/nir/nir_search.c b/mesa 3D driver/src/compiler/nir/nir_search.c index 437a24b9b0..a968c8a372 100644 --- a/mesa 3D driver/src/compiler/nir/nir_search.c +++ b/mesa 3D driver/src/compiler/nir/nir_search.c @@ -524,8 +524,7 @@ construct_value(nir_builder *build, assert(state->variables_seen & (1 << var->variable)); nir_alu_src val = { NIR_SRC_INIT }; - nir_alu_src_copy(&val, &state->variables[var->variable], - (void *)build->shader); + nir_alu_src_copy(&val, &state->variables[var->variable]); assert(!var->is_constant); for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) diff --git a/mesa 3D driver/src/compiler/nir/nir_search_helpers.h b/mesa 3D driver/src/compiler/nir/nir_search_helpers.h index 2493848437..1188b50ed2 100644 --- a/mesa 3D driver/src/compiler/nir/nir_search_helpers.h +++ b/mesa 3D driver/src/compiler/nir/nir_search_helpers.h @@ -205,6 +205,27 @@ is_not_const_zero(UNUSED struct hash_table *ht, const nir_alu_instr *instr, return true; } +/** Is value unsigned less than 0xfffc07fc? 
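+ *
+ * The constant is not arbitrary: the dot-product part of udot_4x8_uadd is
+ * at most 4 * (255 * 255) = 0x3f804, and 0x100000000 - 0x3f804 =
+ * 0xfffc07fc, so adding a constant below that bound can never wrap past
+ * UINT32_MAX.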
*/ +static inline bool +is_ult_0xfffc07fc(UNUSED struct hash_table *ht, const nir_alu_instr *instr, + unsigned src, unsigned num_components, + const uint8_t *swizzle) +{ + /* only constant srcs: */ + if (!nir_src_is_const(instr->src[src].src)) + return false; + + for (unsigned i = 0; i < num_components; i++) { + const unsigned val = + nir_src_comp_as_uint(instr->src[src].src, swizzle[i]); + + if (val >= 0xfffc07fcU) + return false; + } + + return true; +} + static inline bool is_not_const(UNUSED struct hash_table *ht, const nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components, diff --git a/mesa 3D driver/src/compiler/nir/nir_serialize.c b/mesa 3D driver/src/compiler/nir/nir_serialize.c index 78fa2ee122..93de4147bf 100644 --- a/mesa 3D driver/src/compiler/nir/nir_serialize.c +++ b/mesa 3D driver/src/compiler/nir/nir_serialize.c @@ -550,7 +550,7 @@ read_src(read_ctx *ctx, nir_src *src, void *mem_ctx) src->reg.reg = read_lookup_object(ctx, header.any.object_idx); src->reg.base_offset = blob_read_uint32(ctx->blob); if (header.any.is_indirect) { - src->reg.indirect = ralloc(mem_ctx, nir_src); + src->reg.indirect = malloc(sizeof(nir_src)); read_src(ctx, src->reg.indirect, mem_ctx); } else { src->reg.indirect = NULL; @@ -575,14 +575,12 @@ union packed_dest { }; enum intrinsic_const_indices_encoding { - /* Use the 9 bits of packed_const_indices to store 1-9 indices. - * 1 9-bit index, or 2 4-bit indices, or 3 3-bit indices, or - * 4 2-bit indices, or 5-9 1-bit indices. + /* Use packed_const_indices to store tightly packed indices. * * The common case for load_ubo is 0, 0, 0, which is trivially represented. * The common cases for load_interpolated_input also fit here, e.g.: 7, 3 */ - const_indices_9bit_all_combined, + const_indices_all_combined, const_indices_8bit, /* 8 bits per element */ const_indices_16bit, /* 16 bits per element */ @@ -627,9 +625,9 @@ union packed_instr { unsigned instr_type:4; unsigned deref_type:3; unsigned cast_type_same_as_last:1; - unsigned modes:14; /* deref_var redefines this */ + unsigned modes:5; /* See (de|en)code_deref_modes() */ + unsigned _pad:10; unsigned packed_src_ssa_16bit:1; /* deref_var redefines this */ - unsigned _pad:1; /* deref_var redefines this */ unsigned dest:8; } deref; struct { @@ -641,9 +639,9 @@ union packed_instr { } deref_var; struct { unsigned instr_type:4; - unsigned intrinsic:9; + unsigned intrinsic:10; unsigned const_indices_encoding:2; - unsigned packed_const_indices:9; + unsigned packed_const_indices:8; unsigned dest:8; } intrinsic; struct { @@ -770,7 +768,7 @@ read_dest(read_ctx *ctx, nir_dest *dst, nir_instr *instr, dst->reg.reg = read_object(ctx); dst->reg.base_offset = blob_read_uint32(ctx->blob); if (dest.reg.is_indirect) { - dst->reg.indirect = ralloc(instr, nir_src); + dst->reg.indirect = malloc(sizeof(nir_src)); read_src(ctx, dst->reg.indirect, instr); } } @@ -966,11 +964,51 @@ read_alu(read_ctx *ctx, union packed_instr header) return alu; } +#define MODE_ENC_GENERIC_BIT (1 << 4) + +static nir_variable_mode +decode_deref_modes(unsigned modes) +{ + if (modes & MODE_ENC_GENERIC_BIT) { + modes &= ~MODE_ENC_GENERIC_BIT; + return modes << (ffs(nir_var_mem_generic) - 1); + } else { + return 1 << modes; + } +} + +static unsigned +encode_deref_modes(nir_variable_mode modes) +{ + /* Mode sets on derefs generally come in two forms. For certain OpenCL + * cases, we can have more than one of the generic modes set. In this + * case, we need the full bitfield. Fortunately, there are only 4 of + * these. 
For all other modes, we can only have one mode at a time so we + * can compress them by only storing the bit position. This, plus one bit + * to select encoding, lets us pack the entire bitfield in 5 bits. + */ + STATIC_ASSERT((nir_var_all & ~nir_var_mem_generic) < + (1 << MODE_ENC_GENERIC_BIT)); + + unsigned enc; + if (modes == 0 || (modes & nir_var_mem_generic)) { + assert(!(modes & ~nir_var_mem_generic)); + enc = modes >> (ffs(nir_var_mem_generic) - 1); + assert(enc < MODE_ENC_GENERIC_BIT); + enc |= MODE_ENC_GENERIC_BIT; + } else { + assert(util_is_power_of_two_nonzero(modes)); + enc = ffs(modes) - 1; + assert(enc < MODE_ENC_GENERIC_BIT); + } + assert(modes == decode_deref_modes(enc)); + return enc; +} + static void write_deref(write_ctx *ctx, const nir_deref_instr *deref) { assert(deref->deref_type < 8); - assert(deref->modes < (1 << 14)); union packed_instr header; header.u32 = 0; @@ -979,7 +1017,7 @@ write_deref(write_ctx *ctx, const nir_deref_instr *deref) header.deref.deref_type = deref->deref_type; if (deref->deref_type == nir_deref_type_cast) { - header.deref.modes = deref->modes; + header.deref.modes = encode_deref_modes(deref->modes); header.deref.cast_type_same_as_last = deref->type == ctx->last_type; } @@ -1115,7 +1153,7 @@ read_deref(read_ctx *ctx, union packed_instr header) if (deref_type == nir_deref_type_var) { deref->modes = deref->var->data.mode; } else if (deref->deref_type == nir_deref_type_cast) { - deref->modes = header.deref.modes; + deref->modes = decode_deref_modes(header.deref.modes); } else { assert(deref->parent.is_ssa); deref->modes = nir_instr_as_deref(deref->parent.ssa->parent_instr)->modes; @@ -1127,11 +1165,11 @@ read_deref(read_ctx *ctx, union packed_instr header) static void write_intrinsic(write_ctx *ctx, const nir_intrinsic_instr *intrin) { - /* 9 bits for nir_intrinsic_op */ - STATIC_ASSERT(nir_num_intrinsics <= 512); + /* 10 bits for nir_intrinsic_op */ + STATIC_ASSERT(nir_num_intrinsics <= 1024); unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs; unsigned num_indices = nir_intrinsic_infos[intrin->intrinsic].num_indices; - assert(intrin->intrinsic < 512); + assert(intrin->intrinsic < 1024); union packed_instr header; header.u32 = 0; @@ -1147,11 +1185,11 @@ write_intrinsic(write_ctx *ctx, const nir_intrinsic_instr *intrin) max_bits = MAX2(max_bits, max); } - if (max_bits * num_indices <= 9) { - header.intrinsic.const_indices_encoding = const_indices_9bit_all_combined; + if (max_bits * num_indices <= 8) { + header.intrinsic.const_indices_encoding = const_indices_all_combined; - /* Pack all const indices into 6 bits. */ - unsigned bit_size = 9 / num_indices; + /* Pack all const indices into 8 bits. 
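+       * For example, with two indices each one gets 8 / 2 = 4 bits, so the
+       * common load_interpolated_input pair {7, 3} is stored as 7 | (3 << 4).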
*/ + unsigned bit_size = 8 / num_indices; for (unsigned i = 0; i < num_indices; i++) { header.intrinsic.packed_const_indices |= intrin->const_index[i] << (i * bit_size); @@ -1222,8 +1260,8 @@ read_intrinsic(read_ctx *ctx, union packed_instr header) if (num_indices) { switch (header.intrinsic.const_indices_encoding) { - case const_indices_9bit_all_combined: { - unsigned bit_size = 9 / num_indices; + case const_indices_all_combined: { + unsigned bit_size = 8 / num_indices; unsigned bit_mask = u_bit_consecutive(0, bit_size); for (unsigned i = 0; i < num_indices; i++) { intrin->const_index[i] = @@ -1810,6 +1848,7 @@ static void write_if(write_ctx *ctx, nir_if *nif) { write_src(ctx, &nif->condition); + blob_write_uint8(ctx->blob, nif->control); write_cf_list(ctx, &nif->then_list); write_cf_list(ctx, &nif->else_list); @@ -1821,6 +1860,7 @@ read_if(read_ctx *ctx, struct exec_list *cf_list) nir_if *nif = nir_if_create(ctx->nir); read_src(ctx, &nif->condition, nif); + nif->control = blob_read_uint8(ctx->blob); nir_cf_node_insert_end(cf_list, &nif->cf_node); @@ -1831,6 +1871,7 @@ read_if(read_ctx *ctx, struct exec_list *cf_list) static void write_loop(write_ctx *ctx, nir_loop *loop) { + blob_write_uint8(ctx->blob, loop->control); write_cf_list(ctx, &loop->body); } @@ -1841,6 +1882,7 @@ read_loop(read_ctx *ctx, struct exec_list *cf_list) nir_cf_node_insert_end(cf_list, &loop->cf_node); + loop->control = blob_read_uint8(ctx->blob); read_cf_list(ctx, &loop->body); } diff --git a/mesa 3D driver/src/compiler/nir/nir_split_vars.c b/mesa 3D driver/src/compiler/nir/nir_split_vars.c index 3b8482f720..fbe44287c7 100644 --- a/mesa 3D driver/src/compiler/nir/nir_split_vars.c +++ b/mesa 3D driver/src/compiler/nir/nir_split_vars.c @@ -78,19 +78,6 @@ struct field { nir_variable *var; }; -static const struct glsl_type * -wrap_type_in_array(const struct glsl_type *type, - const struct glsl_type *array_type) -{ - if (!glsl_type_is_array(array_type)) - return type; - - const struct glsl_type *elem_type = - wrap_type_in_array(type, glsl_get_array_element(array_type)); - assert(glsl_get_explicit_stride(array_type) == 0); - return glsl_array_type(elem_type, glsl_get_length(array_type), 0); -} - static int num_array_levels_in_array_of_vector_type(const struct glsl_type *type) { @@ -141,7 +128,7 @@ init_field_for_type(struct field *field, struct field *parent, } else { const struct glsl_type *var_type = type; for (struct field *f = field->parent; f; f = f->parent) - var_type = wrap_type_in_array(var_type, f->type); + var_type = glsl_type_wrap_in_arrays(var_type, f->type); nir_variable_mode mode = state->base_var->data.mode; if (mode == nir_var_function_temp) { diff --git a/mesa 3D driver/src/compiler/nir/nir_sweep.c b/mesa 3D driver/src/compiler/nir/nir_sweep.c index e2b70f5f76..0c5c71ad97 100644 --- a/mesa 3D driver/src/compiler/nir/nir_sweep.c +++ b/mesa 3D driver/src/compiler/nir/nir_sweep.c @@ -40,24 +40,6 @@ static void sweep_cf_node(nir_shader *nir, nir_cf_node *cf_node); -static bool -sweep_src_indirect(nir_src *src, void *nir) -{ - if (!src->is_ssa && src->reg.indirect) - ralloc_steal(nir, src->reg.indirect); - - return true; -} - -static bool -sweep_dest_indirect(nir_dest *dest, void *nir) -{ - if (!dest->is_ssa && dest->reg.indirect) - ralloc_steal(nir, dest->reg.indirect); - - return true; -} - static void sweep_block(nir_shader *nir, nir_block *block) { @@ -73,10 +55,8 @@ sweep_block(nir_shader *nir, nir_block *block) block->live_out = NULL; nir_foreach_instr(instr, block) { - ralloc_steal(nir, instr); - - 
nir_foreach_src(instr, sweep_src_indirect, nir); - nir_foreach_dest(instr, sweep_dest_indirect, nir); + list_del(&instr->gc_node); + list_add(&instr->gc_node, &nir->gc_list); } } @@ -155,6 +135,12 @@ nir_sweep(nir_shader *nir) { void *rubbish = ralloc_context(NULL); + struct list_head instr_gc_list; + list_inithead(&instr_gc_list); + + list_replace(&nir->gc_list, &instr_gc_list); + list_inithead(&nir->gc_list); + /* First, move ownership of all the memory to a temporary context; assume dead. */ ralloc_adopt(rubbish, nir); @@ -170,6 +156,12 @@ nir_sweep(nir_shader *nir) sweep_function(nir, func); } + /* Sweep instrs not found while walking the shader. */ + list_for_each_entry_safe(nir_instr, instr, &instr_gc_list, gc_node) { + nir_instr_free(instr); + } + assert(list_is_empty(&instr_gc_list)); + ralloc_steal(nir, nir->constant_data); /* Free everything we didn't steal back. */ diff --git a/mesa 3D driver/src/compiler/nir/nir_validate.c b/mesa 3D driver/src/compiler/nir/nir_validate.c index 648e5af7a1..1139a29a9a 100644 --- a/mesa 3D driver/src/compiler/nir/nir_validate.c +++ b/mesa 3D driver/src/compiler/nir/nir_validate.c @@ -100,6 +100,8 @@ typedef struct { /* map of instruction/var/etc to failed assert string */ struct hash_table *errors; + + struct set *shader_gc_list; } validate_state; static void @@ -676,6 +678,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) case nir_intrinsic_load_interpolated_input: case nir_intrinsic_load_output: case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_primitive_output: case nir_intrinsic_load_push_constant: /* All memory load operations must load at least a byte */ validate_assert(state, nir_dest_bit_size(instr->dest) >= 8); @@ -886,6 +889,7 @@ validate_tex_instr(nir_tex_instr *instr, validate_state *state) break; validate_assert(state, glsl_type_is_image(deref->type) || + glsl_type_is_texture(deref->type) || glsl_type_is_sampler(deref->type)); break; } @@ -1072,6 +1076,8 @@ validate_instr(nir_instr *instr, validate_state *state) state->instr = instr; + validate_assert(state, _mesa_set_search(state->shader_gc_list, instr)); + switch (instr->type) { case nir_instr_type_alu: validate_alu_instr(nir_instr_as_alu(instr), state); @@ -1513,6 +1519,11 @@ validate_var_decl(nir_variable *var, nir_variable_mode valid_modes, if (var->constant_initializer) validate_constant(var->constant_initializer, var->type, state); + if (var->data.mode == nir_var_image) { + validate_assert(state, !var->data.bindless); + validate_assert(state, glsl_type_is_image(glsl_without_array(var->type))); + } + /* * TODO validate some things ir_validate.cpp does (requires more GLSL type * support) @@ -1666,6 +1677,7 @@ init_validate_state(validate_state *state) state->blocks = _mesa_pointer_set_create(state->mem_ctx); state->var_defs = _mesa_pointer_hash_table_create(state->mem_ctx); state->errors = _mesa_pointer_hash_table_create(state->mem_ctx); + state->shader_gc_list = _mesa_pointer_set_create(state->mem_ctx); state->loop = NULL; state->instr = NULL; @@ -1725,6 +1737,10 @@ nir_validate_shader(nir_shader *shader, const char *when) validate_state state; init_validate_state(&state); + list_for_each_entry(nir_instr, instr, &shader->gc_list, gc_node) { + _mesa_set_add(state.shader_gc_list, instr); + } + state.shader = shader; nir_variable_mode valid_modes = @@ -1737,7 +1753,8 @@ nir_validate_shader(nir_shader *shader, const char *when) nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_push_const | - nir_var_mem_constant; + 
nir_var_mem_constant | + nir_var_image; if (gl_shader_stage_is_callable(shader->info.stage)) valid_modes |= nir_var_shader_call_data; diff --git a/mesa 3D driver/src/compiler/nir/nir_worklist.h b/mesa 3D driver/src/compiler/nir/nir_worklist.h index 0f402e080f..ad45b064d5 100644 --- a/mesa 3D driver/src/compiler/nir/nir_worklist.h +++ b/mesa 3D driver/src/compiler/nir/nir_worklist.h @@ -108,8 +108,7 @@ nir_instr_worklist_create() { if (!wl) return NULL; - if (!u_vector_init(&wl->instr_vec, sizeof(struct nir_instr *), - sizeof(struct nir_instr *) * 8)) { + if (!u_vector_init_pow2(&wl->instr_vec, 8, sizeof(struct nir_instr *))) { free(wl); return NULL; } diff --git a/mesa 3D driver/src/compiler/nir/tests/core_tests.cpp b/mesa 3D driver/src/compiler/nir/tests/core_tests.cpp index 66bcca84d9..d243396d68 100644 --- a/mesa 3D driver/src/compiler/nir/tests/core_tests.cpp +++ b/mesa 3D driver/src/compiler/nir/tests/core_tests.cpp @@ -123,4 +123,23 @@ TEST_F(nir_core_test, nir_instr_free_and_dce_all_test) nir_validate_shader(b->shader, "after remove_and_dce"); } +TEST_F(nir_core_test, nir_instr_free_and_dce_multiple_src_test) +{ + nir_ssa_def *one = nir_imm_int(b, 1); + nir_ssa_def *add = nir_iadd(b, one, one); + + /* This risks triggering removing add multiple times, which can segfault in + * nir_instr_remove for instructions with srcs. */ + nir_ssa_def *add2 = nir_iadd(b, add, add); + + nir_cursor c = nir_instr_free_and_dce(add2->parent_instr); + ASSERT_FALSE(shader_contains_def(add2)); + ASSERT_FALSE(shader_contains_def(add)); + ASSERT_FALSE(shader_contains_def(one)); + + ASSERT_TRUE(nir_cursors_equal(c, nir_before_block(nir_start_block(b->impl)))); + + nir_validate_shader(b->shader, "after remove_and_dce"); +} + } diff --git a/mesa 3D driver/src/compiler/nir/tests/negative_equal_tests.cpp b/mesa 3D driver/src/compiler/nir/tests/negative_equal_tests.cpp index ff9eeb27f4..17d341a73d 100644 --- a/mesa 3D driver/src/compiler/nir/tests/negative_equal_tests.cpp +++ b/mesa 3D driver/src/compiler/nir/tests/negative_equal_tests.cpp @@ -306,7 +306,8 @@ TEST_F(alu_srcs_negative_equal_test, unused_components_mismatch) nir_alu_instr *instr = nir_instr_as_alu(result->parent_instr); /* Disable the channels that aren't negations of each other. 
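    * (A write_mask of 8 + 1 == 0b1001 enables only channels x and w.)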
*/ - instr->dest.dest.is_ssa = false; + nir_register *reg = nir_local_reg_create(bld.impl); + nir_instr_rewrite_dest(&instr->instr, &instr->dest.dest, nir_dest_for_reg(reg)); instr->dest.write_mask = 8 + 1; EXPECT_TRUE(nir_alu_srcs_negative_equal(instr, instr, 0, 1)); diff --git a/mesa 3D driver/src/compiler/nir_types.cpp b/mesa 3D driver/src/compiler/nir_types.cpp index 448e962b39..b069e92c40 100644 --- a/mesa 3D driver/src/compiler/nir_types.cpp +++ b/mesa 3D driver/src/compiler/nir_types.cpp @@ -110,6 +110,24 @@ glsl_get_function_param(const glsl_type *type, unsigned index) return &type->fields.parameters[index + 1]; } +const glsl_type * +glsl_texture_type_to_sampler(const glsl_type *type, bool is_shadow) +{ + assert(glsl_type_is_texture(type)); + return glsl_sampler_type((glsl_sampler_dim)type->sampler_dimensionality, + is_shadow, type->sampler_array, + (glsl_base_type)type->sampled_type); +} + +const glsl_type * +glsl_sampler_type_to_texture(const glsl_type *type) +{ + assert(glsl_type_is_sampler(type) && !glsl_type_is_bare_sampler(type)); + return glsl_texture_type((glsl_sampler_dim)type->sampler_dimensionality, + type->sampler_array, + (glsl_base_type)type->sampled_type); +} + const struct glsl_type * glsl_get_column_type(const struct glsl_type *type) { @@ -199,14 +217,18 @@ glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index) glsl_sampler_dim glsl_get_sampler_dim(const struct glsl_type *type) { - assert(glsl_type_is_sampler(type) || glsl_type_is_image(type)); + assert(glsl_type_is_sampler(type) || + glsl_type_is_texture(type) || + glsl_type_is_image(type)); return (glsl_sampler_dim)type->sampler_dimensionality; } glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type) { - assert(glsl_type_is_sampler(type) || glsl_type_is_image(type)); + assert(glsl_type_is_sampler(type) || + glsl_type_is_texture(type) || + glsl_type_is_image(type)); return (glsl_base_type)type->sampled_type; } @@ -220,7 +242,9 @@ glsl_get_sampler_target(const struct glsl_type *type) int glsl_get_sampler_coordinate_components(const struct glsl_type *type) { - assert(glsl_type_is_sampler(type) || glsl_type_is_image(type)); + assert(glsl_type_is_sampler(type) || + glsl_type_is_texture(type) || + glsl_type_is_image(type)); return type->coordinate_components(); } @@ -340,6 +364,18 @@ glsl_type_is_sampler(const struct glsl_type *type) return type->is_sampler(); } +bool +glsl_type_is_bare_sampler(const struct glsl_type *type) +{ + return type->is_sampler() && type->sampled_type == GLSL_TYPE_VOID; +} + +bool +glsl_type_is_texture(const struct glsl_type *type) +{ + return type->is_texture(); +} + bool glsl_type_is_image(const struct glsl_type *type) { @@ -356,7 +392,9 @@ glsl_sampler_type_is_shadow(const struct glsl_type *type) bool glsl_sampler_type_is_array(const struct glsl_type *type) { - assert(glsl_type_is_sampler(type) || glsl_type_is_image(type)); + assert(glsl_type_is_sampler(type) || + glsl_type_is_texture(type) || + glsl_type_is_image(type)); return type->sampler_array; } @@ -396,6 +434,12 @@ glsl_type_contains_64bit(const struct glsl_type *type) return type->contains_64bit(); } +bool +glsl_type_contains_image(const struct glsl_type *type) +{ + return type->contains_image(); +} + const glsl_type * glsl_void_type(void) { @@ -636,6 +680,13 @@ glsl_bare_shadow_sampler_type() return glsl_type::samplerShadow_type; } +const struct glsl_type * +glsl_texture_type(enum glsl_sampler_dim dim, bool is_array, + enum glsl_base_type base_type) +{ + return glsl_type::get_texture_instance(dim, 
is_array, base_type); +} + const struct glsl_type * glsl_image_type(enum glsl_sampler_dim dim, bool is_array, enum glsl_base_type base_type) @@ -765,6 +816,7 @@ glsl_get_natural_size_align_bytes(const struct glsl_type *type, break; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: /* Bindless samplers and images. */ *size = 8; @@ -823,6 +875,7 @@ glsl_get_vec4_size_align_bytes(const struct glsl_type *type, break; case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_TEXTURE: case GLSL_TYPE_IMAGE: case GLSL_TYPE_ATOMIC_UINT: case GLSL_TYPE_SUBROUTINE: @@ -877,12 +930,12 @@ glsl_get_cl_type_size_align(const struct glsl_type *type, *align = glsl_get_cl_alignment(type); } -unsigned -glsl_type_get_sampler_count(const struct glsl_type *type) +static unsigned +glsl_type_count(const glsl_type *type, glsl_base_type base_type) { if (glsl_type_is_array(type)) { - return (glsl_get_aoa_size(type) * - glsl_type_get_sampler_count(glsl_without_array(type))); + return glsl_get_length(type) * + glsl_type_count(glsl_get_array_element(type), base_type); } /* Ignore interface blocks - they can only contain bindless samplers, @@ -891,38 +944,32 @@ glsl_type_get_sampler_count(const struct glsl_type *type) if (glsl_type_is_struct(type)) { unsigned count = 0; for (unsigned i = 0; i < glsl_get_length(type); i++) - count += glsl_type_get_sampler_count(glsl_get_struct_field(type, i)); + count += glsl_type_count(glsl_get_struct_field(type, i), base_type); return count; } - if (glsl_type_is_sampler(type)) + if (glsl_get_base_type(type) == base_type) return 1; return 0; } +unsigned +glsl_type_get_sampler_count(const struct glsl_type *type) +{ + return glsl_type_count(type, GLSL_TYPE_SAMPLER); +} + +unsigned +glsl_type_get_texture_count(const struct glsl_type *type) +{ + return glsl_type_count(type, GLSL_TYPE_TEXTURE); +} + unsigned glsl_type_get_image_count(const struct glsl_type *type) { - if (glsl_type_is_array(type)) { - return (glsl_get_aoa_size(type) * - glsl_type_get_image_count(glsl_without_array(type))); - } - - /* Ignore interface blocks - they can only contain bindless images, - * which we shouldn't count. 
- */ - if (glsl_type_is_struct(type)) { - unsigned count = 0; - for (unsigned i = 0; i < glsl_get_length(type); i++) - count += glsl_type_get_image_count(glsl_get_struct_field(type, i)); - return count; - } - - if (glsl_type_is_image(type)) - return 1; - - return 0; + return glsl_type_count(type, GLSL_TYPE_IMAGE); } enum glsl_interface_packing @@ -1001,6 +1048,19 @@ glsl_get_explicit_type_for_size_align(const struct glsl_type *type, return type->get_explicit_type_for_size_align(type_info, size, align); } +const struct glsl_type * +glsl_type_wrap_in_arrays(const struct glsl_type *type, + const struct glsl_type *arrays) +{ + if (!glsl_type_is_array(arrays)) + return type; + + const glsl_type *elem_type = + glsl_type_wrap_in_arrays(type, glsl_get_array_element(arrays)); + return glsl_array_type(elem_type, glsl_get_length(arrays), + glsl_get_explicit_stride(arrays)); +} + const struct glsl_type * glsl_type_replace_vec3_with_vec4(const struct glsl_type *type) { diff --git a/mesa 3D driver/src/compiler/nir_types.h b/mesa 3D driver/src/compiler/nir_types.h index 82c268b98b..7034799557 100644 --- a/mesa 3D driver/src/compiler/nir_types.h +++ b/mesa 3D driver/src/compiler/nir_types.h @@ -79,6 +79,11 @@ glsl_get_function_return_type(const struct glsl_type *type); const struct glsl_function_param * glsl_get_function_param(const struct glsl_type *type, unsigned index); +const struct glsl_type * +glsl_texture_type_to_sampler(const struct glsl_type *type, bool is_shadow); +const struct glsl_type * +glsl_sampler_type_to_texture(const struct glsl_type *type); + GLenum glsl_get_gl_type(const struct glsl_type *type); enum glsl_base_type glsl_get_base_type(const struct glsl_type *type); @@ -149,12 +154,15 @@ bool glsl_type_is_struct(const struct glsl_type *type); bool glsl_type_is_interface(const struct glsl_type *type); bool glsl_type_is_struct_or_ifc(const struct glsl_type *type); bool glsl_type_is_sampler(const struct glsl_type *type); +bool glsl_type_is_bare_sampler(const struct glsl_type *type); +bool glsl_type_is_texture(const struct glsl_type *type); bool glsl_type_is_image(const struct glsl_type *type); bool glsl_type_is_dual_slot(const struct glsl_type *type); bool glsl_type_is_numeric(const struct glsl_type *type); bool glsl_type_is_boolean(const struct glsl_type *type); bool glsl_type_is_integer(const struct glsl_type *type); bool glsl_type_contains_64bit(const struct glsl_type *type); +bool glsl_type_contains_image(const struct glsl_type *type); bool glsl_sampler_type_is_shadow(const struct glsl_type *type); bool glsl_sampler_type_is_array(const struct glsl_type *type); bool glsl_struct_type_is_packed(const struct glsl_type *type); @@ -211,6 +219,9 @@ const struct glsl_type *glsl_sampler_type(enum glsl_sampler_dim dim, enum glsl_base_type base_type); const struct glsl_type *glsl_bare_sampler_type(); const struct glsl_type *glsl_bare_shadow_sampler_type(); +const struct glsl_type *glsl_texture_type(enum glsl_sampler_dim dim, + bool is_array, + enum glsl_base_type base_type); const struct glsl_type *glsl_image_type(enum glsl_sampler_dim dim, bool is_array, enum glsl_base_type base_type); @@ -237,9 +248,13 @@ const struct glsl_type *glsl_get_explicit_type_for_size_align(const struct glsl_ glsl_type_size_align_func type_info, unsigned *size, unsigned *align); +const struct glsl_type *glsl_type_wrap_in_arrays(const struct glsl_type *type, + const struct glsl_type *arrays); + const struct glsl_type *glsl_type_replace_vec3_with_vec4(const struct glsl_type *type); unsigned 
glsl_type_get_sampler_count(const struct glsl_type *type);
+unsigned glsl_type_get_texture_count(const struct glsl_type *type);
 unsigned glsl_type_get_image_count(const struct glsl_type *type);
 
 bool glsl_type_is_leaf(const struct glsl_type *type);
 
diff --git a/mesa 3D driver/src/compiler/shader_enums.c b/mesa 3D driver/src/compiler/shader_enums.c
index 8e6babed87..78d5c8baf7 100644
--- a/mesa 3D driver/src/compiler/shader_enums.c
+++ b/mesa 3D driver/src/compiler/shader_enums.c
@@ -160,6 +160,30 @@ gl_varying_slot_name_for_stage(gl_varying_slot slot, gl_shader_stage stage)
    if (stage != MESA_SHADER_FRAGMENT &&
        slot == VARYING_SLOT_PRIMITIVE_SHADING_RATE)
       return "VARYING_SLOT_PRIMITIVE_SHADING_RATE";
 
+   switch (stage) {
+   case MESA_SHADER_MESH:
+      switch (slot) {
+      case VARYING_SLOT_PRIMITIVE_COUNT: return "VARYING_SLOT_PRIMITIVE_COUNT";
+      case VARYING_SLOT_PRIMITIVE_INDICES: return "VARYING_SLOT_PRIMITIVE_INDICES";
+      default:
+         /* Not an overlapping value. */
+         break;
+      }
+      break;
+
+   case MESA_SHADER_TASK:
+      switch (slot) {
+      case VARYING_SLOT_TASK_COUNT: return "VARYING_SLOT_TASK_COUNT";
+      default:
+         /* Not an overlapping value. */
+         break;
+      }
+      break;
+
+   default:
+      break;
+   }
+
    static const char *names[] = {
       ENUM(VARYING_SLOT_POS),
       ENUM(VARYING_SLOT_COL0),
diff --git a/mesa 3D driver/src/compiler/shader_enums.h b/mesa 3D driver/src/compiler/shader_enums.h
index 1833b8c1ed..1dc8f7a646 100644
--- a/mesa 3D driver/src/compiler/shader_enums.h
+++ b/mesa 3D driver/src/compiler/shader_enums.h
@@ -303,6 +303,11 @@ typedef enum
    VARYING_SLOT_VIEW_INDEX,
    VARYING_SLOT_VIEWPORT_MASK, /* Does not appear in FS */
    VARYING_SLOT_PRIMITIVE_SHADING_RATE = VARYING_SLOT_FACE, /* Does not appear in FS. */
+
+   VARYING_SLOT_PRIMITIVE_COUNT = VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears in MESH. */
+   VARYING_SLOT_PRIMITIVE_INDICES = VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears in MESH. */
+   VARYING_SLOT_TASK_COUNT = VARYING_SLOT_BOUNDING_BOX0, /* Only appears in TASK. */
+
    VARYING_SLOT_VAR0 = 32, /* First generic varying slot */
    /* the remaining are simply for the benefit of gl_varying_slot_name()
    * and not to be construed as an upper bound:
diff --git a/mesa 3D driver/src/compiler/shader_info.h b/mesa 3D driver/src/compiler/shader_info.h
index 6794f3d15a..405b204228 100644
--- a/mesa 3D driver/src/compiler/shader_info.h
+++ b/mesa 3D driver/src/compiler/shader_info.h
@@ -57,6 +57,7 @@ struct spirv_supported_capabilities {
    bool fragment_shading_rate;
    bool generic_pointers;
    bool geometry_streams;
+   bool groups;
    bool image_ms_array;
    bool image_read_without_format;
    bool image_write_without_format;
@@ -70,6 +71,7 @@ struct spirv_supported_capabilities {
    bool kernel_image;
    bool kernel_image_read_write;
    bool literal_sampler;
+   bool mesh_shading_nv;
    bool min_lod;
    bool multiview;
    bool physical_storage_buffer_address;
@@ -90,6 +92,7 @@ struct spirv_supported_capabilities {
    bool subgroup_arithmetic;
    bool subgroup_ballot;
    bool subgroup_basic;
+   bool subgroup_dispatch;
    bool subgroup_quad;
    bool subgroup_shuffle;
    bool subgroup_uniform_control_flow;
@@ -154,6 +157,12 @@ typedef struct shader_info {
    /* Which system values are actually read */
    BITSET_DECLARE(system_values_read, SYSTEM_VALUE_MAX);
 
+   /* Which I/O is per-primitive; for read/written information, combine
+    * this with the fields above.
+    */
+   uint64_t per_primitive_inputs;
+   uint64_t per_primitive_outputs;
+
    /* Which 16-bit inputs and outputs are used corresponding to
    * VARYING_SLOT_VARn_16BIT.
*/ @@ -290,6 +299,9 @@ typedef struct shader_info { /* True if the shader writes position in window space coordinates pre-transform */ bool window_space_position:1; + + /** Is an edge flag input needed? */ + bool needs_edge_flag:1; } vs; struct { @@ -403,6 +415,11 @@ typedef struct shader_info { unsigned color1_interp:3; /* glsl_interp_mode */ bool color1_sample:1; bool color1_centroid:1; + + /* Bitmask of gl_advanced_blend_mode values that may be used with this + * shader. + */ + unsigned advanced_blend_modes; } fs; struct { @@ -416,6 +433,11 @@ typedef struct shader_info { */ enum gl_derivative_group derivative_group:2; + /** + * Explicit subgroup size if set by the shader, otherwise 0. + */ + unsigned subgroup_size; + /** * pointer size is: * AddressingModelLogical: 0 (default) @@ -452,6 +474,13 @@ typedef struct shader_info { */ uint64_t tcs_cross_invocation_outputs_read; } tess; + + /* Applies to MESH. */ + struct { + uint16_t max_vertices_out; + uint16_t max_primitives_out; + uint16_t primitive_type; /* GL_POINTS, GL_LINES or GL_TRIANGLES. */ + } mesh; }; } shader_info; diff --git a/mesa 3D driver/src/compiler/spirv/nir_spirv.h b/mesa 3D driver/src/compiler/spirv/nir_spirv.h index 3e637ef497..1cb370c323 100644 --- a/mesa 3D driver/src/compiler/spirv/nir_spirv.h +++ b/mesa 3D driver/src/compiler/spirv/nir_spirv.h @@ -58,11 +58,6 @@ enum nir_spirv_execution_environment { struct spirv_to_nir_options { enum nir_spirv_execution_environment environment; - /* Whether to make FragCoord to a system value, the same as - * GLSLFragCoordIsSysVal in GLSL. - */ - bool frag_coord_is_sysval; - /* Whether to keep ViewIndex as an input instead of rewriting to a sysval. */ bool view_index_is_input; diff --git a/mesa 3D driver/src/compiler/spirv/spirv.core.grammar.json b/mesa 3D driver/src/compiler/spirv/spirv.core.grammar.json index 04eb87e3f3..2ff4f9a442 100644 --- a/mesa 3D driver/src/compiler/spirv/spirv.core.grammar.json +++ b/mesa 3D driver/src/compiler/spirv/spirv.core.grammar.json @@ -2157,7 +2157,7 @@ { "kind" : "IdRef", "name" : "'Offset'" }, { "kind" : "IdRef", "name" : "'Count'" } ], - "capabilities" : [ "Shader" ] + "capabilities" : [ "Shader", "BitInstructions" ] }, { "opname" : "OpBitFieldSExtract", @@ -2170,7 +2170,7 @@ { "kind" : "IdRef", "name" : "'Offset'" }, { "kind" : "IdRef", "name" : "'Count'" } ], - "capabilities" : [ "Shader" ] + "capabilities" : [ "Shader", "BitInstructions" ] }, { "opname" : "OpBitFieldUExtract", @@ -2183,7 +2183,7 @@ { "kind" : "IdRef", "name" : "'Offset'" }, { "kind" : "IdRef", "name" : "'Count'" } ], - "capabilities" : [ "Shader" ] + "capabilities" : [ "Shader", "BitInstructions" ] }, { "opname" : "OpBitReverse", @@ -2194,7 +2194,7 @@ { "kind" : "IdResult" }, { "kind" : "IdRef", "name" : "'Base'" } ], - "capabilities" : [ "Shader" ] + "capabilities" : [ "Shader", "BitInstructions" ] }, { "opname" : "OpBitCount", @@ -8689,6 +8689,10 @@ { "enumerant" : "Const", "value" : "0x0008" + }, + { + "enumerant" : "OptNoneINTEL", + "value" : "0x10000" } ] }, @@ -9445,7 +9449,9 @@ "value" : 39, "capabilities" : [ "Kernel" ], "parameters" : [ - { "kind" : "IdRef", "name" : "'Local Size Hint'" } + { "kind" : "IdRef", "name" : "'x size hint'" }, + { "kind" : "IdRef", "name" : "'y size hint'" }, + { "kind" : "IdRef", "name" : "'z size hint'" } ], "version" : "1.2" }, @@ -10433,35 +10439,51 @@ "enumerants" : [ { "enumerant" : "TRN", - "value" : 0 + "value" : 0, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "TRN_ZERO", - 
"value" : 1 + "value" : 1, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "RND", - "value" : 2 + "value" : 2, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "RND_ZERO", - "value" : 3 + "value" : 3, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "RND_INF", - "value" : 4 + "value" : 4, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "RND_MIN_INF", - "value" : 5 + "value" : 5, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "RND_CONV", - "value" : 6 + "value" : 6, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "RND_CONV_ODD", - "value" : 7 + "value" : 7, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" } ] }, @@ -10489,19 +10511,27 @@ "enumerants" : [ { "enumerant" : "WRAP", - "value" : 0 + "value" : 0, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "SAT", - "value" : 1 + "value" : 1, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "SAT_ZERO", - "value" : 2 + "value" : 2, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" }, { "enumerant" : "SAT_SYM", - "value" : 3 + "value" : 3, + "capabilities" : [ "ArbitraryPrecisionFixedPointINTEL"], + "version" : "None" } ] }, @@ -13401,6 +13431,12 @@ "extensions" : [ "SPV_KHR_integer_dot_product" ], "version" : "None" }, + { + "enumerant" : "BitInstructions", + "value" : 6025, + "extensions" : [ "SPV_KHR_bit_instructions" ], + "version" : "None" + }, { "enumerant" : "AtomicFloat32AddEXT", "value" : 6033, @@ -13421,12 +13457,24 @@ "extensions" : [ "SPV_INTEL_long_constant_composite" ], "version" : "None" }, + { + "enumerant" : "OptNoneINTEL", + "value" : 6094, + "extensions" : [ "SPV_INTEL_optnone" ], + "version" : "None" + }, { "enumerant" : "AtomicFloat16AddEXT", "value" : 6095, "capabilities" : [ "Shader" ], "extensions" : [ "SPV_EXT_shader_atomic_float16_add" ], "version" : "None" + }, + { + "enumerant" : "DebugInfoModuleINTEL", + "value" : 6114, + "extensions" : [ "SPV_INTEL_debug_module" ], + "version" : "None" } ] }, @@ -13497,7 +13545,7 @@ { "enumerant" : "PackedVectorFormat4x8BitKHR", "value" : 0, - "capabilities" : [ "DotProductInput4x8BitPackedKHR" ], + "extensions" : [ "SPV_KHR_integer_dot_product" ], "version" : "None" } ] diff --git a/mesa 3D driver/src/compiler/spirv/spirv.h b/mesa 3D driver/src/compiler/spirv/spirv.h index 68e9a9aa5d..79aa4f548c 100644 --- a/mesa 3D driver/src/compiler/spirv/spirv.h +++ b/mesa 3D driver/src/compiler/spirv/spirv.h @@ -726,6 +726,7 @@ typedef enum SpvFunctionControlShift_ { SpvFunctionControlDontInlineShift = 1, SpvFunctionControlPureShift = 2, SpvFunctionControlConstShift = 3, + SpvFunctionControlOptNoneINTELShift = 16, SpvFunctionControlMax = 0x7fffffff, } SpvFunctionControlShift; @@ -735,6 +736,7 @@ typedef enum SpvFunctionControlMask_ { SpvFunctionControlDontInlineMask = 0x00000002, SpvFunctionControlPureMask = 0x00000004, SpvFunctionControlConstMask = 0x00000008, + SpvFunctionControlOptNoneINTELMask = 0x00010000, } SpvFunctionControlMask; typedef enum SpvMemorySemanticsShift_ { @@ -1049,10 +1051,13 @@ typedef enum SpvCapability_ { SpvCapabilityDotProductInput4x8BitKHR = 6017, SpvCapabilityDotProductInput4x8BitPackedKHR = 6018, 
SpvCapabilityDotProductKHR = 6019, + SpvCapabilityBitInstructions = 6025, SpvCapabilityAtomicFloat32AddEXT = 6033, SpvCapabilityAtomicFloat64AddEXT = 6034, SpvCapabilityLongConstantCompositeINTEL = 6089, + SpvCapabilityOptNoneINTEL = 6094, SpvCapabilityAtomicFloat16AddEXT = 6095, + SpvCapabilityDebugInfoModuleINTEL = 6114, SpvCapabilityMax = 0x7fffffff, } SpvCapability; diff --git a/mesa 3D driver/src/compiler/spirv/spirv2nir.c b/mesa 3D driver/src/compiler/spirv/spirv2nir.c index fed803e92a..c473bf2c8d 100644 --- a/mesa 3D driver/src/compiler/spirv/spirv2nir.c +++ b/mesa 3D driver/src/compiler/spirv/spirv2nir.c @@ -61,6 +61,10 @@ stage_to_enum(char *stage) return MESA_SHADER_COMPUTE; else if (!strcmp(stage, "kernel")) return MESA_SHADER_KERNEL; + else if (!strcmp(stage, "task")) + return MESA_SHADER_TASK; + else if (!strcmp(stage, "mesh")) + return MESA_SHADER_MESH; else return MESA_SHADER_NONE; } @@ -74,7 +78,7 @@ print_usage(char *exec_name, FILE *f) " -h --help Print this help.\n" " -s, --stage Specify the shader stage. Valid stages are:\n" " vertex, tess-ctrl, tess-eval, geometry, fragment,\n" -" compute, and kernel (OpenCL-style compute).\n" +" task, mesh, compute, and kernel (OpenCL-style compute).\n" " -e, --entry Specify the entry-point name.\n" " -g, --opengl Use OpenGL environment instead of Vulkan for\n" " graphics stages.\n" diff --git a/mesa 3D driver/src/compiler/spirv/spirv_to_nir.c b/mesa 3D driver/src/compiler/spirv/spirv_to_nir.c index 983d8f9f06..f78205d696 100644 --- a/mesa 3D driver/src/compiler/spirv/spirv_to_nir.c +++ b/mesa 3D driver/src/compiler/spirv/spirv_to_nir.c @@ -225,7 +225,7 @@ vtn_undef_ssa_value(struct vtn_builder *b, const struct glsl_type *type) return val; } -static struct vtn_ssa_value * +struct vtn_ssa_value * vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, const struct glsl_type *type) { @@ -367,8 +367,10 @@ vtn_get_image(struct vtn_builder *b, uint32_t value_id, vtn_assert(type->base_type == vtn_base_type_image); if (access) *access |= spirv_to_gl_access_qualifier(b, type->access_qualifier); + nir_variable_mode mode = glsl_type_is_image(type->glsl_image) ? + nir_var_image : nir_var_uniform; return nir_build_deref_cast(&b->nb, vtn_get_nir_ssa(b, value_id), - nir_var_uniform, type->glsl_image, 0); + mode, type->glsl_image, 0); } static void @@ -415,10 +417,16 @@ vtn_get_sampled_image(struct vtn_builder *b, uint32_t value_id) vtn_assert(type->base_type == vtn_base_type_sampled_image); nir_ssa_def *si_vec2 = vtn_get_nir_ssa(b, value_id); + /* Even though this is a sampled image, we can end up here with a storage + * image because OpenCL doesn't distinguish between the two. + */ + const struct glsl_type *image_type = type->image->glsl_image; + nir_variable_mode image_mode = glsl_type_is_image(image_type) ? 
+      nir_var_image : nir_var_uniform;
+
    struct vtn_sampled_image si = { NULL, };
    si.image = nir_build_deref_cast(&b->nb, nir_channel(&b->nb, si_vec2, 0),
-                                   nir_var_uniform,
-                                   type->image->glsl_image, 0);
+                                   image_mode, image_type, 0);
    si.sampler = nir_build_deref_cast(&b->nb, nir_channel(&b->nb, si_vec2, 1),
                                      nir_var_uniform,
                                      glsl_bare_sampler_type(), 0);
@@ -845,19 +853,6 @@ vtn_type_copy(struct vtn_builder *b, struct vtn_type *src)
    return dest;
 }
 
-static const struct glsl_type *
-wrap_type_in_array(const struct glsl_type *type,
-                   const struct glsl_type *array_type)
-{
-   if (!glsl_type_is_array(array_type))
-      return type;
-
-   const struct glsl_type *elem_type =
-      wrap_type_in_array(type, glsl_get_array_element(array_type));
-   return glsl_array_type(elem_type, glsl_get_length(array_type),
-                          glsl_get_explicit_stride(array_type));
-}
-
 static bool
 vtn_type_needs_explicit_layout(struct vtn_builder *b, struct vtn_type *type,
                                enum vtn_variable_mode mode)
@@ -899,7 +894,7 @@ vtn_type_get_nir_type(struct vtn_builder *b, struct vtn_type *type,
       vtn_fail_if(glsl_without_array(type->type) != glsl_uint_type(),
                   "Variables in the AtomicCounter storage class should be "
                   "(possibly arrays of arrays of) uint.");
-      return wrap_type_in_array(glsl_atomic_uint_type(), type->type);
+      return glsl_type_wrap_in_arrays(glsl_atomic_uint_type(), type->type);
    }
 
    if (mode == vtn_variable_mode_uniform) {
@@ -942,19 +937,27 @@ vtn_type_get_nir_type(struct vtn_builder *b, struct vtn_type *type,
       }
 
       case vtn_base_type_image:
+         vtn_assert(glsl_type_is_texture(type->glsl_image));
         return type->glsl_image;
 
      case vtn_base_type_sampler:
         return glsl_bare_sampler_type();
 
      case vtn_base_type_sampled_image:
-         return type->image->glsl_image;
+         return glsl_texture_type_to_sampler(type->image->glsl_image,
+                                             false /* is_shadow */);
 
      default:
         return type->type;
      }
   }
 
+   if (mode == vtn_variable_mode_image) {
+      struct vtn_type *image_type = vtn_type_without_array(type);
+      vtn_assert(image_type->base_type == vtn_base_type_image);
+      return glsl_type_wrap_in_arrays(image_type->glsl_image, type->type);
+   }
+
    /* Layout decorations are allowed but ignored in certain conditions,
    * to allow SPIR-V generators to perform type deduplication.  Discard
    * unnecessary ones when passing to NIR.
@@ -1084,6 +1087,8 @@ struct_member_decoration_cb(struct vtn_builder *b,
       break;
 
    case SpvDecorationPatch:
+   case SpvDecorationPerPrimitiveNV:
+   case SpvDecorationPerTaskNV:
       break;
 
    case SpvDecorationSpecId:
@@ -1128,6 +1133,11 @@ struct_member_decoration_cb(struct vtn_builder *b,
       /* User semantic decorations can safely be ignored by the driver. */
       break;
 
+   case SpvDecorationPerViewNV:
+      /* TODO(mesh): Handle multiview. */
+      vtn_warn("Mesh multiview not yet supported.
Needed for decoration PerViewNV."); + break; + default: vtn_fail_with_decoration("Unhandled decoration", dec->decoration); } @@ -1580,6 +1590,7 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode, if (opcode == SpvOpTypePointer) deref_type = vtn_get_type(b, w[3]); + bool has_forward_pointer = false; if (val->value_type == vtn_value_type_invalid) { val->value_type = vtn_value_type_type; val->type = rzalloc(b, struct vtn_type); @@ -1610,6 +1621,7 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode, "The storage classes of an OpTypePointer and any " "OpTypeForwardPointers that provide forward " "declarations of it must match."); + has_forward_pointer = true; } if (opcode == SpvOpTypePointer) { @@ -1618,6 +1630,11 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode, "forward declaration of a pointer, OpTypePointer can " "only be used once for a given id."); + vtn_fail_if(has_forward_pointer && + deref_type->base_type != vtn_base_type_struct, + "An OpTypePointer instruction must declare " + "Pointer Type to be a pointer to an OpTypeStruct."); + val->type->deref = deref_type; /* Only certain storage classes use ArrayStride. */ @@ -1715,7 +1732,7 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode, enum glsl_base_type sampled_base_type = glsl_get_base_type(sampled_type->type); if (sampled == 1) { - val->type->glsl_image = glsl_sampler_type(dim, false, is_array, + val->type->glsl_image = glsl_texture_type(dim, is_array, sampled_base_type); } else if (sampled == 2) { val->type->glsl_image = glsl_image_type(dim, is_array, @@ -1957,17 +1974,20 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode, spirv_op_to_string(opcode), elem_count, val->type->length); nir_constant **elems = ralloc_array(b, nir_constant *, elem_count); + val->is_undef_constant = true; for (unsigned i = 0; i < elem_count; i++) { - struct vtn_value *val = vtn_untyped_value(b, w[i + 3]); + struct vtn_value *elem_val = vtn_untyped_value(b, w[i + 3]); - if (val->value_type == vtn_value_type_constant) { - elems[i] = val->constant; + if (elem_val->value_type == vtn_value_type_constant) { + elems[i] = elem_val->constant; + val->is_undef_constant = val->is_undef_constant && + elem_val->is_undef_constant; } else { - vtn_fail_if(val->value_type != vtn_value_type_undef, + vtn_fail_if(elem_val->value_type != vtn_value_type_undef, "only constants or undefs allowed for " "SpvOpConstantComposite"); /* to make it easier, just insert a NULL constant for now */ - elems[i] = vtn_null_constant(b, val->type); + elems[i] = vtn_null_constant(b, elem_val->type); } } @@ -2216,8 +2236,7 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode, } /* Now that we have the value, update the workgroup size if needed */ - if (b->entry_point_stage == MESA_SHADER_COMPUTE || - b->entry_point_stage == MESA_SHADER_KERNEL) + if (gl_shader_stage_uses_workgroup(b->entry_point_stage)) vtn_foreach_decoration(b, val, handle_workgroup_size_decoration_cb, NULL); } @@ -2381,18 +2400,15 @@ vtn_mem_semantics_to_nir_var_modes(struct vtn_builder *b, SpvMemorySemanticsAtomicCounterMemoryMask); } - /* TODO: Consider adding nir_var_mem_image mode to NIR so it can be used - * for SpvMemorySemanticsImageMemoryMask. 
-    */
-
    nir_variable_mode modes = 0;
-   if (semantics & (SpvMemorySemanticsUniformMemoryMask |
-                    SpvMemorySemanticsImageMemoryMask)) {
+   if (semantics & SpvMemorySemanticsUniformMemoryMask) {
       modes |= nir_var_uniform |
                nir_var_mem_ubo |
                nir_var_mem_ssbo |
                nir_var_mem_global;
    }
+   if (semantics & SpvMemorySemanticsImageMemoryMask)
+      modes |= nir_var_image;
    if (semantics & SpvMemorySemanticsWorkgroupMemoryMask)
       modes |= nir_var_mem_shared;
    if (semantics & SpvMemorySemanticsCrossWorkgroupMemoryMask)
@@ -2711,11 +2727,11 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
       break;
 
    case SpvOpFragmentFetchAMD:
-      texop = nir_texop_fragment_fetch;
+      texop = nir_texop_fragment_fetch_amd;
       break;
 
    case SpvOpFragmentMaskFetchAMD:
-      texop = nir_texop_fragment_mask_fetch;
+      texop = nir_texop_fragment_mask_fetch_amd;
       dest_type = nir_type_uint32;
       break;
 
@@ -2750,8 +2766,8 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
    case nir_texop_query_levels:
    case nir_texop_texture_samples:
    case nir_texop_samples_identical:
-   case nir_texop_fragment_fetch:
-   case nir_texop_fragment_mask_fetch:
+   case nir_texop_fragment_fetch_amd:
+   case nir_texop_fragment_mask_fetch_amd:
       /* These don't */
       break;
    case nir_texop_txf_ms_fb:
@@ -2797,6 +2813,15 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
       struct vtn_ssa_value *coord_val = vtn_ssa_value(b, w[idx++]);
       coord = coord_val->def;
 
+      /* From the SPIR-V spec version 1.5, rev. 5:
+       *
+       *    "Coordinate must be a scalar or vector of floating-point type. It
+       *    contains (u[, v] ... [, array layer]) as needed by the definition
+       *    of Sampled Image. It may be a vector larger than needed, but all
+       *    unused components appear after all used components."
+       */
+      vtn_fail_if(coord->num_components < coord_components,
+                  "Coordinate value passed has fewer components than sampler dimensionality.");
       p->src = nir_src_for_ssa(nir_channels(&b->nb, coord,
                                             (1 << coord_components) - 1));
 
@@ -2807,9 +2832,16 @@
                   "Unless the Kernel capability is being used, the coordinate parameter "
                   "OpImageSampleExplicitLod must be floating point.");
 
-         p->src = nir_src_for_ssa(
-            nir_fadd(&b->nb, nir_i2f32(&b->nb, p->src.ssa),
-                     nir_imm_float(&b->nb, 0.5)));
+         nir_ssa_def *coords[4];
+         nir_ssa_def *f0_5 = nir_imm_float(&b->nb, 0.5);
+         for (unsigned i = 0; i < coord_components; i++) {
+            coords[i] = nir_i2f32(&b->nb, nir_channel(&b->nb, p->src.ssa, i));
+
+            if (!is_array || i != coord_components - 1)
+               coords[i] = nir_fadd(&b->nb, coords[i], f0_5);
+         }
+
+         p->src = nir_src_for_ssa(nir_vec(&b->nb, coords, coord_components));
       }
 
       p->src_type = nir_tex_src_coord;
@@ -3604,13 +3636,13 @@ vtn_handle_atomics(struct vtn_builder *b, SpvOp opcode,
    case SpvOpAtomicFMinEXT:
    case SpvOpAtomicFMaxEXT:
    case SpvOpAtomicFlagTestAndSet:
-      ptr = vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
+      ptr = vtn_pointer(b, w[3]);
       scope = vtn_constant_uint(b, w[4]);
      semantics = vtn_constant_uint(b, w[5]);
      break;
   case SpvOpAtomicFlagClear:
   case SpvOpAtomicStore:
-      ptr = vtn_value(b, w[1], vtn_value_type_pointer)->pointer;
+      ptr = vtn_pointer(b, w[1]);
      scope = vtn_constant_uint(b, w[2]);
      semantics = vtn_constant_uint(b, w[3]);
      break;
@@ -4154,8 +4186,12 @@ vtn_handle_barrier(struct vtn_builder *b, SpvOp opcode,
       * variables performed by any invocation executed prior to a
       * OpControlBarrier will be visible to any other invocation after
       * return from that OpControlBarrier."
+       *
+       * The same applies to VK_NV_mesh_shader.
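+       *
+       * A hedged illustration of the masking below (our own example, not
+       * spec text): an OpControlBarrier that arrives here with
+       * AcquireRelease | WorkgroupMemory semantics keeps only
+       * WorkgroupMemory; the ordering bits are dropped because, for these
+       * stages, the control barrier itself already guarantees that
+       * ordering.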
*/ - if (b->nb.shader->info.stage == MESA_SHADER_TESS_CTRL) { + if (b->nb.shader->info.stage == MESA_SHADER_TESS_CTRL || + b->nb.shader->info.stage == MESA_SHADER_TASK || + b->nb.shader->info.stage == MESA_SHADER_MESH) { memory_semantics &= ~(SpvMemorySemanticsAcquireMask | SpvMemorySemanticsReleaseMask | SpvMemorySemanticsAcquireReleaseMask | @@ -4190,10 +4226,12 @@ gl_primitive_from_spv_execution_mode(struct vtn_builder *b, case SpvExecutionModeOutputPoints: return 0; /* GL_POINTS */ case SpvExecutionModeInputLines: + case SpvExecutionModeOutputLinesNV: return 1; /* GL_LINES */ case SpvExecutionModeInputLinesAdjacency: return 0x000A; /* GL_LINE_STRIP_ADJACENCY_ARB */ case SpvExecutionModeTriangles: + case SpvExecutionModeOutputTrianglesNV: return 4; /* GL_TRIANGLES */ case SpvExecutionModeInputTrianglesAdjacency: return 0x000C; /* GL_TRIANGLES_ADJACENCY_ARB */ @@ -4262,6 +4300,10 @@ stage_for_execution_model(struct vtn_builder *b, SpvExecutionModel model) return MESA_SHADER_INTERSECTION; case SpvExecutionModelCallableKHR: return MESA_SHADER_CALLABLE; + case SpvExecutionModelTaskNV: + return MESA_SHADER_TASK; + case SpvExecutionModelMeshNV: + return MESA_SHADER_MESH; default: vtn_fail("Unsupported execution model: %s (%u)", spirv_executionmodel_to_string(model), model); @@ -4364,6 +4406,10 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityImageGatherExtended: case SpvCapabilityStorageImageExtendedFormats: case SpvCapabilityVector16: + case SpvCapabilityDotProductKHR: + case SpvCapabilityDotProductInputAllKHR: + case SpvCapabilityDotProductInput4x8BitKHR: + case SpvCapabilityDotProductInput4x8BitPackedKHR: break; case SpvCapabilityLinkage: @@ -4503,7 +4549,19 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, break; case SpvCapabilityGroups: - spv_check_supported(amd_shader_ballot, cap); + spv_check_supported(groups, cap); + break; + + case SpvCapabilitySubgroupDispatch: + spv_check_supported(subgroup_dispatch, cap); + /* Missing : + * - SpvOpGetKernelLocalSizeForSubgroupCount + * - SpvOpGetKernelMaxNumSubgroups + * - SpvExecutionModeSubgroupsPerWorkgroup + * - SpvExecutionModeSubgroupsPerWorkgroupId + */ + vtn_warn("Not fully supported capability: %s", + spirv_capability_to_string(cap)); break; case SpvCapabilityVariablePointersStorageBuffer: @@ -4691,6 +4749,10 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, spv_check_supported(float64_atomic_min_max, cap); break; + case SpvCapabilityMeshShadingNV: + spv_check_supported(mesh_shading_nv, cap); + break; + default: vtn_fail("Unhandled capability: %s (%u)", spirv_capability_to_string(cap), cap); @@ -4863,19 +4925,32 @@ vtn_handle_execution_mode(struct vtn_builder *b, struct vtn_value *entry_point, break; case SpvExecutionModeLocalSize: - vtn_assert(gl_shader_stage_is_compute(b->shader->info.stage)); - b->shader->info.workgroup_size[0] = mode->operands[0]; - b->shader->info.workgroup_size[1] = mode->operands[1]; - b->shader->info.workgroup_size[2] = mode->operands[2]; + if (gl_shader_stage_uses_workgroup(b->shader->info.stage)) { + b->shader->info.workgroup_size[0] = mode->operands[0]; + b->shader->info.workgroup_size[1] = mode->operands[1]; + b->shader->info.workgroup_size[2] = mode->operands[2]; + } else { + vtn_fail("Execution mode LocalSize not supported in stage %s", + _mesa_shader_stage_to_string(b->shader->info.stage)); + } break; case SpvExecutionModeOutputVertices: - if (b->shader->info.stage == MESA_SHADER_TESS_CTRL || - b->shader->info.stage == 
MESA_SHADER_TESS_EVAL) { + switch (b->shader->info.stage) { + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: b->shader->info.tess.tcs_vertices_out = mode->operands[0]; - } else { - vtn_assert(b->shader->info.stage == MESA_SHADER_GEOMETRY); + break; + case MESA_SHADER_GEOMETRY: b->shader->info.gs.vertices_out = mode->operands[0]; + break; + case MESA_SHADER_MESH: + b->shader->info.mesh.max_vertices_out = mode->operands[0]; + break; + default: + vtn_fail("Execution mode OutputVertices not supported in stage %s", + _mesa_shader_stage_to_string(b->shader->info.stage)); + break; } break; @@ -4899,7 +4974,37 @@ vtn_handle_execution_mode(struct vtn_builder *b, struct vtn_value *entry_point, } break; - case SpvExecutionModeOutputPoints: + case SpvExecutionModeOutputPrimitivesNV: + vtn_assert(b->shader->info.stage == MESA_SHADER_MESH); + b->shader->info.mesh.max_primitives_out = mode->operands[0]; + break; + + case SpvExecutionModeOutputLinesNV: + case SpvExecutionModeOutputTrianglesNV: + vtn_assert(b->shader->info.stage == MESA_SHADER_MESH); + b->shader->info.mesh.primitive_type = + gl_primitive_from_spv_execution_mode(b, mode->exec_mode); + break; + + case SpvExecutionModeOutputPoints: { + const unsigned primitive = + gl_primitive_from_spv_execution_mode(b, mode->exec_mode); + + switch (b->shader->info.stage) { + case MESA_SHADER_GEOMETRY: + b->shader->info.gs.output_primitive = primitive; + break; + case MESA_SHADER_MESH: + b->shader->info.mesh.primitive_type = primitive; + break; + default: + vtn_fail("Execution mode OutputPoints not supported in stage %s", + _mesa_shader_stage_to_string(b->shader->info.stage)); + break; + } + break; + } + case SpvExecutionModeOutputLineStrip: case SpvExecutionModeOutputTriangleStrip: vtn_assert(b->shader->info.stage == MESA_SHADER_GEOMETRY); @@ -5061,6 +5166,11 @@ vtn_handle_execution_mode(struct vtn_builder *b, struct vtn_value *entry_point, /* Handled later by vtn_handle_execution_mode_id(). */ break; + case SpvExecutionModeSubgroupSize: + vtn_assert(b->shader->info.stage == MESA_SHADER_KERNEL); + b->shader->info.cs.subgroup_size = mode->operands[0]; + break; + case SpvExecutionModeSubgroupUniformControlFlowKHR: /* There's no corresponding SPIR-V capability, so check here. 
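+       * (SPV_KHR_subgroup_uniform_control_flow defines only this execution
+       * mode and no capability bit, so the capability switch above never
+       * gets a chance to gate it.)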
       */
      vtn_fail_if(!b->options->caps.subgroup_uniform_control_flow,
@@ -5083,9 +5193,14 @@ vtn_handle_execution_mode_id(struct vtn_builder *b, struct vtn_value *entry_poin
 
    switch (mode->exec_mode) {
    case SpvExecutionModeLocalSizeId:
-      b->shader->info.workgroup_size[0] = vtn_constant_uint(b, mode->operands[0]);
-      b->shader->info.workgroup_size[1] = vtn_constant_uint(b, mode->operands[1]);
-      b->shader->info.workgroup_size[2] = vtn_constant_uint(b, mode->operands[2]);
+      if (gl_shader_stage_uses_workgroup(b->shader->info.stage)) {
+         b->shader->info.workgroup_size[0] = vtn_constant_uint(b, mode->operands[0]);
+         b->shader->info.workgroup_size[1] = vtn_constant_uint(b, mode->operands[1]);
+         b->shader->info.workgroup_size[2] = vtn_constant_uint(b, mode->operands[2]);
+      } else {
+         vtn_fail("Execution mode LocalSizeId not supported in stage %s",
+                  _mesa_shader_stage_to_string(b->shader->info.stage));
+      }
       break;
 
    case SpvExecutionModeLocalSizeHintId:
@@ -5389,6 +5504,58 @@ vtn_handle_ray_intrinsic(struct vtn_builder *b, SpvOp opcode,
    }
 }
 
+static void
+vtn_handle_write_packed_primitive_indices(struct vtn_builder *b, SpvOp opcode,
+                                          const uint32_t *w, unsigned count)
+{
+   vtn_assert(opcode == SpvOpWritePackedPrimitiveIndices4x8NV);
+
+   /* TODO(mesh): Use or create a primitive that allows the unpacking to
+    * happen in the backend.  What we have here is functional but too
+    * blunt.
+    */
+
+   struct vtn_type *offset_type = vtn_get_value_type(b, w[1]);
+   vtn_fail_if(offset_type->base_type != vtn_base_type_scalar ||
+               offset_type->type != glsl_uint_type(),
+               "Index Offset type of OpWritePackedPrimitiveIndices4x8NV "
+               "must be an OpTypeInt with 32-bit Width and 0 Signedness.");
+
+   struct vtn_type *packed_type = vtn_get_value_type(b, w[2]);
+   vtn_fail_if(packed_type->base_type != vtn_base_type_scalar ||
+               packed_type->type != glsl_uint_type(),
+               "Packed Indices type of OpWritePackedPrimitiveIndices4x8NV "
+               "must be an OpTypeInt with 32-bit Width and 0 Signedness.");
+
+   nir_deref_instr *indices = NULL;
+   nir_foreach_variable_with_modes(var, b->nb.shader, nir_var_shader_out) {
+      if (var->data.location == VARYING_SLOT_PRIMITIVE_INDICES) {
+         indices = nir_build_deref_var(&b->nb, var);
+         break;
+      }
+   }
+
+   /* TODO(mesh): It may be the case that the variable is not present in the
+    * entry point interface list.
+    *
+    * See https://github.com/KhronosGroup/SPIRV-Registry/issues/104.
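+    *
+    * As a worked illustration of the loop below (our own numbers, not from
+    * the extension spec): offset = 8 with packed = 0x00020100 stores
+    * indices[8..11] = {0, 1, 2, 0}; byte lane i of the packed word is
+    * zero-extended and written to indices[offset + i].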
+    */
+   vtn_fail_if(indices == NULL,
+               "Missing output variable decorated with PrimitiveIndices builtin.");
+
+   nir_ssa_def *offset = vtn_get_nir_ssa(b, w[1]);
+   nir_ssa_def *packed = vtn_get_nir_ssa(b, w[2]);
+   nir_ssa_def *unpacked = nir_unpack_bits(&b->nb, packed, 8);
+   for (int i = 0; i < 4; i++) {
+      nir_deref_instr *offset_deref =
+         nir_build_deref_array(&b->nb, indices,
+                               nir_iadd_imm(&b->nb, offset, i));
+      nir_ssa_def *val = nir_u2u(&b->nb, nir_channel(&b->nb, unpacked, i), 32);
+
+      nir_store_deref(&b->nb, offset_deref, val, 0x1);
+   }
+}
+
 static bool
 vtn_handle_body_instruction(struct vtn_builder *b, SpvOp opcode,
                             const uint32_t *w, unsigned count)
@@ -5479,7 +5646,7 @@ vtn_handle_body_instruction(struct vtn_builder *b, SpvOp opcode,
       if (glsl_type_is_image(image_type->glsl_image)) {
          vtn_handle_image(b, opcode, w, count);
       } else {
-         vtn_assert(glsl_type_is_sampler(image_type->glsl_image));
+         vtn_assert(glsl_type_is_texture(image_type->glsl_image));
          vtn_handle_texture(b, opcode, w, count);
       }
       break;
@@ -5650,6 +5817,15 @@ vtn_handle_body_instruction(struct vtn_builder *b, SpvOp opcode,
      vtn_handle_alu(b, opcode, w, count);
      break;
 
+   case SpvOpSDotKHR:
+   case SpvOpUDotKHR:
+   case SpvOpSUDotKHR:
+   case SpvOpSDotAccSatKHR:
+   case SpvOpUDotAccSatKHR:
+   case SpvOpSUDotAccSatKHR:
+      vtn_handle_integer_dot(b, opcode, w, count);
+      break;
+
    case SpvOpBitcast:
       vtn_handle_bitcast(b, w, count);
       break;
@@ -5818,6 +5994,10 @@ vtn_handle_body_instruction(struct vtn_builder *b, SpvOp opcode,
      vtn_handle_opencl_core_instruction(b, opcode, w, count);
      break;
 
+   case SpvOpWritePackedPrimitiveIndices4x8NV:
+      vtn_handle_write_packed_primitive_indices(b, opcode, w, count);
+      break;
+
    default:
       vtn_fail_with_opcode("Unhandled opcode", opcode);
    }
@@ -5876,6 +6056,29 @@ vtn_create_builder(const uint32_t *words, size_t word_count,
       (b->generator_id == vtn_generator_glslang_reference_front_end &&
       generator_version < 3);
 
+   /* Identifying the LLVM-SPIRV translator:
+    *
+    * The LLVM-SPIRV translator currently doesn't store any generator ID [1].
+    * Our use case involving the SPIRV-Tools linker also means we want to
+    * check for that tool instead.  Finally, the SPIRV-Tools linker also
+    * stores its generator ID in the wrong location [2].
+    *
+    * [1] : https://github.com/KhronosGroup/SPIRV-LLVM-Translator/pull/1223
+    * [2] : https://github.com/KhronosGroup/SPIRV-Tools/pull/4549
+    */
+   const bool is_llvm_spirv_translator =
+      (b->generator_id == 0 &&
+       generator_version == vtn_generator_spirv_tools_linker) ||
+      b->generator_id == vtn_generator_spirv_tools_linker;
+
+   /* The LLVM-SPIRV translator generates Undef initializers for _local
+    * variables [1].
+    *
+    * [1] : https://github.com/KhronosGroup/SPIRV-LLVM-Translator/issues/1224
+    */
+   b->wa_llvm_spirv_ignore_workgroup_initializer =
+      b->options->environment == NIR_SPIRV_OPENCL && is_llvm_spirv_translator;
+
    /* words[2] == generator magic */
    unsigned value_id_bound = words[3];
    if (words[4] != 0) {
@@ -5925,22 +6128,25 @@ vtn_emit_kernel_entry_point_wrapper(struct vtn_builder *b,
 
       /* input variable */
       nir_variable *in_var = rzalloc(b->nb.shader, nir_variable);
-      in_var->data.mode = nir_var_uniform;
-      in_var->data.read_only = true;
-      in_var->data.location = i;
-      if (param_type->base_type == vtn_base_type_image) {
+
+      if (is_by_val) {
+         in_var->data.mode = nir_var_uniform;
+         in_var->type = param_type->deref->type;
+      } else if (param_type->base_type == vtn_base_type_image) {
+         in_var->data.mode = nir_var_image;
+         in_var->type = param_type->glsl_image;
         in_var->data.access =
           spirv_to_gl_access_qualifier(b, param_type->access_qualifier);
+      } else if (param_type->base_type == vtn_base_type_sampler) {
+         in_var->data.mode = nir_var_uniform;
+         in_var->type = glsl_bare_sampler_type();
+      } else {
+         in_var->data.mode = nir_var_uniform;
+         in_var->type = param_type->type;
       }
 
-      if (is_by_val)
-         in_var->type = param_type->deref->type;
-      else if (param_type->base_type == vtn_base_type_image)
-         in_var->type = param_type->glsl_image;
-      else if (param_type->base_type == vtn_base_type_sampler)
-         in_var->type = glsl_bare_sampler_type();
-      else
-         in_var->type = param_type->type;
+      in_var->data.read_only = true;
+      in_var->data.location = i;
 
       nir_shader_add_variable(b->nb.shader, in_var);
diff --git a/mesa 3D driver/src/compiler/spirv/vtn_alu.c b/mesa 3D driver/src/compiler/spirv/vtn_alu.c
index 48f41ac249..a9bdb4c3cf 100644
--- a/mesa 3D driver/src/compiler/spirv/vtn_alu.c
+++ b/mesa 3D driver/src/compiler/spirv/vtn_alu.c
@@ -378,8 +378,9 @@ vtn_nir_alu_op_for_spirv_opcode(struct vtn_builder *b,
 }
 
 static void
-handle_no_contraction(struct vtn_builder *b, struct vtn_value *val, int member,
-                      const struct vtn_decoration *dec, void *_void)
+handle_no_contraction(struct vtn_builder *b, UNUSED struct vtn_value *val,
+                      UNUSED int member, const struct vtn_decoration *dec,
+                      UNUSED void *_void)
 {
    vtn_assert(dec->scope == VTN_DEC_DECORATION);
    if (dec->decoration != SpvDecorationNoContraction)
@@ -423,7 +424,8 @@ struct conversion_opts {
 };
 
 static void
-handle_conversion_opts(struct vtn_builder *b, struct vtn_value *val, int member,
+handle_conversion_opts(struct vtn_builder *b, UNUSED struct vtn_value *val,
+                       UNUSED int member,
                        const struct vtn_decoration *dec, void *_opts)
 {
    struct conversion_opts *opts = _opts;
@@ -445,7 +447,8 @@ handle_conversion_opts(struct vtn_builder *b, struct vtn_value *val, int member,
 }
 
 static void
-handle_no_wrap(struct vtn_builder *b, struct vtn_value *val, int member,
+handle_no_wrap(UNUSED struct vtn_builder *b, UNUSED struct vtn_value *val,
+               UNUSED int member,
                const struct vtn_decoration *dec, void *_alu)
 {
    nir_alu_instr *alu = _alu;
@@ -597,8 +600,29 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
       break;
    }
 
-   case SpvOpFUnordEqual:
-   case SpvOpFUnordNotEqual:
+   case SpvOpFUnordEqual: {
+      const bool save_exact = b->nb.exact;
+
+      b->nb.exact = true;
+
+      /* This could also be implemented as !(a < b || b < a).  If one or both
+       * of the sources are numbers, later optimization passes can easily
+       * eliminate the isnan() checks.  This may trim the sequence down to a
+       * single (a == b) operation.  Otherwise, the optimizer can transform
+       * whatever is left to !(a < b || b < a).  Since some applications will
Since some applications will + * open-code this sequence, these optimizations are needed anyway. + */ + dest->def = + nir_ior(&b->nb, + nir_feq(&b->nb, src[0], src[1]), + nir_ior(&b->nb, + nir_fneu(&b->nb, src[0], src[0]), + nir_fneu(&b->nb, src[1], src[1]))); + + b->nb.exact = save_exact; + break; + } + case SpvOpFUnordLessThan: case SpvOpFUnordGreaterThan: case SpvOpFUnordLessThanEqual: @@ -621,12 +645,16 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode, b->nb.exact = true; + /* Use the property FUnordLessThan(a, b) ≡ !FOrdGreaterThanEqual(a, b). */ + switch (op) { + case nir_op_fge: op = nir_op_flt; break; + case nir_op_flt: op = nir_op_fge; break; + default: unreachable("Impossible opcode."); + } + dest->def = - nir_ior(&b->nb, - nir_build_alu(&b->nb, op, src[0], src[1], NULL, NULL), - nir_ior(&b->nb, - nir_fneu(&b->nb, src[0], src[0]), - nir_fneu(&b->nb, src[1], src[1]))); + nir_inot(&b->nb, + nir_build_alu(&b->nb, op, src[0], src[1], NULL, NULL)); b->nb.exact = save_exact; break; @@ -638,23 +666,20 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode, * from the ALU will probably already be false if the operands are not * ordered so we don’t need to handle it specially. */ - bool swap; - bool exact; - unsigned src_bit_size = glsl_get_bit_size(vtn_src[0]->type); - unsigned dst_bit_size = glsl_get_bit_size(dest_type); - nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap, &exact, - src_bit_size, dst_bit_size); - - assert(!swap); - assert(exact); - const bool save_exact = b->nb.exact; b->nb.exact = true; + /* This could also be implemented as (a < b || b < a). If one or both + * of the source are numbers, later optimization passes can easily + * eliminate the isnan() checks. This may trim the sequence down to a + * single (a != b) operation. Otherwise, the optimizer can transform + * whatever is left to (a < b || b < a). Since some applications will + * open-code this sequence, these optimizations are needed anyway. + */ dest->def = nir_iand(&b->nb, - nir_build_alu(&b->nb, op, src[0], src[1], NULL, NULL), + nir_fneu(&b->nb, src[0], src[1]), nir_iand(&b->nb, nir_feq(&b->nb, src[0], src[0]), nir_feq(&b->nb, src[1], src[1]))); @@ -765,6 +790,14 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode, break; } + case SpvOpSDotKHR: + case SpvOpUDotKHR: + case SpvOpSUDotKHR: + case SpvOpSDotAccSatKHR: + case SpvOpUDotAccSatKHR: + case SpvOpSUDotAccSatKHR: + unreachable("Should have called vtn_handle_integer_dot instead."); + default: { bool swap; bool exact; @@ -823,6 +856,290 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode, b->nb.exact = b->exact; } +void +vtn_handle_integer_dot(struct vtn_builder *b, SpvOp opcode, + const uint32_t *w, unsigned count) +{ + struct vtn_value *dest_val = vtn_untyped_value(b, w[2]); + const struct glsl_type *dest_type = vtn_get_type(b, w[1])->type; + const unsigned dest_size = glsl_get_bit_size(dest_type); + + vtn_handle_no_contraction(b, dest_val); + + /* Collect the various SSA sources. + * + * Due to the optional "Packed Vector Format" field, determine number of + * inputs from the opcode. This differs from vtn_handle_alu. + */ + const unsigned num_inputs = (opcode == SpvOpSDotAccSatKHR || + opcode == SpvOpUDotAccSatKHR || + opcode == SpvOpSUDotAccSatKHR) ? 
3 : 2; + + vtn_assert(count >= num_inputs + 3); + + struct vtn_ssa_value *vtn_src[3] = { NULL, }; + nir_ssa_def *src[3] = { NULL, }; + + for (unsigned i = 0; i < num_inputs; i++) { + vtn_src[i] = vtn_ssa_value(b, w[i + 3]); + src[i] = vtn_src[i]->def; + + vtn_assert(glsl_type_is_vector_or_scalar(vtn_src[i]->type)); + } + + /* For all of the opcodes *except* SpvOpSUDotKHR and SpvOpSUDotAccSatKHR, + * the SPV_KHR_integer_dot_product spec says: + * + * _Vector 1_ and _Vector 2_ must have the same type. + * + * The practical requirement is the same bit-size and the same number of + * components. + */ + vtn_fail_if(glsl_get_bit_size(vtn_src[0]->type) != + glsl_get_bit_size(vtn_src[1]->type) || + glsl_get_vector_elements(vtn_src[0]->type) != + glsl_get_vector_elements(vtn_src[1]->type), + "Vector 1 and vector 2 source of opcode %s must have the same " + "type", + spirv_op_to_string(opcode)); + + if (num_inputs == 3) { + /* The SPV_KHR_integer_dot_product spec says: + * + * The type of Accumulator must be the same as Result Type. + * + * The handling of SpvOpSDotAccSatKHR and friends with the packed 4x8 + * types (far below) assumes these types have the same size. + */ + vtn_fail_if(dest_type != vtn_src[2]->type, + "Accumulator type must be the same as Result Type for " + "opcode %s", + spirv_op_to_string(opcode)); + } + + unsigned packed_bit_size = 8; + if (glsl_type_is_vector(vtn_src[0]->type)) { + /* FINISHME: Is this actually as good or better for platforms that don't + * have the special instructions (i.e., one or both of has_dot_4x8 or + * has_sudot_4x8 is false)? + */ + if (glsl_get_vector_elements(vtn_src[0]->type) == 4 && + glsl_get_bit_size(vtn_src[0]->type) == 8 && + glsl_get_bit_size(dest_type) <= 32) { + src[0] = nir_pack_32_4x8(&b->nb, src[0]); + src[1] = nir_pack_32_4x8(&b->nb, src[1]); + } else if (glsl_get_vector_elements(vtn_src[0]->type) == 2 && + glsl_get_bit_size(vtn_src[0]->type) == 16 && + glsl_get_bit_size(dest_type) <= 32 && + opcode != SpvOpSUDotKHR && + opcode != SpvOpSUDotAccSatKHR) { + src[0] = nir_pack_32_2x16(&b->nb, src[0]); + src[1] = nir_pack_32_2x16(&b->nb, src[1]); + packed_bit_size = 16; + } + } else if (glsl_type_is_scalar(vtn_src[0]->type) && + glsl_type_is_32bit(vtn_src[0]->type)) { + /* The SPV_KHR_integer_dot_product spec says: + * + * When _Vector 1_ and _Vector 2_ are scalar integer types, _Packed + * Vector Format_ must be specified to select how the integers are to + * be interpreted as vectors. + * + * The "Packed Vector Format" value follows the last input. 
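+    *
+    * E.g. (our reading of SPV_KHR_integer_dot_product; worth double-checking
+    * against the spec text): under the 4x8-bit format the scalar 0x04030201
+    * is read as the byte vector (1, 2, 3, 4), with component 0 taken from
+    * the least significant byte.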
+ */ + vtn_assert(count == (num_inputs + 4)); + const SpvPackedVectorFormat pack_format = w[num_inputs + 3]; + vtn_fail_if(pack_format != SpvPackedVectorFormatPackedVectorFormat4x8BitKHR, + "Unsupported vector packing format %d for opcode %s", + pack_format, spirv_op_to_string(opcode)); + } else { + vtn_fail_with_opcode("Invalid source types.", opcode); + } + + nir_ssa_def *dest = NULL; + + if (src[0]->num_components > 1) { + const nir_op s_conversion_op = + nir_type_conversion_op(nir_type_int, nir_type_int | dest_size, + nir_rounding_mode_undef); + + const nir_op u_conversion_op = + nir_type_conversion_op(nir_type_uint, nir_type_uint | dest_size, + nir_rounding_mode_undef); + + nir_op src0_conversion_op; + nir_op src1_conversion_op; + + switch (opcode) { + case SpvOpSDotKHR: + case SpvOpSDotAccSatKHR: + src0_conversion_op = s_conversion_op; + src1_conversion_op = s_conversion_op; + break; + + case SpvOpUDotKHR: + case SpvOpUDotAccSatKHR: + src0_conversion_op = u_conversion_op; + src1_conversion_op = u_conversion_op; + break; + + case SpvOpSUDotKHR: + case SpvOpSUDotAccSatKHR: + src0_conversion_op = s_conversion_op; + src1_conversion_op = u_conversion_op; + break; + + default: + unreachable("Invalid opcode."); + } + + /* The SPV_KHR_integer_dot_product spec says: + * + * All components of the input vectors are sign-extended to the bit + * width of the result's type. The sign-extended input vectors are + * then multiplied component-wise and all components of the vector + * resulting from the component-wise multiplication are added + * together. The resulting value will equal the low-order N bits of + * the correct result R, where N is the result width and R is + * computed with enough precision to avoid overflow and underflow. + */ + const unsigned vector_components = + glsl_get_vector_elements(vtn_src[0]->type); + + for (unsigned i = 0; i < vector_components; i++) { + nir_ssa_def *const src0 = + nir_build_alu(&b->nb, src0_conversion_op, + nir_channel(&b->nb, src[0], i), NULL, NULL, NULL); + + nir_ssa_def *const src1 = + nir_build_alu(&b->nb, src1_conversion_op, + nir_channel(&b->nb, src[1], i), NULL, NULL, NULL); + + nir_ssa_def *const mul_result = nir_imul(&b->nb, src0, src1); + + dest = (i == 0) ? mul_result : nir_iadd(&b->nb, dest, mul_result); + } + + if (num_inputs == 3) { + /* For SpvOpSDotAccSatKHR, the SPV_KHR_integer_dot_product spec says: + * + * Signed integer dot product of _Vector 1_ and _Vector 2_ and + * signed saturating addition of the result with _Accumulator_. + * + * For SpvOpUDotAccSatKHR, the SPV_KHR_integer_dot_product spec says: + * + * Unsigned integer dot product of _Vector 1_ and _Vector 2_ and + * unsigned saturating addition of the result with _Accumulator_. + * + * For SpvOpSUDotAccSatKHR, the SPV_KHR_integer_dot_product spec says: + * + * Mixed-signedness integer dot product of _Vector 1_ and _Vector + * 2_ and signed saturating addition of the result with + * _Accumulator_. + */ + dest = (opcode == SpvOpUDotAccSatKHR) + ? 
nir_uadd_sat(&b->nb, dest, src[2]) + : nir_iadd_sat(&b->nb, dest, src[2]); + } + } else { + assert(src[0]->num_components == 1 && src[1]->num_components == 1); + assert(src[0]->bit_size == 32 && src[1]->bit_size == 32); + + nir_ssa_def *const zero = nir_imm_zero(&b->nb, 1, 32); + bool is_signed = opcode == SpvOpSDotKHR || opcode == SpvOpSUDotKHR || + opcode == SpvOpSDotAccSatKHR || opcode == SpvOpSUDotAccSatKHR; + + if (packed_bit_size == 16) { + switch (opcode) { + case SpvOpSDotKHR: + dest = nir_sdot_2x16_iadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpUDotKHR: + dest = nir_udot_2x16_uadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpSDotAccSatKHR: + if (dest_size == 32) + dest = nir_sdot_2x16_iadd_sat(&b->nb, src[0], src[1], src[2]); + else + dest = nir_sdot_2x16_iadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpUDotAccSatKHR: + if (dest_size == 32) + dest = nir_udot_2x16_uadd_sat(&b->nb, src[0], src[1], src[2]); + else + dest = nir_udot_2x16_uadd(&b->nb, src[0], src[1], zero); + break; + default: + unreachable("Invalid opcode."); + } + } else { + switch (opcode) { + case SpvOpSDotKHR: + dest = nir_sdot_4x8_iadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpUDotKHR: + dest = nir_udot_4x8_uadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpSUDotKHR: + dest = nir_sudot_4x8_iadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpSDotAccSatKHR: + if (dest_size == 32) + dest = nir_sdot_4x8_iadd_sat(&b->nb, src[0], src[1], src[2]); + else + dest = nir_sdot_4x8_iadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpUDotAccSatKHR: + if (dest_size == 32) + dest = nir_udot_4x8_uadd_sat(&b->nb, src[0], src[1], src[2]); + else + dest = nir_udot_4x8_uadd(&b->nb, src[0], src[1], zero); + break; + case SpvOpSUDotAccSatKHR: + if (dest_size == 32) + dest = nir_sudot_4x8_iadd_sat(&b->nb, src[0], src[1], src[2]); + else + dest = nir_sudot_4x8_iadd(&b->nb, src[0], src[1], zero); + break; + default: + unreachable("Invalid opcode."); + } + } + + if (dest_size != 32) { + /* When the accumulator is 32-bits, a NIR dot-product with saturate + * is generated above. In all other cases a regular dot-product is + * generated above, and separate addition with saturate is generated + * here. + * + * The SPV_KHR_integer_dot_product spec says: + * + * If any of the multiplications or additions, with the exception + * of the final accumulation, overflow or underflow, the result of + * the instruction is undefined. + * + * Therefore it is safe to cast the dot-product result down to the + * size of the accumulator before doing the addition. Since the + * result of the dot-product cannot overflow 32-bits, this is also + * safe to cast up. + */ + if (num_inputs == 3) { + dest = is_signed + ? nir_iadd_sat(&b->nb, nir_i2i(&b->nb, dest, dest_size), src[2]) + : nir_uadd_sat(&b->nb, nir_u2u(&b->nb, dest, dest_size), src[2]); + } else { + dest = is_signed + ? nir_i2i(&b->nb, dest, dest_size) + : nir_u2u(&b->nb, dest, dest_size); + } + } + } + + vtn_push_nir_ssa(b, w[2], dest); + + b->nb.exact = b->exact; +} + void vtn_handle_bitcast(struct vtn_builder *b, const uint32_t *w, unsigned count) { diff --git a/mesa 3D driver/src/compiler/spirv/vtn_private.h b/mesa 3D driver/src/compiler/spirv/vtn_private.h index f2cfe14440..a2d2cdcdb1 100644 --- a/mesa 3D driver/src/compiler/spirv/vtn_private.h +++ b/mesa 3D driver/src/compiler/spirv/vtn_private.h @@ -604,6 +604,9 @@ struct vtn_value { /* Valid for vtn_value_type_constant to indicate the value is OpConstantNull. 
*/ bool is_null_constant:1; + /* Valid when all the members of the value are undef. */ + bool is_undef_constant:1; + const char *name; struct vtn_decoration *decoration; struct vtn_type *type; @@ -696,6 +699,9 @@ struct vtn_builder { /* True if we need to fix up CS OpControlBarrier */ bool wa_glslang_cs_barrier; + /* True if we need to ignore undef initializers */ + bool wa_llvm_spirv_ignore_workgroup_initializer; + /* Workaround discard bugs in HLSL -> SPIR-V compilers */ bool uses_demote_to_helper_invocation; bool convert_discard_to_demote; @@ -731,6 +737,10 @@ struct vtn_pointer * vtn_pointer_from_ssa(struct vtn_builder *b, nir_ssa_def *ssa, struct vtn_type *ptr_type); +struct vtn_ssa_value * +vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, + const struct glsl_type *type); + static inline struct vtn_value * vtn_untyped_value(struct vtn_builder *b, uint32_t value_id) { @@ -781,6 +791,35 @@ vtn_value(struct vtn_builder *b, uint32_t value_id, return val; } +static inline struct vtn_value * +vtn_pointer_value(struct vtn_builder *b, uint32_t value_id) +{ + struct vtn_value *val = vtn_untyped_value(b, value_id); + vtn_fail_if(val->value_type != vtn_value_type_pointer && + !val->is_null_constant, + "SPIR-V id %u is the wrong kind of value", value_id); + return val; +} + +static inline struct vtn_pointer * +vtn_value_to_pointer(struct vtn_builder *b, struct vtn_value *value) +{ + if (value->is_null_constant) { + vtn_assert(glsl_type_is_vector_or_scalar(value->type->type)); + nir_ssa_def *const_ssa = + vtn_const_ssa_value(b, value->constant, value->type->type)->def; + return vtn_pointer_from_ssa(b, const_ssa, value->type); + } + vtn_assert(value->value_type == vtn_value_type_pointer); + return value->pointer; +} + +static inline struct vtn_pointer * +vtn_pointer(struct vtn_builder *b, uint32_t value_id) +{ + return vtn_value_to_pointer(b, vtn_pointer_value(b, value_id)); +} + bool vtn_set_instruction_result_type(struct vtn_builder *b, SpvOp opcode, const uint32_t *w, unsigned count); @@ -919,6 +958,9 @@ nir_op vtn_nir_alu_op_for_spirv_opcode(struct vtn_builder *b, void vtn_handle_alu(struct vtn_builder *b, SpvOp opcode, const uint32_t *w, unsigned count); +void vtn_handle_integer_dot(struct vtn_builder *b, SpvOp opcode, + const uint32_t *w, unsigned count); + void vtn_handle_bitcast(struct vtn_builder *b, const uint32_t *w, unsigned count); diff --git a/mesa 3D driver/src/compiler/spirv/vtn_variables.c b/mesa 3D driver/src/compiler/spirv/vtn_variables.c index d281061a31..8657d5b81b 100644 --- a/mesa 3D driver/src/compiler/spirv/vtn_variables.c +++ b/mesa 3D driver/src/compiler/spirv/vtn_variables.c @@ -518,7 +518,7 @@ _vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_instr *deref, nir_deref_instr * vtn_nir_deref(struct vtn_builder *b, uint32_t id) { - struct vtn_pointer *ptr = vtn_value(b, id, vtn_value_type_pointer)->pointer; + struct vtn_pointer *ptr = vtn_pointer(b, id); return vtn_pointer_to_deref(b, ptr); } @@ -597,7 +597,8 @@ _vtn_variable_load_store(struct vtn_builder *b, bool load, enum gl_access_qualifier access, struct vtn_ssa_value **inout) { - if (ptr->mode == vtn_variable_mode_uniform) { + if (ptr->mode == vtn_variable_mode_uniform || + ptr->mode == vtn_variable_mode_image) { if (ptr->type->base_type == vtn_base_type_image || ptr->type->base_type == vtn_base_type_sampler) { /* See also our handling of OpTypeSampler and OpTypeImage */ @@ -787,15 +788,18 @@ vtn_get_builtin_location(struct vtn_builder *b, { switch (builtin) { case SpvBuiltInPosition: 
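+   /* The *PerViewNV builtins below alias the base slots; the per-view
+    * arraying itself is tracked separately via the PerViewNV decoration
+    * (see apply_var_decoration).
+    */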
+ case SpvBuiltInPositionPerViewNV: *location = VARYING_SLOT_POS; break; case SpvBuiltInPointSize: *location = VARYING_SLOT_PSIZ; break; case SpvBuiltInClipDistance: - *location = VARYING_SLOT_CLIP_DIST0; /* XXX CLIP_DIST1? */ + case SpvBuiltInClipDistancePerViewNV: + *location = VARYING_SLOT_CLIP_DIST0; break; case SpvBuiltInCullDistance: + case SpvBuiltInCullDistancePerViewNV: *location = VARYING_SLOT_CULL_DIST0; break; case SpvBuiltInVertexId: @@ -840,7 +844,8 @@ vtn_get_builtin_location(struct vtn_builder *b, *mode = nir_var_shader_out; else if (b->options && b->options->caps.shader_viewport_index_layer && (b->shader->info.stage == MESA_SHADER_VERTEX || - b->shader->info.stage == MESA_SHADER_TESS_EVAL)) + b->shader->info.stage == MESA_SHADER_TESS_EVAL || + b->shader->info.stage == MESA_SHADER_MESH)) *mode = nir_var_shader_out; else vtn_fail("invalid stage for SpvBuiltInLayer"); @@ -851,7 +856,8 @@ vtn_get_builtin_location(struct vtn_builder *b, *mode = nir_var_shader_out; else if (b->options && b->options->caps.shader_viewport_index_layer && (b->shader->info.stage == MESA_SHADER_VERTEX || - b->shader->info.stage == MESA_SHADER_TESS_EVAL)) + b->shader->info.stage == MESA_SHADER_TESS_EVAL || + b->shader->info.stage == MESA_SHADER_MESH)) *mode = nir_var_shader_out; else if (b->shader->info.stage == MESA_SHADER_FRAGMENT) *mode = nir_var_shader_in; @@ -874,16 +880,13 @@ vtn_get_builtin_location(struct vtn_builder *b, break; case SpvBuiltInFragCoord: vtn_assert(*mode == nir_var_shader_in); - if (b->options && b->options->frag_coord_is_sysval) { - *mode = nir_var_system_value; - *location = SYSTEM_VALUE_FRAG_COORD; - } else { - *location = VARYING_SLOT_POS; - } + *mode = nir_var_system_value; + *location = SYSTEM_VALUE_FRAG_COORD; break; case SpvBuiltInPointCoord: - *location = VARYING_SLOT_PNTC; vtn_assert(*mode == nir_var_shader_in); + set_mode_system_value(b, mode); + *location = SYSTEM_VALUE_POINT_COORD; break; case SpvBuiltInFrontFacing: *location = SYSTEM_VALUE_FRONT_FACE; @@ -1123,6 +1126,15 @@ vtn_get_builtin_location(struct vtn_builder *b, vtn_fail("invalid stage for SpvBuiltInPrimitiveShadingRateKHR"); } break; + case SpvBuiltInPrimitiveCountNV: + *location = VARYING_SLOT_PRIMITIVE_COUNT; + break; + case SpvBuiltInPrimitiveIndicesNV: + *location = VARYING_SLOT_PRIMITIVE_INDICES; + break; + case SpvBuiltInTaskCountNV: + *location = VARYING_SLOT_TASK_COUNT; + break; default: vtn_fail("Unsupported builtin: %s (%u)", spirv_builtin_to_string(builtin), builtin); @@ -1276,18 +1288,64 @@ apply_var_decoration(struct vtn_builder *b, /* TODO: We should actually plumb alias information through NIR. */ break; + case SpvDecorationPerPrimitiveNV: + vtn_fail_if( + !(b->shader->info.stage == MESA_SHADER_MESH && var_data->mode == nir_var_shader_out) && + !(b->shader->info.stage == MESA_SHADER_FRAGMENT && var_data->mode == nir_var_shader_in), + "PerPrimitiveNV decoration only allowed for Mesh shader outputs or Fragment shader inputs"); + var_data->per_primitive = true; + break; + + case SpvDecorationPerTaskNV: + vtn_fail_if( + !(b->shader->info.stage == MESA_SHADER_TASK && var_data->mode == nir_var_shader_out) && + !(b->shader->info.stage == MESA_SHADER_MESH && var_data->mode == nir_var_shader_in), + "PerTaskNV decoration only allowed for Task shader outputs or Mesh shader inputs"); + /* Don't set anything, because this decoration is implied by being a + * non-builtin Task Output or Mesh Input. 
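+    *
+    * (In GLSL_NV_mesh_shader terms this is roughly the
+    * "taskNV out Task { ... }" / "taskNV in Task { ... }" block pair,
+    * assuming the usual GLSL lowering; no extra NIR flag is needed.)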
+ */ + break; + + case SpvDecorationPerViewNV: + vtn_fail_if(b->shader->info.stage != MESA_SHADER_MESH, + "PerViewNV decoration only allowed in Mesh shaders"); + var_data->per_view = true; + break; + default: vtn_fail_with_decoration("Unhandled decoration", dec->decoration); } } static void -var_is_patch_cb(struct vtn_builder *b, struct vtn_value *val, int member, - const struct vtn_decoration *dec, void *void_var) +gather_var_kind_cb(struct vtn_builder *b, struct vtn_value *val, int member, + const struct vtn_decoration *dec, void *void_var) { struct vtn_variable *vtn_var = void_var; - if (dec->decoration == SpvDecorationPatch) + switch (dec->decoration) { + case SpvDecorationPatch: vtn_var->var->data.patch = true; + break; + case SpvDecorationPerPrimitiveNV: + vtn_var->var->data.per_primitive = true; + break; + case SpvDecorationBuiltIn: + if (b->shader->info.stage == MESA_SHADER_MESH) { + SpvBuiltIn builtin = dec->operands[0]; + switch (builtin) { + case SpvBuiltInPrimitiveIndicesNV: + vtn_var->var->data.per_primitive = true; + break; + default: + /* Nothing to do. */ + break; + } + } + break; + default: + /* Nothing to do. */ + break; + } } static void @@ -1358,7 +1416,8 @@ var_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member, } else if (vtn_var->mode == vtn_variable_mode_call_data || vtn_var->mode == vtn_variable_mode_ray_payload) { /* This location is fine as-is */ - } else if (vtn_var->mode != vtn_variable_mode_uniform) { + } else if (vtn_var->mode != vtn_variable_mode_uniform && + vtn_var->mode != vtn_variable_mode_image) { vtn_warn("Location must be on input, output, uniform, sampler or " "image variable"); return; @@ -1441,7 +1500,19 @@ vtn_storage_class_to_mode(struct vtn_builder *b, nir_mode = nir_var_mem_global; break; case SpvStorageClassUniformConstant: - if (b->shader->info.stage == MESA_SHADER_KERNEL) { + /* interface_type is only NULL when OpTypeForwardPointer is used and + * OpTypeForwardPointer can only be used for struct types, not images or + * acceleration structures. + */ + if (interface_type) + interface_type = vtn_type_without_array(interface_type); + + if (interface_type && + interface_type->base_type == vtn_base_type_image && + glsl_type_is_image(interface_type->glsl_image)) { + mode = vtn_variable_mode_image; + nir_mode = nir_var_image; + } else if (b->shader->info.stage == MESA_SHADER_KERNEL) { mode = vtn_variable_mode_constant; nir_mode = nir_var_mem_constant; } else { @@ -1450,7 +1521,6 @@ vtn_storage_class_to_mode(struct vtn_builder *b, * storage class. 
*/ assert(interface_type != NULL); - interface_type = vtn_type_without_array(interface_type); if (interface_type->base_type == vtn_base_type_accel_struct) { mode = vtn_variable_mode_accel_struct; nir_mode = nir_var_uniform; @@ -1494,7 +1564,7 @@ vtn_storage_class_to_mode(struct vtn_builder *b, break; case SpvStorageClassImage: mode = vtn_variable_mode_image; - nir_mode = nir_var_mem_ubo; + nir_mode = nir_var_image; break; case SpvStorageClassCallableDataKHR: mode = vtn_variable_mode_call_data; @@ -1776,7 +1846,10 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val, break; case vtn_variable_mode_image: - vtn_fail("Cannot create a variable with the Image storage class"); + if (storage_class == SpvStorageClassImage) + vtn_fail("Cannot create a variable with the Image storage class"); + else + vtn_assert(storage_class == SpvStorageClassUniformConstant); break; case vtn_variable_mode_phys_ssbo: @@ -1809,6 +1882,7 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val, case vtn_variable_mode_constant: case vtn_variable_mode_call_data: case vtn_variable_mode_call_data_in: + case vtn_variable_mode_image: case vtn_variable_mode_ray_payload: case vtn_variable_mode_ray_payload_in: case vtn_variable_mode_hit_attrib: @@ -1878,12 +1952,12 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val, * it to be all or nothing, we'll call it patch if any of the members * are declared patch. */ - vtn_foreach_decoration(b, val, var_is_patch_cb, var); + vtn_foreach_decoration(b, val, gather_var_kind_cb, var); if (glsl_type_is_array(var->type->type) && glsl_type_is_struct_or_ifc(without_array->type)) { vtn_foreach_decoration(b, vtn_value(b, without_array->id, vtn_value_type_type), - var_is_patch_cb, var); + gather_var_kind_cb, var); } struct vtn_type *per_vertex_type = var->type; @@ -1935,16 +2009,35 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val, vtn_foreach_decoration(b, vtn_value(b, per_vertex_type->id, vtn_value_type_type), var_decoration_cb, var); + + /* PerTask I/O is always a single block without any Location, so + * initialize the base_location of the block and let + * assign_missing_member_locations() do the rest. + */ + if ((b->shader->info.stage == MESA_SHADER_TASK && var->mode == vtn_variable_mode_output) || + (b->shader->info.stage == MESA_SHADER_MESH && var->mode == vtn_variable_mode_input)) { + if (var->type->block) + var->base_location = VARYING_SLOT_VAR0; + } + break; } - case vtn_variable_mode_image: case vtn_variable_mode_phys_ssbo: case vtn_variable_mode_generic: unreachable("Should have been caught before"); } - if (initializer) { + /* Ignore incorrectly generated Undef initializers. */ + if (b->wa_llvm_spirv_ignore_workgroup_initializer && + initializer && + storage_class == SpvStorageClassWorkgroup) + initializer = NULL; + + /* Only initialize variable when there is an initializer and it's not + * undef. + */ + if (initializer && !initializer->is_undef_constant) { switch (storage_class) { case SpvStorageClassWorkgroup: /* VK_KHR_zero_initialize_workgroup_memory. */ @@ -2028,6 +2121,7 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val, } if (var->mode == vtn_variable_mode_uniform || + var->mode == vtn_variable_mode_image || var->mode == vtn_variable_mode_ssbo) { /* SSBOs and images are assumed to not alias in the Simple, GLSL and Vulkan memory models */ var->var->data.access |= b->mem_model != SpvMemoryModelOpenCL ? 
ACCESS_RESTRICT : 0; @@ -2046,6 +2140,7 @@ vtn_create_variable(struct vtn_builder *b, struct vtn_value *val, } if (var->mode == vtn_variable_mode_uniform || + var->mode == vtn_variable_mode_image || var->mode == vtn_variable_mode_ubo || var->mode == vtn_variable_mode_ssbo || var->mode == vtn_variable_mode_atomic_counter) { @@ -2176,10 +2271,14 @@ vtn_get_mem_operands(struct vtn_builder *b, const uint32_t *w, unsigned count, static enum gl_access_qualifier spv_access_to_gl_access(SpvMemoryAccessMask access) { - if (access & SpvMemoryAccessVolatileMask) - return ACCESS_VOLATILE; + unsigned result = 0; - return 0; + if (access & SpvMemoryAccessVolatileMask) + result |= ACCESS_VOLATILE; + if (access & SpvMemoryAccessNontemporalMask) + result |= ACCESS_STREAM_CACHE_POLICY; + + return result; } @@ -2253,6 +2352,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, case SpvOpUndef: { struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_undef); val->type = vtn_get_type(b, w[1]); + val->is_undef_constant = true; break; } @@ -2335,8 +2435,8 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, } struct vtn_type *ptr_type = vtn_get_type(b, w[1]); - struct vtn_pointer *base = - vtn_value(b, w[3], vtn_value_type_pointer)->pointer; + + struct vtn_pointer *base = vtn_pointer(b, w[3]); /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/3406 */ access |= base->access & ACCESS_NON_UNIFORM; @@ -2349,10 +2449,10 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, } case SpvOpCopyMemory: { - struct vtn_value *dest_val = vtn_value(b, w[1], vtn_value_type_pointer); - struct vtn_value *src_val = vtn_value(b, w[2], vtn_value_type_pointer); - struct vtn_pointer *dest = dest_val->pointer; - struct vtn_pointer *src = src_val->pointer; + struct vtn_value *dest_val = vtn_pointer_value(b, w[1]); + struct vtn_value *src_val = vtn_pointer_value(b, w[2]); + struct vtn_pointer *dest = vtn_value_to_pointer(b, dest_val); + struct vtn_pointer *src = vtn_value_to_pointer(b, src_val); vtn_assert_types_equal(b, opcode, dest_val->type->deref, src_val->type->deref); @@ -2381,11 +2481,11 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, } case SpvOpCopyMemorySized: { - struct vtn_value *dest_val = vtn_value(b, w[1], vtn_value_type_pointer); - struct vtn_value *src_val = vtn_value(b, w[2], vtn_value_type_pointer); + struct vtn_value *dest_val = vtn_pointer_value(b, w[1]); + struct vtn_value *src_val = vtn_pointer_value(b, w[2]); nir_ssa_def *size = vtn_get_nir_ssa(b, w[3]); - struct vtn_pointer *dest = dest_val->pointer; - struct vtn_pointer *src = src_val->pointer; + struct vtn_pointer *dest = vtn_value_to_pointer(b, dest_val); + struct vtn_pointer *src = vtn_value_to_pointer(b, src_val); unsigned idx = 4, dest_alignment, src_alignment; SpvMemoryAccessMask dest_access, src_access; @@ -2416,7 +2516,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, case SpvOpLoad: { struct vtn_type *res_type = vtn_get_type(b, w[1]); struct vtn_value *src_val = vtn_value(b, w[3], vtn_value_type_pointer); - struct vtn_pointer *src = src_val->pointer; + struct vtn_pointer *src = vtn_value_to_pointer(b, src_val); vtn_assert_types_equal(b, opcode, res_type, src_val->type->deref); @@ -2433,8 +2533,8 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, } case SpvOpStore: { - struct vtn_value *dest_val = vtn_value(b, w[1], vtn_value_type_pointer); - struct vtn_pointer *dest = dest_val->pointer; + struct vtn_value *dest_val = vtn_pointer_value(b, w[1]); + struct vtn_pointer 
*dest = vtn_value_to_pointer(b, dest_val); struct vtn_value *src_val = vtn_untyped_value(b, w[2]); /* OpStore requires us to actually have a storage type */ @@ -2476,8 +2576,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, } case SpvOpArrayLength: { - struct vtn_pointer *ptr = - vtn_value(b, w[3], vtn_value_type_pointer)->pointer; + struct vtn_pointer *ptr = vtn_pointer(b, w[3]); const uint32_t field = w[4]; vtn_fail_if(ptr->type->base_type != vtn_base_type_struct, diff --git a/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.c b/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.c index 99b05579c7..d687533168 100644 --- a/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.c +++ b/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.c @@ -649,33 +649,53 @@ dri2_add_pbuffer_configs_for_visuals(_EGLDisplay *disp) return (config_count != 0); } -__DRIimage * -dri2_lookup_egl_image(__DRIscreen *screen, void *image, void *data) +GLboolean +dri2_validate_egl_image(void *image, void *data) { _EGLDisplay *disp = data; - struct dri2_egl_image *dri2_img; _EGLImage *img; - (void) screen; - mtx_lock(&disp->Mutex); img = _eglLookupImage(image, disp); mtx_unlock(&disp->Mutex); if (img == NULL) { - _eglError(EGL_BAD_PARAMETER, "dri2_lookup_egl_image"); - return NULL; + _eglError(EGL_BAD_PARAMETER, "dri2_validate_egl_image"); + return false; } + return true; +} + +__DRIimage * +dri2_lookup_egl_image_validated(void *image, void *data) +{ + struct dri2_egl_image *dri2_img; + + (void)data; + dri2_img = dri2_egl_image(image); return dri2_img->dri_image; } -const __DRIimageLookupExtension image_lookup_extension = { - .base = { __DRI_IMAGE_LOOKUP, 1 }, +__DRIimage * +dri2_lookup_egl_image(__DRIscreen *screen, void *image, void *data) +{ + (void) screen; - .lookupEGLImage = dri2_lookup_egl_image + if (!dri2_validate_egl_image(image, data)) + return NULL; + + return dri2_lookup_egl_image_validated(image, data); +} + +const __DRIimageLookupExtension image_lookup_extension = { + .base = { __DRI_IMAGE_LOOKUP, 2 }, + + .lookupEGLImage = dri2_lookup_egl_image, + .validateEGLImage = dri2_validate_egl_image, + .lookupEGLImageValidated = dri2_lookup_egl_image_validated, }; struct dri2_extension_match { diff --git a/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.h b/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.h index 6a7eedea11..9acc846687 100644 --- a/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.h +++ b/mesa 3D driver/src/egl/drivers/dri2/egl_dri2.h @@ -89,7 +89,7 @@ struct zwp_linux_dmabuf_v1; #include "util/u_vector.h" #include "util/bitset.h" -#define EGL_DRI2_MAX_FORMATS 10 +#define EGL_DRI2_MAX_FORMATS 11 struct wl_buffer; @@ -302,6 +302,7 @@ struct dri2_egl_surface struct wl_drm *wl_drm_wrapper; struct wl_callback *throttle_callback; int format; + bool resized; #endif #ifdef HAVE_DRM_PLATFORM @@ -421,6 +422,12 @@ dri2_setup_extensions(_EGLDisplay *disp); __DRIdrawable * dri2_surface_get_dri_drawable(_EGLSurface *surf); +GLboolean +dri2_validate_egl_image(void *image, void *data); + +__DRIimage * +dri2_lookup_egl_image_validated(void *image, void *data); + __DRIimage * dri2_lookup_egl_image(__DRIscreen *screen, void *image, void *data); diff --git a/mesa 3D driver/src/egl/drivers/dri2/platform_device.c b/mesa 3D driver/src/egl/drivers/dri2/platform_device.c index 3d979e982a..30f6f7e6b8 100644 --- a/mesa 3D driver/src/egl/drivers/dri2/platform_device.c +++ b/mesa 3D driver/src/egl/drivers/dri2/platform_device.c @@ -193,10 +193,23 @@ device_flush_front_buffer(__DRIdrawable *driDrawable, void *loaderPrivate) { } +static unsigned 
+device_get_capability(void *loaderPrivate, enum dri_loader_cap cap) +{ + /* Note: loaderPrivate is _EGLDisplay* */ + switch (cap) { + case DRI_LOADER_CAP_FP16: + return 1; + default: + return 0; + } +} + static const __DRIimageLoaderExtension image_loader_extension = { - .base = { __DRI_IMAGE_LOADER, 1 }, + .base = { __DRI_IMAGE_LOADER, 2 }, .getBuffers = device_image_get_buffers, .flushFrontBuffer = device_flush_front_buffer, + .getCapability = device_get_capability, }; static const __DRIextension *image_loader_extensions[] = { diff --git a/mesa 3D driver/src/egl/drivers/dri2/platform_drm.c b/mesa 3D driver/src/egl/drivers/dri2/platform_drm.c index 2b329437f8..6aada724a5 100644 --- a/mesa 3D driver/src/egl/drivers/dri2/platform_drm.c +++ b/mesa 3D driver/src/egl/drivers/dri2/platform_drm.c @@ -752,6 +752,8 @@ dri2_initialize_drm(_EGLDisplay *disp) dri2_dpy->driver_configs = dri2_dpy->gbm_dri->driver_configs; dri2_dpy->gbm_dri->lookup_image = dri2_lookup_egl_image; + dri2_dpy->gbm_dri->validate_image = dri2_validate_egl_image; + dri2_dpy->gbm_dri->lookup_image_validated = dri2_lookup_egl_image_validated; dri2_dpy->gbm_dri->lookup_user_data = disp; dri2_dpy->gbm_dri->get_buffers = dri2_drm_get_buffers; diff --git a/mesa 3D driver/src/egl/drivers/dri2/platform_wayland.c b/mesa 3D driver/src/egl/drivers/dri2/platform_wayland.c index cfff0ade90..e1f445e1de 100644 --- a/mesa 3D driver/src/egl/drivers/dri2/platform_wayland.c +++ b/mesa 3D driver/src/egl/drivers/dri2/platform_wayland.c @@ -128,6 +128,20 @@ static const struct dri2_wl_visual { { 16, 8, 0, 24 }, { 8, 8, 8, 8 }, }, + { + "ABGR8888", + WL_DRM_FORMAT_ABGR8888, WL_SHM_FORMAT_ABGR8888, + __DRI_IMAGE_FORMAT_ABGR8888, __DRI_IMAGE_FORMAT_NONE, 32, + { 0, 8, 16, 24 }, + { 8, 8, 8, 8 }, + }, + { + "XBGR8888", + WL_DRM_FORMAT_XBGR8888, WL_SHM_FORMAT_XBGR8888, + __DRI_IMAGE_FORMAT_XBGR8888, __DRI_IMAGE_FORMAT_NONE, 32, + { 0, 8, 16, -1 }, + { 8, 8, 8, 0 }, + }, { "RGB565", WL_DRM_FORMAT_RGB565, WL_SHM_FORMAT_RGB565, @@ -143,7 +157,8 @@ static_assert(ARRAY_SIZE(dri2_wl_visuals) <= EGL_DRI2_MAX_FORMATS, static int dri2_wl_visual_idx_from_config(struct dri2_egl_display *dri2_dpy, - const __DRIconfig *config) + const __DRIconfig *config, + bool force_opaque) { int shifts[4]; unsigned int sizes[4]; @@ -153,14 +168,14 @@ dri2_wl_visual_idx_from_config(struct dri2_egl_display *dri2_dpy, for (unsigned int i = 0; i < ARRAY_SIZE(dri2_wl_visuals); i++) { const struct dri2_wl_visual *wl_visual = &dri2_wl_visuals[i]; - if (shifts[0] == wl_visual->rgba_shifts[0] && - shifts[1] == wl_visual->rgba_shifts[1] && - shifts[2] == wl_visual->rgba_shifts[2] && - shifts[3] == wl_visual->rgba_shifts[3] && - sizes[0] == wl_visual->rgba_sizes[0] && - sizes[1] == wl_visual->rgba_sizes[1] && - sizes[2] == wl_visual->rgba_sizes[2] && - sizes[3] == wl_visual->rgba_sizes[3]) { + int cmp_rgb_shifts = memcmp(shifts, wl_visual->rgba_shifts, + 3 * sizeof(shifts[0])); + int cmp_rgb_sizes = memcmp(sizes, wl_visual->rgba_sizes, + 3 * sizeof(sizes[0])); + + if (cmp_rgb_shifts == 0 && cmp_rgb_sizes == 0 && + wl_visual->rgba_shifts[3] == (force_opaque ? -1 : shifts[3]) && + wl_visual->rgba_sizes[3] == (force_opaque ? 
0 : sizes[3])) { return i; } } @@ -215,7 +230,8 @@ dri2_wl_is_format_supported(void* user_data, uint32_t format) for (int i = 0; dri2_dpy->driver_configs[i]; i++) if (j == dri2_wl_visual_idx_from_config(dri2_dpy, - dri2_dpy->driver_configs[i])) + dri2_dpy->driver_configs[i], + false)) return true; return false; @@ -263,6 +279,8 @@ resize_callback(struct wl_egl_window *wl_win, void *data) dri2_surf->base.Height == wl_win->height) return; + dri2_surf->resized = true; + /* Update the surface size as soon as native window is resized; from user * pov, this makes the effect that resize is done immediately after native * window resize, without requiring to wait until the first draw. @@ -342,7 +360,42 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, dri2_surf->base.Width = window->width; dri2_surf->base.Height = window->height; - visual_idx = dri2_wl_visual_idx_from_config(dri2_dpy, config); +#ifndef NDEBUG + /* Enforce that every visual has an opaque variant (requirement to support + * EGL_EXT_present_opaque) + */ + for (unsigned int i = 0; i < ARRAY_SIZE(dri2_wl_visuals); i++) { + const struct dri2_wl_visual *transparent_visual = &dri2_wl_visuals[i]; + if (transparent_visual->rgba_sizes[3] == 0) { + continue; + } + + bool found_opaque_equivalent = false; + for (unsigned int j = 0; j < ARRAY_SIZE(dri2_wl_visuals); j++) { + const struct dri2_wl_visual *opaque_visual = &dri2_wl_visuals[j]; + if (opaque_visual->rgba_sizes[3] != 0) { + continue; + } + + int cmp_rgb_shifts = memcmp(transparent_visual->rgba_shifts, + opaque_visual->rgba_shifts, + 3 * sizeof(opaque_visual->rgba_shifts[0])); + int cmp_rgb_sizes = memcmp(transparent_visual->rgba_sizes, + opaque_visual->rgba_sizes, + 3 * sizeof(opaque_visual->rgba_sizes[0])); + + if (cmp_rgb_shifts == 0 && cmp_rgb_sizes == 0) { + found_opaque_equivalent = true; + break; + } + } + + assert(found_opaque_equivalent); + } +#endif + + visual_idx = dri2_wl_visual_idx_from_config(dri2_dpy, config, + dri2_surf->base.PresentOpaque); assert(visual_idx != -1); if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm) { @@ -663,10 +716,9 @@ update_buffers(struct dri2_egl_surface *dri2_surf) dri2_surf->dy = dri2_surf->wl_win->dy; } - if (dri2_surf->wl_win && - (dri2_surf->base.Width != dri2_surf->wl_win->attached_width || - dri2_surf->base.Height != dri2_surf->wl_win->attached_height)) { - dri2_wl_release_buffers(dri2_surf); + if (dri2_surf->resized) { + dri2_wl_release_buffers(dri2_surf); + dri2_surf->resized = false; } if (get_back_bo(dri2_surf) < 0) { @@ -812,6 +864,8 @@ dri2_wl_get_capability(void *loaderPrivate, enum dri_loader_cap cap) switch (cap) { case DRI_LOADER_CAP_FP16: return 1; + case DRI_LOADER_CAP_RGBA_ORDERING: + return 1; default: return 0; } @@ -1422,7 +1476,8 @@ dri2_wl_add_configs_for_visuals(_EGLDisplay *disp) /* No match for config. 
Try if we can blitImage convert to a visual */ c = dri2_wl_visual_idx_from_config(dri2_dpy, - dri2_dpy->driver_configs[i]); + dri2_dpy->driver_configs[i], + false); if (c == -1) continue; @@ -1490,7 +1545,7 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) if (!dri2_dpy->wl_modifiers) goto cleanup; for (int i = 0; i < ARRAY_SIZE(dri2_wl_visuals); i++) { - if (!u_vector_init(&dri2_dpy->wl_modifiers[i], sizeof(uint64_t), 32)) + if (!u_vector_init_pow2(&dri2_dpy->wl_modifiers[i], 4, sizeof(uint64_t))) goto cleanup; } @@ -1621,6 +1676,8 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) disp->Extensions.EXT_swap_buffers_with_damage = EGL_TRUE; + disp->Extensions.EXT_present_opaque = EGL_TRUE; + /* Fill vtbl last to prevent accidentally calling virtual function during * initialization. */ diff --git a/mesa 3D driver/src/egl/drivers/dri2/platform_x11_dri3.c b/mesa 3D driver/src/egl/drivers/dri2/platform_x11_dri3.c index e117105fcb..81fb73a8cd 100644 --- a/mesa 3D driver/src/egl/drivers/dri2/platform_x11_dri3.c +++ b/mesa 3D driver/src/egl/drivers/dri2/platform_x11_dri3.c @@ -175,6 +175,7 @@ dri3_create_surface(_EGLDisplay *disp, EGLint type, _EGLConfig *conf, dri2_dpy->dri_screen, dri2_dpy->is_different_gpu, dri2_dpy->multibuffers_available, + true, dri_config, &dri2_dpy->loader_dri3_ext, &egl_dri3_vtable, diff --git a/mesa 3D driver/src/egl/drivers/wgl/egl_wgl.c b/mesa 3D driver/src/egl/drivers/wgl/egl_wgl.c new file mode 100644 index 0000000000..d0483e1641 --- /dev/null +++ b/mesa 3D driver/src/egl/drivers/wgl/egl_wgl.c @@ -0,0 +1,712 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +#include "egl_wgl.h" + +#include +#include +#include +#include + +#include + +#include + +#include + +static EGLBoolean +wgl_match_config(const _EGLConfig *conf, const _EGLConfig *criteria) +{ + if (_eglCompareConfigs(conf, criteria, NULL, EGL_FALSE) != 0) + return EGL_FALSE; + + if (!_eglMatchConfig(conf, criteria)) + return EGL_FALSE; + + return EGL_TRUE; +} + +static struct wgl_egl_config * +wgl_add_config(_EGLDisplay *disp, const struct stw_pixelformat_info *stw_config, int id, EGLint surface_type) +{ + struct wgl_egl_config *conf; + struct wgl_egl_display *wgl_dpy = wgl_egl_display(disp); + _EGLConfig base; + unsigned int double_buffer; + int wgl_shifts[4] = { -1, -1, -1, -1 }; + unsigned int wgl_sizes[4] = { 0, 0, 0, 0 }; + _EGLConfig *matching_config; + EGLint num_configs = 0; + EGLint config_id; + + _eglInitConfig(&base, disp, id); + + double_buffer = (stw_config->pfd.dwFlags & PFD_DOUBLEBUFFER) != 0; + + if (stw_config->pfd.iPixelType != PFD_TYPE_RGBA) + return NULL; + + wgl_sizes[0] = stw_config->pfd.cRedBits; + wgl_sizes[1] = stw_config->pfd.cGreenBits; + wgl_sizes[2] = stw_config->pfd.cBlueBits; + wgl_sizes[3] = stw_config->pfd.cAlphaBits; + + base.RedSize = stw_config->pfd.cRedBits; + base.GreenSize = stw_config->pfd.cGreenBits; + base.BlueSize = stw_config->pfd.cBlueBits; + base.AlphaSize = stw_config->pfd.cAlphaBits; + base.BufferSize = stw_config->pfd.cColorBits; + + wgl_shifts[0] = stw_config->pfd.cRedShift; + wgl_shifts[1] = stw_config->pfd.cGreenShift; + wgl_shifts[2] = stw_config->pfd.cBlueShift; + wgl_shifts[3] = stw_config->pfd.cAlphaShift; + + if (stw_config->pfd.cAccumBits) { + /* Don't expose visuals with the accumulation buffer. */ + return NULL; + } + + base.MaxPbufferWidth = _EGL_MAX_PBUFFER_WIDTH; + base.MaxPbufferHeight = _EGL_MAX_PBUFFER_HEIGHT; + + base.DepthSize = stw_config->pfd.cDepthBits; + base.StencilSize = stw_config->pfd.cStencilBits; + base.Samples = stw_config->stvis.samples; + base.SampleBuffers = base.Samples > 1; + + base.NativeRenderable = EGL_TRUE; + + if (surface_type & EGL_PBUFFER_BIT) { + base.BindToTextureRGB = stw_config->bindToTextureRGB; + if (base.AlphaSize > 0) + base.BindToTextureRGBA = stw_config->bindToTextureRGBA; + } + + if (double_buffer) { + surface_type &= ~EGL_PIXMAP_BIT; + } + + if (!(stw_config->pfd.dwFlags & PFD_DRAW_TO_WINDOW)) { + surface_type &= ~EGL_WINDOW_BIT; + } + + if (!surface_type) + return NULL; + + base.SurfaceType = surface_type; + base.RenderableType = disp->ClientAPIs; + base.Conformant = disp->ClientAPIs; + + base.MinSwapInterval = 0; + base.MaxSwapInterval = 1; + + if (!_eglValidateConfig(&base, EGL_FALSE)) { + _eglLog(_EGL_DEBUG, "wgl: failed to validate config %d", id); + return NULL; + } + + config_id = base.ConfigID; + base.ConfigID = EGL_DONT_CARE; + base.SurfaceType = EGL_DONT_CARE; + num_configs = _eglFilterArray(disp->Configs, (void **)&matching_config, 1, + (_EGLArrayForEach)wgl_match_config, &base); + + if (num_configs == 1) { + conf = (struct wgl_egl_config *)matching_config; + + if (!conf->stw_config[double_buffer]) + conf->stw_config[double_buffer] = stw_config; + else + /* a similar config type is already added (unlikely) => discard */ + return NULL; + } + else if (num_configs == 0) { + conf = calloc(1, sizeof * conf); + if (conf == NULL) + return NULL; + + conf->stw_config[double_buffer] = stw_config; + + memcpy(&conf->base, &base, sizeof base); + conf->base.SurfaceType = 0; + conf->base.ConfigID = config_id; + + 
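/* No existing config matched above, so link the newly created config into the display's list. */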
_eglLinkConfig(&conf->base); + } + else { + unreachable("duplicates should not be possible"); + return NULL; + } + + conf->base.SurfaceType |= surface_type; + + return conf; +} + +static EGLBoolean +wgl_add_configs(_EGLDisplay *disp, HDC hdc) +{ + unsigned int config_count = 0; + unsigned surface_type = EGL_PBUFFER_BIT | (hdc ? EGL_WINDOW_BIT : 0); + + // This is already a filtered set of what the driver supports, + // and there's no further filtering needed per-visual + for (unsigned i = 1; stw_pixelformat_get_info(i) != NULL; i++) { + + struct wgl_egl_config *wgl_conf = wgl_add_config(disp, stw_pixelformat_get_info(i), + config_count + 1, surface_type); + + if (wgl_conf) { + if (wgl_conf->base.ConfigID == config_count + 1) + config_count++; + } + } + + return (config_count != 0); +} + +static void +wgl_display_destroy(_EGLDisplay *disp) +{ + free(disp); +} + +static EGLBoolean +wgl_initialize_impl(_EGLDisplay *disp, HDC hdc) +{ + struct wgl_egl_display *wgl_dpy; + const char* err; + + wgl_dpy = calloc(1, sizeof * wgl_dpy); + if (!wgl_dpy) + return _eglError(EGL_BAD_ALLOC, "eglInitialize"); + + disp->DriverData = (void *)wgl_dpy; + + if (!stw_init_screen(hdc)) { + err = "wgl: failed to initialize screen"; + goto cleanup; + } + + wgl_dpy->screen = stw_get_device()->screen; + + disp->ClientAPIs = 0; + if (_eglIsApiValid(EGL_OPENGL_API)) + disp->ClientAPIs |= EGL_OPENGL_BIT; + if (_eglIsApiValid(EGL_OPENGL_ES_API)) + disp->ClientAPIs |= EGL_OPENGL_ES_BIT | EGL_OPENGL_ES2_BIT | EGL_OPENGL_ES3_BIT_KHR; + + disp->Extensions.KHR_no_config_context = EGL_TRUE; + disp->Extensions.KHR_surfaceless_context = EGL_TRUE; + disp->Extensions.MESA_query_driver = EGL_TRUE; + + /* Report back to EGL the bitmask of priorities supported */ + disp->Extensions.IMG_context_priority = + wgl_dpy->screen->get_param(wgl_dpy->screen, PIPE_CAP_CONTEXT_PRIORITY_MASK); + + disp->Extensions.EXT_pixel_format_float = EGL_TRUE; + + if (wgl_dpy->screen->is_format_supported(wgl_dpy->screen, + PIPE_FORMAT_B8G8R8A8_SRGB, + PIPE_TEXTURE_2D, 0, 0, + PIPE_BIND_RENDER_TARGET)) + disp->Extensions.KHR_gl_colorspace = EGL_TRUE; + + disp->Extensions.KHR_create_context = EGL_TRUE; + disp->Extensions.KHR_reusable_sync = EGL_TRUE; + +#if 0 + disp->Extensions.KHR_image_base = EGL_TRUE; + disp->Extensions.KHR_gl_renderbuffer_image = EGL_TRUE; + if (wgl_dpy->image->base.version >= 5 && + wgl_dpy->image->createImageFromTexture) { + disp->Extensions.KHR_gl_texture_2D_image = EGL_TRUE; + disp->Extensions.KHR_gl_texture_cubemap_image = EGL_TRUE; + + if (wgl_renderer_query_integer(wgl_dpy, + __wgl_RENDERER_HAS_TEXTURE_3D)) + disp->Extensions.KHR_gl_texture_3D_image = EGL_TRUE; + } +#endif + + if (!wgl_add_configs(disp, hdc)) { + err = "wgl: failed to add configs"; + goto cleanup; + } + + return EGL_TRUE; + +cleanup: + wgl_display_destroy(disp); + return _eglError(EGL_NOT_INITIALIZED, err); +} + +static EGLBoolean +wgl_initialize(_EGLDisplay *disp) +{ + EGLBoolean ret = EGL_FALSE; + struct wgl_egl_display *wgl_dpy = wgl_egl_display(disp); + + /* In the case where the application calls eglMakeCurrent(context1), + * eglTerminate, then eglInitialize again (without a call to eglReleaseThread + * or eglMakeCurrent(NULL) before that), wgl_dpy structure is still + * initialized, as we need it to be able to free context1 correctly. + * + * It would probably be safest to forcibly release the display with + * wgl_display_release, to make sure the display is reinitialized correctly. 
+ * However, the EGL spec states that we need to keep a reference to the + * current context (so we cannot call wgl_make_current(NULL)), and therefore + * we would leak context1 as we would be missing the old display connection + * to free it up correctly. + */ + if (wgl_dpy) { + wgl_dpy->ref_count++; + return EGL_TRUE; + } + + switch (disp->Platform) { + case _EGL_PLATFORM_SURFACELESS: + ret = wgl_initialize_impl(disp, NULL); + break; + case _EGL_PLATFORM_WINDOWS: + ret = wgl_initialize_impl(disp, disp->PlatformDisplay); + break; + default: + unreachable("Callers ensure we cannot get here."); + return EGL_FALSE; + } + + if (!ret) + return EGL_FALSE; + + wgl_dpy = wgl_egl_display(disp); + wgl_dpy->ref_count++; + + return EGL_TRUE; +} + +/** + * Decrement display reference count, and free up display if necessary. + */ +static void +wgl_display_release(_EGLDisplay *disp) +{ + struct wgl_egl_display *wgl_dpy; + + if (!disp) + return; + + wgl_dpy = wgl_egl_display(disp); + + assert(wgl_dpy->ref_count > 0); + wgl_dpy->ref_count--; + + if (wgl_dpy->ref_count > 0) + return; + + _eglCleanupDisplay(disp); + wgl_display_destroy(disp); +} + +/** + * Called via eglTerminate(), drv->Terminate(). + * + * This must be guaranteed to be called exactly once, even if eglTerminate is + * called many times (without a eglInitialize in between). + */ +static EGLBoolean +wgl_terminate(_EGLDisplay *disp) +{ + /* Release all non-current Context/Surfaces. */ + _eglReleaseDisplayResources(disp); + + wgl_display_release(disp); + + return EGL_TRUE; +} + +/** + * Called via eglCreateContext(), drv->CreateContext(). + */ +static _EGLContext * +wgl_create_context(_EGLDisplay *disp, _EGLConfig *conf, + _EGLContext *share_list, const EGLint *attrib_list) +{ + struct wgl_egl_context *wgl_ctx; + struct wgl_egl_display *wgl_dpy = wgl_egl_display(disp); + struct wgl_egl_context *wgl_ctx_shared = wgl_egl_context(share_list); + struct stw_context *shared = + wgl_ctx_shared ? wgl_ctx_shared->ctx : NULL; + struct wgl_egl_config *wgl_config = wgl_egl_config(conf); + const struct stw_pixelformat_info *stw_config; + + wgl_ctx = malloc(sizeof * wgl_ctx); + if (!wgl_ctx) { + _eglError(EGL_BAD_ALLOC, "eglCreateContext"); + return NULL; + } + + if (!_eglInitContext(&wgl_ctx->base, disp, conf, attrib_list)) + goto cleanup; + + /* The EGL_EXT_create_context_robustness spec says: + * + * "Add to the eglCreateContext context creation errors: [...] + * + * * If the reset notification behavior of and the + * newly created context are different then an EGL_BAD_MATCH error is + * generated." + */ + if (share_list && share_list->ResetNotificationStrategy != + wgl_ctx->base.ResetNotificationStrategy) { + _eglError(EGL_BAD_MATCH, "eglCreateContext"); + goto cleanup; + } + + /* The EGL_KHR_create_context_no_error spec says: + * + * "BAD_MATCH is generated if the value of EGL_CONTEXT_OPENGL_NO_ERROR_KHR + * used to create does not match the value of + * EGL_CONTEXT_OPENGL_NO_ERROR_KHR for the context being created." 
+ */ + if (share_list && share_list->NoError != wgl_ctx->base.NoError) { + _eglError(EGL_BAD_MATCH, "eglCreateContext"); + goto cleanup; + } + + unsigned profile_mask = 0; + switch (wgl_ctx->base.ClientAPI) { + case EGL_OPENGL_ES_API: + profile_mask = WGL_CONTEXT_ES_PROFILE_BIT_EXT; + break; + case EGL_OPENGL_API: + if ((wgl_ctx->base.ClientMajorVersion >= 4 + || (wgl_ctx->base.ClientMajorVersion == 3 + && wgl_ctx->base.ClientMinorVersion >= 2)) + && wgl_ctx->base.Profile == EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT_KHR) + profile_mask = WGL_CONTEXT_CORE_PROFILE_BIT_ARB; + else if (wgl_ctx->base.ClientMajorVersion == 3 && + wgl_ctx->base.ClientMinorVersion == 1) + profile_mask = WGL_CONTEXT_CORE_PROFILE_BIT_ARB; + else + profile_mask = WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB; + break; + default: + _eglError(EGL_BAD_PARAMETER, "eglCreateContext"); + free(wgl_ctx); + return NULL; + } + + if (conf != NULL) { + /* The config chosen here isn't necessarily + * used for surfaces later. + * A pixmap surface will use the single config. + * This opportunity depends on disabling the + * doubleBufferMode check in + * src/mesa/main/context.c:check_compatible() + */ + if (wgl_config->stw_config[1]) + stw_config = wgl_config->stw_config[1]; + else + stw_config = wgl_config->stw_config[0]; + } + else + stw_config = NULL; + + unsigned flags = 0; + if (wgl_ctx->base.Flags & EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR) + flags |= WGL_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB; + if (wgl_ctx->base.Flags & EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR) + flags |= WGL_CONTEXT_DEBUG_BIT_ARB; + wgl_ctx->ctx = stw_create_context_attribs(disp->PlatformDisplay, 0, shared, + wgl_ctx->base.ClientMajorVersion, + wgl_ctx->base.ClientMinorVersion, + flags, + profile_mask, + stw_config->iPixelFormat); + + if (!wgl_ctx->ctx) + goto cleanup; + + return &wgl_ctx->base; + +cleanup: + free(wgl_ctx); + return NULL; +} + +/** + * Called via eglDestroyContext(), drv->DestroyContext(). + */ +static EGLBoolean +wgl_destroy_context(_EGLDisplay *disp, _EGLContext *ctx) +{ + struct wgl_egl_context *wgl_ctx = wgl_egl_context(ctx); + struct wgl_egl_display *wgl_dpy = wgl_egl_display(disp); + + if (_eglPutContext(ctx)) { + stw_destroy_context(wgl_ctx->ctx); + free(wgl_ctx); + } + + return EGL_TRUE; +} + +static EGLBoolean +wgl_destroy_surface(_EGLDisplay *disp, _EGLSurface *surf) +{ + struct wgl_egl_surface *wgl_surf = wgl_egl_surface(surf); + + if (!_eglPutSurface(surf)) + return EGL_TRUE; + + struct stw_context *ctx = stw_current_context(); + stw_framebuffer_lock(wgl_surf->fb); + stw_framebuffer_release_locked(wgl_surf->fb, ctx ? ctx->st : NULL); + return EGL_TRUE; +} + +static void +wgl_gl_flush() +{ + static void (*glFlush)(void); + static mtx_t glFlushMutex = _MTX_INITIALIZER_NP; + + mtx_lock(&glFlushMutex); + if (!glFlush) + glFlush = _glapi_get_proc_address("glFlush"); + mtx_unlock(&glFlushMutex); + + /* if glFlush is not available things are horribly broken */ + if (!glFlush) { + _eglLog(_EGL_WARNING, "wgl: failed to find glFlush entry point"); + return; + } + + glFlush(); +} + +/** + * Called via eglMakeCurrent(), drv->MakeCurrent(). 
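+ *
+ * If binding the new context fails, this function tries to rebind the
+ * previous context and surfaces before reporting an error (see the
+ * fallback path below).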
+ */
+static EGLBoolean
+wgl_make_current(_EGLDisplay *disp, _EGLSurface *dsurf,
+                 _EGLSurface *rsurf, _EGLContext *ctx)
+{
+   struct wgl_egl_display *wgl_dpy = wgl_egl_display(disp);
+   struct wgl_egl_context *wgl_ctx = wgl_egl_context(ctx);
+   _EGLDisplay *old_disp = NULL;
+   struct wgl_egl_display *old_wgl_dpy = NULL;
+   _EGLContext *old_ctx;
+   _EGLSurface *old_dsurf, *old_rsurf;
+   _EGLSurface *tmp_dsurf, *tmp_rsurf;
+   struct stw_framebuffer *ddraw, *rdraw;
+   struct stw_context *cctx;
+   EGLint egl_error = EGL_SUCCESS;
+
+   if (!wgl_dpy)
+      return _eglError(EGL_NOT_INITIALIZED, "eglMakeCurrent");
+
+   /* make new bindings, set the EGL error otherwise */
+   if (!_eglBindContext(ctx, dsurf, rsurf, &old_ctx, &old_dsurf, &old_rsurf))
+      return EGL_FALSE;
+
+   if (old_ctx) {
+      struct stw_context *old_cctx = wgl_egl_context(old_ctx)->ctx;
+      old_disp = old_ctx->Resource.Display;
+      old_wgl_dpy = wgl_egl_display(old_disp);
+
+      /* flush before context switch */
+      wgl_gl_flush();
+
+#if 0
+      if (old_dsurf)
+         wgl_surf_update_fence_fd(old_ctx, disp, old_dsurf);
+
+      /* Disable shared buffer mode */
+      if (old_dsurf && _eglSurfaceInSharedBufferMode(old_dsurf) &&
+          old_wgl_dpy->vtbl->set_shared_buffer_mode) {
+         old_wgl_dpy->vtbl->set_shared_buffer_mode(old_disp, old_dsurf, false);
+      }
+#endif
+
+      stw_unbind_context(old_cctx);
+   }
+
+   ddraw = (dsurf) ? wgl_egl_surface(dsurf)->fb : NULL;
+   rdraw = (rsurf) ? wgl_egl_surface(rsurf)->fb : NULL;
+   cctx = (wgl_ctx) ? wgl_ctx->ctx : NULL;
+
+   if (cctx || ddraw || rdraw) {
+      if (!stw_make_current(ddraw, rdraw, cctx)) {
+         _EGLContext *tmp_ctx;
+
+         /* stw_make_current failed. We cannot tell for sure why, but
+          * setting the error to EGL_BAD_MATCH is surely better than leaving it
+          * as EGL_SUCCESS.
+          */
+         egl_error = EGL_BAD_MATCH;
+
+         /* undo the previous _eglBindContext */
+         _eglBindContext(old_ctx, old_dsurf, old_rsurf, &ctx, &tmp_dsurf, &tmp_rsurf);
+         assert(&wgl_ctx->base == ctx &&
+                tmp_dsurf == dsurf &&
+                tmp_rsurf == rsurf);
+
+         _eglPutSurface(dsurf);
+         _eglPutSurface(rsurf);
+         _eglPutContext(ctx);
+
+         _eglPutSurface(old_dsurf);
+         _eglPutSurface(old_rsurf);
+         _eglPutContext(old_ctx);
+
+         ddraw = (old_dsurf) ? wgl_egl_surface(old_dsurf)->fb : NULL;
+         rdraw = (old_rsurf) ? wgl_egl_surface(old_rsurf)->fb : NULL;
+         cctx = (old_ctx) ? wgl_egl_context(old_ctx)->ctx : NULL;
+
+         /* undo the previous stw_unbind_context() */
+         if (stw_make_current(ddraw, rdraw, cctx)) {
+#if 0
+            if (old_dsurf && _eglSurfaceInSharedBufferMode(old_dsurf) &&
+                old_wgl_dpy->vtbl->set_shared_buffer_mode) {
+               old_wgl_dpy->vtbl->set_shared_buffer_mode(old_disp, old_dsurf, true);
+            }
+#endif
+
+            return _eglError(egl_error, "eglMakeCurrent");
+         }
+
+         /* We cannot restore the same state as it was before calling
+          * eglMakeCurrent() and the spec isn't clear about what to do. We
+          * can prevent EGL from calling into the DRI driver with no DRI
+          * context bound.
+          */
+         dsurf = rsurf = NULL;
+         ctx = NULL;
+
+         _eglBindContext(ctx, dsurf, rsurf, &tmp_ctx, &tmp_dsurf, &tmp_rsurf);
+         assert(tmp_ctx == old_ctx && tmp_dsurf == old_dsurf &&
+                tmp_rsurf == old_rsurf);
+
+         _eglLog(_EGL_WARNING, "wgl: failed to rebind the previous context");
+      }
+      else {
+         /* stw_make_current() succeeded, so take a reference on the
+          * wgl_dpy. This prevents wgl_dpy from being reinitialized when an
+          * EGLDisplay is terminated and then initialized again while a
+          * context is still bound. See wgl_initialize() for a more in-depth
+          * explanation.
*/ + wgl_dpy->ref_count++; + } + } + + wgl_destroy_surface(disp, old_dsurf); + wgl_destroy_surface(disp, old_rsurf); + + if (old_ctx) { + wgl_destroy_context(disp, old_ctx); + wgl_display_release(old_disp); + } + + if (egl_error != EGL_SUCCESS) + return _eglError(egl_error, "eglMakeCurrent"); + +#if 0 + if (dsurf && _eglSurfaceHasMutableRenderBuffer(dsurf) && + wgl_dpy->vtbl->set_shared_buffer_mode) { + /* Always update the shared buffer mode. This is obviously needed when + * the active EGL_RENDER_BUFFER is EGL_SINGLE_BUFFER. When + * EGL_RENDER_BUFFER is EGL_BACK_BUFFER, the update protects us in the + * case where external non-EGL API may have changed window's shared + * buffer mode since we last saw it. + */ + bool mode = (dsurf->ActiveRenderBuffer == EGL_SINGLE_BUFFER); + wgl_dpy->vtbl->set_shared_buffer_mode(disp, dsurf, mode); + } +#endif + + return EGL_TRUE; +} + +static _EGLSurface* +wgl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf, + void *native_window, const EGLint *attrib_list) +{ + struct wgl_egl_config *wgl_conf = wgl_egl_config(conf); + + struct wgl_egl_surface *wgl_surf = calloc(1, sizeof(*wgl_surf)); + if (!wgl_surf) + return NULL; + + if (!_eglInitSurface(&wgl_surf->base, disp, EGL_WINDOW_BIT, conf, attrib_list, native_window)) { + free(wgl_surf); + return NULL; + } + + const struct stw_pixelformat_info *stw_conf = wgl_conf->stw_config[1] ? + wgl_conf->stw_config[1] : wgl_conf->stw_config[0]; + wgl_surf->fb = stw_framebuffer_create(native_window, stw_conf->iPixelFormat, STW_FRAMEBUFFER_EGL_WINDOW); + if (!wgl_surf->fb) { + free(wgl_surf); + return NULL; + } + + stw_framebuffer_unlock(wgl_surf->fb); + + return &wgl_surf->base; +} + +static EGLBoolean +wgl_swap_buffers(_EGLDisplay *disp, _EGLSurface *draw) +{ + struct wgl_egl_display *wgl_disp = wgl_egl_display(disp); + struct wgl_egl_surface *wgl_surf = wgl_egl_surface(draw); + + stw_framebuffer_lock(wgl_surf->fb); + HDC hdc = GetDC(wgl_surf->fb->hWnd); + BOOL ret = stw_framebuffer_swap_locked(hdc, wgl_surf->fb); + ReleaseDC(wgl_surf->fb->hWnd, hdc); + + return ret; +} + +struct _egl_driver _eglDriver = { + .Initialize = wgl_initialize, + .Terminate = wgl_terminate, + .CreateContext = wgl_create_context, + .DestroyContext = wgl_destroy_context, + .MakeCurrent = wgl_make_current, + .CreateWindowSurface = wgl_create_window_surface, + .DestroySurface = wgl_destroy_surface, + .GetProcAddress = _glapi_get_proc_address, + .SwapBuffers = wgl_swap_buffers, +}; + diff --git a/mesa 3D driver/src/egl/drivers/wgl/egl_wgl.h b/mesa 3D driver/src/egl/drivers/wgl/egl_wgl.h new file mode 100644 index 0000000000..d7bb68daf6 --- /dev/null +++ b/mesa 3D driver/src/egl/drivers/wgl/egl_wgl.h @@ -0,0 +1,57 @@ +/* + * Copyright © Microsoft Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +struct wgl_egl_display +{ + int ref_count; + struct pipe_screen *screen; +}; + +struct wgl_egl_config +{ + _EGLConfig base; + const struct stw_pixelformat_info *stw_config[2]; +}; + +struct wgl_egl_context +{ + _EGLContext base; + struct stw_context *ctx; +}; + +struct wgl_egl_surface +{ + _EGLSurface base; + struct stw_framebuffer *fb; +}; + +_EGL_DRIVER_STANDARD_TYPECASTS(wgl_egl) diff --git a/mesa 3D driver/src/egl/generate/egl.xml b/mesa 3D driver/src/egl/generate/egl.xml index 77c0546931..443335c362 100644 --- a/mesa 3D driver/src/egl/generate/egl.xml +++ b/mesa 3D driver/src/egl/generate/egl.xml @@ -575,7 +575,7 @@ - + @@ -2388,6 +2397,11 @@ + + + + + @@ -2590,6 +2604,11 @@ + + + + + @@ -3430,6 +3449,7 @@ + diff --git a/mesa 3D driver/src/egl/generate/eglFunctionList.py b/mesa 3D driver/src/egl/generate/eglFunctionList.py index cd6c259826..6a0bd1696a 100644 --- a/mesa 3D driver/src/egl/generate/eglFunctionList.py +++ b/mesa 3D driver/src/egl/generate/eglFunctionList.py @@ -1,3 +1,28 @@ +# (C) Copyright 2016, NVIDIA CORPORATION. +# All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# on the rights to use, copy, modify, merge, publish, distribute, sub +# license, and/or sell copies of the Software, and to permit persons to whom +# the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL +# IBM AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# Authors: +# Kyle Brenneman + """ Contains a list of EGL functions to generate dispatch functions for. diff --git a/mesa 3D driver/src/egl/generate/egl_other.xml b/mesa 3D driver/src/egl/generate/egl_other.xml index 7fe3a9e272..fd1be9394f 100644 --- a/mesa 3D driver/src/egl/generate/egl_other.xml +++ b/mesa 3D driver/src/egl/generate/egl_other.xml @@ -1,5 +1,32 @@ + + This file contains any EGL extension functions that are missing from the normal egl.xml list. 
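The egl_dri2.c hunks earlier in this patch bump __DRI_IMAGE_LOOKUP to
version 2, splitting the old combined lookupEGLImage hook into a
validateEGLImage step and a lookupEGLImageValidated step. A minimal
driver-side sketch of how the two hooks compose, with a version-1
fallback; resolve_egl_image() is an illustrative name, not part of the
patch:

   static __DRIimage *
   resolve_egl_image(__DRIscreen *screen,
                     const __DRIimageLookupExtension *loader,
                     void *image, void *loaderPrivate)
   {
      if (loader->base.version >= 2 && loader->validateEGLImage) {
         /* Validation can happen up front, without creating a __DRIimage. */
         if (!loader->validateEGLImage(image, loaderPrivate))
            return NULL;
         /* The handle is known-valid here, so the lookup can skip checks. */
         return loader->lookupEGLImageValidated(image, loaderPrivate);
      }
      /* A version-1 loader only offers the combined hook. */
      return loader->lookupEGLImage(screen, image, loaderPrivate);
   }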
diff --git a/mesa 3D driver/src/egl/main/egl.def b/mesa 3D driver/src/egl/main/egl.def index 0cfe920e0e..54ae76b330 100644 --- a/mesa 3D driver/src/egl/main/egl.def +++ b/mesa 3D driver/src/egl/main/egl.def @@ -2,14 +2,21 @@ EXPORTS eglBindAPI eglBindTexImage eglChooseConfig + eglClientWaitSync eglCopyBuffers eglCreateContext + eglCreateImage eglCreatePbufferFromClientBuffer eglCreatePbufferSurface eglCreatePixmapSurface + eglCreatePlatformPixmapSurface + eglCreatePlatformWindowSurface + eglCreateSync eglCreateWindowSurface eglDestroyContext + eglDestroyImage eglDestroySurface + eglDestroySync eglGetConfigAttrib eglGetConfigs eglGetCurrentContext @@ -17,7 +24,9 @@ EXPORTS eglGetCurrentSurface eglGetDisplay eglGetError + eglGetPlatformDisplay eglGetProcAddress + eglGetSyncAttrib eglInitialize eglMakeCurrent eglQueryAPI @@ -33,3 +42,6 @@ EXPORTS eglWaitClient eglWaitGL eglWaitNative + eglWaitSync + MesaGLInteropEGLQueryDeviceInfo + MesaGLInteropEGLExportObject diff --git a/mesa 3D driver/src/egl/main/eglapi.c b/mesa 3D driver/src/egl/main/eglapi.c index 17e36af22e..437865df0f 100644 --- a/mesa 3D driver/src/egl/main/eglapi.c +++ b/mesa 3D driver/src/egl/main/eglapi.c @@ -502,6 +502,7 @@ _eglCreateExtensionsString(_EGLDisplay *disp) _EGL_CHECK_EXTENSION(EXT_image_dma_buf_import); _EGL_CHECK_EXTENSION(EXT_image_dma_buf_import_modifiers); _EGL_CHECK_EXTENSION(EXT_protected_surface); + _EGL_CHECK_EXTENSION(EXT_present_opaque); _EGL_CHECK_EXTENSION(EXT_surface_CTA861_3_metadata); _EGL_CHECK_EXTENSION(EXT_surface_SMPTE2086_metadata); _EGL_CHECK_EXTENSION(EXT_swap_buffers_with_damage); diff --git a/mesa 3D driver/src/egl/main/eglcurrent.c b/mesa 3D driver/src/egl/main/eglcurrent.c index 11277d3e4c..3a82a2d0d7 100644 --- a/mesa 3D driver/src/egl/main/eglcurrent.c +++ b/mesa 3D driver/src/egl/main/eglcurrent.c @@ -33,6 +33,7 @@ #include "c99_compat.h" #include "c11/threads.h" #include "util/u_thread.h" +#include "util/u_string.h" #include "egllog.h" #include "eglcurrent.h" @@ -130,8 +131,14 @@ _eglCreateThreadInfo(void) static void _eglDestroyThreadInfo(_EGLThreadInfo *t) { - if (t != &dummy_thread) + if (t != &dummy_thread) { free(t); +#ifdef USE_ELF_TLS + /* Reset the TLS also here, otherwise + * it will be having a dangling pointer */ + _egl_TLS = NULL; +#endif + } } diff --git a/mesa 3D driver/src/egl/main/egldispatchstubs.c b/mesa 3D driver/src/egl/main/egldispatchstubs.c index 96708aeb0d..ee9c8f5f68 100644 --- a/mesa 3D driver/src/egl/main/egldispatchstubs.c +++ b/mesa 3D driver/src/egl/main/egldispatchstubs.c @@ -1,3 +1,30 @@ +/* + * (C) Copyright 2016, NVIDIA CORPORATION. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * IBM AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Kyle Brenneman + */ + #include "egldispatchstubs.h" #include "g_egldispatchstubs.h" diff --git a/mesa 3D driver/src/egl/main/egldispatchstubs.h b/mesa 3D driver/src/egl/main/egldispatchstubs.h index 7861ea5e61..fec1e63bb8 100644 --- a/mesa 3D driver/src/egl/main/egldispatchstubs.h +++ b/mesa 3D driver/src/egl/main/egldispatchstubs.h @@ -1,3 +1,30 @@ +/* + * (C) Copyright 2016, NVIDIA CORPORATION. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * IBM AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Kyle Brenneman + */ + #ifndef EGLDISPATCHSTUBS_H #define EGLDISPATCHSTUBS_H diff --git a/mesa 3D driver/src/egl/main/egldisplay.c b/mesa 3D driver/src/egl/main/egldisplay.c index 765618f0dd..131fc22786 100644 --- a/mesa 3D driver/src/egl/main/egldisplay.c +++ b/mesa 3D driver/src/egl/main/egldisplay.c @@ -35,7 +35,11 @@ #include #include #include +#ifdef _WIN32 +#include +#else #include +#endif #include #include "c11/threads.h" #include "util/macros.h" @@ -60,6 +64,9 @@ #ifdef HAVE_DRM_PLATFORM #include #endif +#ifdef HAVE_WINDOWS_PLATFORM +#include +#endif /** @@ -77,6 +84,7 @@ static const struct { { _EGL_PLATFORM_HAIKU, "haiku" }, { _EGL_PLATFORM_SURFACELESS, "surfaceless" }, { _EGL_PLATFORM_DEVICE, "device" }, + { _EGL_PLATFORM_WINDOWS, "windows" }, }; @@ -123,11 +131,15 @@ _eglNativePlatformDetectNativeDisplay(void *nativeDisplay) if (nativeDisplay == EGL_DEFAULT_DISPLAY) return _EGL_INVALID_PLATFORM; +#ifdef HAVE_WINDOWS_PLATFORM + if (GetObjectType(nativeDisplay) == OBJ_DC) + return _EGL_PLATFORM_WINDOWS; +#endif + +#if defined(HAVE_WAYLAND_PLATFORM) || defined(HAVE_DRM_PLATFORM) if (_eglPointerIsDereferencable(nativeDisplay)) { void *first_pointer = *(void **) nativeDisplay; - (void) first_pointer; /* silence unused var warning */ - #ifdef HAVE_WAYLAND_PLATFORM /* wl_display is a wl_proxy, which is a wl_object. * wl_object's first element points to the interfacetype. 
*/ @@ -141,6 +153,7 @@ _eglNativePlatformDetectNativeDisplay(void *nativeDisplay) return _EGL_PLATFORM_DRM; #endif } +#endif return _EGL_INVALID_PLATFORM; } diff --git a/mesa 3D driver/src/egl/main/egldisplay.h b/mesa 3D driver/src/egl/main/egldisplay.h index 4d2afbc712..0ee06a487c 100644 --- a/mesa 3D driver/src/egl/main/egldisplay.h +++ b/mesa 3D driver/src/egl/main/egldisplay.h @@ -52,6 +52,7 @@ enum _egl_platform_type { _EGL_PLATFORM_HAIKU, _EGL_PLATFORM_SURFACELESS, _EGL_PLATFORM_DEVICE, + _EGL_PLATFORM_WINDOWS, _EGL_NUM_PLATFORMS, _EGL_INVALID_PLATFORM = -1 @@ -108,6 +109,7 @@ struct _egl_extensions EGLBoolean EXT_image_dma_buf_import_modifiers; EGLBoolean EXT_pixel_format_float; EGLBoolean EXT_protected_surface; + EGLBoolean EXT_present_opaque; EGLBoolean EXT_surface_CTA861_3_metadata; EGLBoolean EXT_surface_SMPTE2086_metadata; EGLBoolean EXT_swap_buffers_with_damage; diff --git a/mesa 3D driver/src/egl/main/eglglobals.c b/mesa 3D driver/src/egl/main/eglglobals.c index e0e9044a92..8d815967f9 100644 --- a/mesa 3D driver/src/egl/main/eglglobals.c +++ b/mesa 3D driver/src/egl/main/eglglobals.c @@ -39,6 +39,7 @@ #include "egldisplay.h" #include "util/macros.h" +#include "util/os_misc.h" #ifdef HAVE_MINCORE #include @@ -137,7 +138,8 @@ EGLBoolean _eglPointerIsDereferencable(void *p) { uintptr_t addr = (uintptr_t) p; - const long page_size = getpagesize(); + uint64_t page_size = 0; + os_get_page_size(&page_size); #ifdef HAVE_MINCORE unsigned char valid = 0; diff --git a/mesa 3D driver/src/egl/main/eglglvnd.c b/mesa 3D driver/src/egl/main/eglglvnd.c index 81fdb4508c..1e94678e66 100644 --- a/mesa 3D driver/src/egl/main/eglglvnd.c +++ b/mesa 3D driver/src/egl/main/eglglvnd.c @@ -1,3 +1,30 @@ +/* + * (C) Copyright 2016, NVIDIA CORPORATION. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * IBM AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Kyle Brenneman + */ + #include #include diff --git a/mesa 3D driver/src/egl/main/egllog.c b/mesa 3D driver/src/egl/main/egllog.c index 6a91952577..984dd5b193 100644 --- a/mesa 3D driver/src/egl/main/egllog.c +++ b/mesa 3D driver/src/egl/main/egllog.c @@ -39,7 +39,6 @@ #include #include #include -#include #include "c11/threads.h" #include "util/macros.h" #include "util/u_string.h" diff --git a/mesa 3D driver/src/egl/main/eglsurface.c b/mesa 3D driver/src/egl/main/eglsurface.c index aee5217899..9167b9b7ee 100644 --- a/mesa 3D driver/src/egl/main/eglsurface.c +++ b/mesa 3D driver/src/egl/main/eglsurface.c @@ -216,6 +216,21 @@ _eglParseSurfaceAttribList(_EGLSurface *surf, const EGLint *attrib_list) surf->ActiveRenderBuffer = val; } break; + case EGL_PRESENT_OPAQUE_EXT: + if (!disp->Extensions.EXT_present_opaque) { + err = EGL_BAD_ATTRIBUTE; + break; + } + if (type != EGL_WINDOW_BIT) { + err = EGL_BAD_ATTRIBUTE; + break; + } + if (val != EGL_TRUE && val != EGL_FALSE) { + err = EGL_BAD_PARAMETER; + break; + } + surf->PresentOpaque = val; + break; case EGL_POST_SUB_BUFFER_SUPPORTED_NV: if (!disp->Extensions.NV_post_sub_buffer || type != EGL_WINDOW_BIT) { @@ -392,6 +407,7 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *disp, EGLint type, surf->VGColorspace = EGL_VG_COLORSPACE_sRGB; surf->GLColorspace = EGL_GL_COLORSPACE_LINEAR_KHR; surf->ProtectedContent = EGL_FALSE; + surf->PresentOpaque = EGL_FALSE; surf->MipmapLevel = 0; surf->MultisampleResolve = EGL_MULTISAMPLE_RESOLVE_DEFAULT; @@ -595,6 +611,11 @@ _eglQuerySurface(_EGLDisplay *disp, _EGLSurface *surface, return _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface"); *value = surface->ProtectedContent; break; + case EGL_PRESENT_OPAQUE_EXT: + if (!disp->Extensions.EXT_present_opaque) + return _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface"); + *value = surface->PresentOpaque; + break; default: return _eglError(EGL_BAD_ATTRIBUTE, "eglQuerySurface"); } diff --git a/mesa 3D driver/src/egl/main/eglsurface.h b/mesa 3D driver/src/egl/main/eglsurface.h index 7f419cbf7e..b26768363d 100644 --- a/mesa 3D driver/src/egl/main/eglsurface.h +++ b/mesa 3D driver/src/egl/main/eglsurface.h @@ -171,6 +171,8 @@ struct _egl_surface EGLBoolean ProtectedContent; + EGLBoolean PresentOpaque; + struct _egl_hdr_metadata HdrMetadata; void *NativeSurface; diff --git a/mesa 3D driver/src/egl/meson.build b/mesa 3D driver/src/egl/meson.build index ab8f4e1fdb..65faf60770 100644 --- a/mesa 3D driver/src/egl/meson.build +++ b/mesa 3D driver/src/egl/meson.build @@ -140,6 +140,13 @@ elif with_platform_haiku files_egl += files('drivers/haiku/egl_haiku.cpp') link_for_egl += libgl deps_for_egl += cpp.find_library('be') +elif with_platform_windows + c_args_for_egl += [ + '-DEGLAPI=', '-DPUBLIC=' + ] + files_egl += files('drivers/wgl/egl_wgl.c') + incs_for_egl += [inc_wgl, inc_gallium, inc_gallium_aux] + link_for_egl += libgallium_wgl endif if cc.has_function('mincore') @@ -149,9 +156,11 @@ endif if not with_glvnd egl_lib_name = 'EGL' + get_option('egl-lib-suffix') egl_lib_version = '1.0.0' + egl_lib_soversion = host_machine.system() == 'windows' ? 
'' : '1' else egl_lib_name = 'EGL_@0@'.format(glvnd_vendor_name) egl_lib_version = '0.0.0' + egl_lib_soversion = '0' deps_for_egl += dep_glvnd files_egl += [g_egldispatchstubs_h, g_egldispatchstubs_c] files_egl += files('main/eglglvnd.c', 'main/egldispatchstubs.c') @@ -181,6 +190,9 @@ libegl = shared_library( dependencies : [deps_for_egl, dep_dl, dep_libdrm, dep_clock, dep_thread, idep_mesautil], install : true, version : egl_lib_version, + soversion : egl_lib_soversion, + name_prefix : 'lib', # even on windows + vs_module_defs : 'main/egl.def' ) if not with_glvnd diff --git a/mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-fails.txt b/mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-fails.txt new file mode 100644 index 0000000000..414d40fbc1 --- /dev/null +++ b/mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-fails.txt @@ -0,0 +1,151 @@ +dEQP-GLES2.functional.buffer.write.use.index_array.array,Fail +dEQP-GLES2.functional.buffer.write.use.index_array.element_array,Fail +dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail +dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail +dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_y_pos_z_and_neg_x_neg_y_pos_z_and_pos_x_pos_y_neg_z,Fail +dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail +dEQP-GLES2.functional.depth_stencil_clear.depth_stencil,Fail +dEQP-GLES2.functional.draw.draw_elements.triangle_fan.single_attribute,Fail +dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4_depth_component16,Fail +dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.color_clear.rbo_rgba4,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgb5_a1_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgb565_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_rbo_rgba4_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_tex2d_rgb_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.no_rebind_tex2d_rgba_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgb5_a1_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgb565_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_rbo_rgba4_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_tex2d_rgb_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.recreate_colorbuffer.rebind_tex2d_rgba_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgb5_a1_depth_component16,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgb5_a1_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgb5_a1,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgb565_depth_component16,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgb565_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgb565,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgba4_depth_component16,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgba4_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.resize.rbo_rgba4,Fail +dEQP-GLES2.functional.fbo.render.resize.tex2d_rgb_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.resize.tex2d_rgba_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgb5_a1_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgb565_stencil_index8,Fail 
+dEQP-GLES2.functional.fbo.render.shared_colorbuffer.rbo_rgba4_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.shared_colorbuffer.tex2d_rgb_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.shared_colorbuffer.tex2d_rgba_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.stencil_clear.rbo_rgb5_a1_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.stencil_clear.rbo_rgb565_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.stencil_clear.rbo_rgba4_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.stencil_clear.tex2d_rgb_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.stencil_clear.tex2d_rgba_stencil_index8,Fail +dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgb,Fail +dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail +dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgb,Fail +dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.0,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.1,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.10,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.11,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.12,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.13,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.14,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.15,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.16,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.17,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.18,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.19,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.2,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.20,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.21,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.22,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.23,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.24,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.3,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.4,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.5,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.6,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.7,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.8,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.random.9,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.write_mask.both,Fail +dEQP-GLES2.functional.fragment_ops.depth_stencil.write_mask.stencil,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.2,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.23,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.27,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.35,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.39,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.4,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.41,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.42,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.47,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.5,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.51,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.56,Fail 
+dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.58,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.60,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.70,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.72,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.74,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.75,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.76,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.77,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.78,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.8,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.82,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.87,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.90,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.94,Fail +dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.99,Fail +dEQP-GLES2.functional.fragment_ops.random.0,Fail +dEQP-GLES2.functional.fragment_ops.random.1,Fail +dEQP-GLES2.functional.fragment_ops.random.11,Fail +dEQP-GLES2.functional.fragment_ops.random.14,Fail +dEQP-GLES2.functional.fragment_ops.random.17,Fail +dEQP-GLES2.functional.fragment_ops.random.20,Fail +dEQP-GLES2.functional.fragment_ops.random.24,Fail +dEQP-GLES2.functional.fragment_ops.random.25,Fail +dEQP-GLES2.functional.fragment_ops.random.27,Fail +dEQP-GLES2.functional.fragment_ops.random.3,Fail +dEQP-GLES2.functional.fragment_ops.random.30,Fail +dEQP-GLES2.functional.fragment_ops.random.31,Fail +dEQP-GLES2.functional.fragment_ops.random.34,Fail +dEQP-GLES2.functional.fragment_ops.random.36,Fail +dEQP-GLES2.functional.fragment_ops.random.37,Fail +dEQP-GLES2.functional.fragment_ops.random.4,Fail +dEQP-GLES2.functional.fragment_ops.random.40,Fail +dEQP-GLES2.functional.fragment_ops.random.45,Fail +dEQP-GLES2.functional.fragment_ops.random.49,Fail +dEQP-GLES2.functional.fragment_ops.random.5,Fail +dEQP-GLES2.functional.fragment_ops.random.50,Fail +dEQP-GLES2.functional.fragment_ops.random.52,Fail +dEQP-GLES2.functional.fragment_ops.random.54,Fail +dEQP-GLES2.functional.fragment_ops.random.56,Fail +dEQP-GLES2.functional.fragment_ops.random.60,Fail +dEQP-GLES2.functional.fragment_ops.random.61,Fail +dEQP-GLES2.functional.fragment_ops.random.69,Fail +dEQP-GLES2.functional.fragment_ops.random.7,Fail +dEQP-GLES2.functional.fragment_ops.random.74,Fail +dEQP-GLES2.functional.fragment_ops.random.77,Fail +dEQP-GLES2.functional.fragment_ops.random.78,Fail +dEQP-GLES2.functional.fragment_ops.random.80,Fail +dEQP-GLES2.functional.fragment_ops.random.85,Fail +dEQP-GLES2.functional.fragment_ops.random.86,Fail +dEQP-GLES2.functional.fragment_ops.random.9,Fail +dEQP-GLES2.functional.fragment_ops.random.90,Fail +dEQP-GLES2.functional.fragment_ops.random.96,Fail +dEQP-GLES2.functional.fragment_ops.random.97,Fail +dEQP-GLES2.functional.fragment_ops.random.98,Fail +dEQP-GLES2.functional.rasterization.limits.points,Fail +dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail +dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail +dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail +dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail +dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_linear_clamp,Fail 
+dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_linear_mirror,Fail +dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_linear_linear_repeat,Fail +dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_nearest_linear_clamp,Fail +dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_nearest_linear_mirror,Fail +dEQP-GLES2.functional.texture.vertex.cube.filtering.nearest_mipmap_nearest_linear_repeat,Fail +dEQP-GLES2.functional.uniform_api.random.0,Fail diff --git a/mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-flakes.txt b/mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-flakes.txt new file mode 100644 index 0000000000..cf10982296 --- /dev/null +++ b/mesa 3D driver/src/etnaviv/ci/deqp-etnaviv-gc2000-flakes.txt @@ -0,0 +1,39 @@ +dEQP-GLES2.functional.fbo.render.color_clear.tex2d_rgb +dEQP-GLES2.functional.fbo.render.color_clear.tex2d_rgba_stencil_index8 +dEQP-GLES2.functional.fbo.render.color.blend_npot_tex2d_rgba +dEQP-GLES2.functional.fbo.render.color.blend_npot_tex2d_rgba_depth_component16 +dEQP-GLES2.functional.fbo.render.color.blend_rbo_rgba4 +dEQP-GLES2.functional.fbo.render.color.blend_rbo_rgba4_depth_component16 +dEQP-GLES2.functional.fbo.render.depth.npot_rbo_rgb565_depth_component16 +dEQP-GLES2.functional.rasterization.culling.both_triangle_fan +dEQP-GLES2.functional.rasterization.culling.both_triangle_strip +dEQP-GLES2.functional.rasterization.culling.both_triangle_strip_reverse +dEQP-GLES2.functional.rasterization.culling.both_triangles +dEQP-GLES2.functional.rasterization.culling.both_triangles_reverse +dEQP-GLES2.functional.shaders.random.all_features.fragment.24 +dEQP-GLES2.functional.shaders.random.all_features.fragment.45 +dEQP-GLES2.functional.shaders.random.all_features.fragment.74 +dEQP-GLES2.functional.shaders.random.texture.fragment.26 +dEQP-GLES2.functional.shaders.random.texture.fragment.63 +dEQP-GLES2.functional.shaders.random.texture.fragment.84 +dEQP-GLES2.functional.shaders.random.texture.vertex.39 +dEQP-GLES2.functional.shaders.random.texture.vertex.4 +dEQP-GLES2.functional.texture.filtering.2d.nearest_nearest_clamp_rgba8888_npot +dEQP-GLES2.functional.texture.format.a8_2d_pot +dEQP-GLES2.functional.texture.format.etc1_cube_npot +dEQP-GLES2.functional.texture.format.l8_2d_pot +dEQP-GLES2.functional.texture.format.l8_cube_npot +dEQP-GLES2.functional.texture.format.la88_cube_pot +dEQP-GLES2.functional.texture.format.rgb888_2d_npot +dEQP-GLES2.functional.texture.format.rgba5551_2d_npot +dEQP-GLES2.functional.texture.format.rgba8888_cube_npot +dEQP-GLES2.functional.texture.format.rgba8888_cube_pot +dEQP-GLES2.functional.uniform_api.random.55 +dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.struct_in_array.sampler2D_samplerCube_fragment +dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.array_in_struct.sampler2D_samplerCube_fragment +dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.basic_array.sampler2D_both +dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.basic_struct.sampler2D_samplerCube_fragment +dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.basic.sampler2D_vertex +dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.basic.samplerCube_vertex +dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.struct_in_array.sampler2D_samplerCube_fragment +dEQP-GLES2.functional.uniform_api.value.initial.render.basic.samplerCube_vertex diff --git a/mesa 3D driver/src/etnaviv/ci/gitlab-ci.yml b/mesa 3D 
driver/src/etnaviv/ci/gitlab-ci.yml new file mode 100644 index 0000000000..84a30903c2 --- /dev/null +++ b/mesa 3D driver/src/etnaviv/ci/gitlab-ci.yml @@ -0,0 +1,36 @@ +.etnaviv-armhf-test: + extends: + - .baremetal-test + - .use-debian/arm_test + - .etnaviv-rules + script: + - ./install/bare-metal/fastboot.sh + variables: + BM_CMDLINE: "ip=dhcp console=ttymxc0,115200n8 root=/dev/nfs rw nfsrootdebug init=/init $BM_KERNELARGS" + BM_KERNEL: /baremetal-files/zImage + BM_ROOTFS: /rootfs-armhf + ETNA_MESA_DEBUG: nir + FLAKES_CHANNEL: "#etnaviv-ci" + MINIO_ARTIFACT_NAME: mesa-armhf + needs: + - debian/arm_test + - job: debian-arm64 + artifacts: false + +.etnaviv-armhf-gc2000: + extends: + - .etnaviv-armhf-test + variables: + BM_DTB: /baremetal-files/imx6q-cubox-i.dtb + DEQP_EXPECTED_RENDERER: GC2000 + GPU_VERSION: "etnaviv-gc2000" + tags: + - etnaviv-gc2000 + +gc2000_gles2: + extends: + - .etnaviv-armhf-gc2000 + - .test-manual-mr + variables: + HWCI_TEST_SCRIPT: "/install/deqp-runner.sh" + DEQP_VER: gles2 diff --git a/mesa 3D driver/src/etnaviv/drm/etnaviv_device.c b/mesa 3D driver/src/etnaviv/drm/etnaviv_device.c index 87a3e36faf..3e4370b96c 100644 --- a/mesa 3D driver/src/etnaviv/drm/etnaviv_device.c +++ b/mesa 3D driver/src/etnaviv/drm/etnaviv_device.c @@ -32,12 +32,30 @@ struct etna_device *etna_device_new(int fd) { - struct etna_device *dev = calloc(sizeof(*dev), 1); + struct etna_device *dev; struct drm_etnaviv_param req = { .param = ETNAVIV_PARAM_SOFTPIN_START_ADDR, }; + drmVersionPtr version; int ret; + version = drmGetVersion(fd); + if (!version) { + ERROR_MSG("cannot get version: %s", strerror(errno)); + return NULL; + } + + dev = calloc(sizeof(*dev), 1); + if (!dev) { + goto out; + } + + dev->drm_version = ETNA_DRM_VERSION(version->version_major, + version->version_minor); + +out: + drmFreeVersion(version); + if (!dev) return NULL; @@ -125,3 +143,8 @@ bool etnaviv_device_softpin_capable(struct etna_device *dev) { return !!dev->use_softpin; } + +uint32_t etnaviv_device_version(struct etna_device *dev) +{ + return dev->drm_version; +} diff --git a/mesa 3D driver/src/etnaviv/drm/etnaviv_drmif.h b/mesa 3D driver/src/etnaviv/drm/etnaviv_drmif.h index 00cf651816..f7cc47f991 100644 --- a/mesa 3D driver/src/etnaviv/drm/etnaviv_drmif.h +++ b/mesa 3D driver/src/etnaviv/drm/etnaviv_drmif.h @@ -89,12 +89,15 @@ enum etna_param_id { /* device functions: */ +#define ETNA_DRM_VERSION(major, minor) ((major) << 16 | (minor)) + struct etna_device *etna_device_new(int fd); struct etna_device *etna_device_new_dup(int fd); struct etna_device *etna_device_ref(struct etna_device *dev); void etna_device_del(struct etna_device *dev); int etna_device_fd(struct etna_device *dev); bool etnaviv_device_softpin_capable(struct etna_device *dev); +uint32_t etnaviv_device_version(struct etna_device *dev); /* gpu functions: */ diff --git a/mesa 3D driver/src/etnaviv/drm/etnaviv_priv.h b/mesa 3D driver/src/etnaviv/drm/etnaviv_priv.h index 65a88c7284..0b2c2c820d 100644 --- a/mesa 3D driver/src/etnaviv/drm/etnaviv_priv.h +++ b/mesa 3D driver/src/etnaviv/drm/etnaviv_priv.h @@ -65,6 +65,7 @@ struct etna_bo_cache { struct etna_device { int fd; + uint32_t drm_version; int refcnt; /* tables to keep track of bo's, to avoid "evil-twin" etna_bo objects: diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/afuc_test.asm b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/afuc_test.asm index f4ad047977..141559df3d 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/afuc_test.asm +++ b/mesa 3D 
driver/src/freedreno/.gitlab-ci/reference/afuc_test.asm @@ -1,5 +1,4 @@ ; a6xx microcode -; Disassembling microcode: src/freedreno/.gitlab-ci/reference/afuc_test.fw ; Version: 01000001 [01000001] ; nop diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/crash.log b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/crash.log index e4db088ac0..e26db4d25f 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/crash.log +++ b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/crash.log @@ -1675,7 +1675,7 @@ registers: deadbeef 0xae50: deadbeef deadbeef 0xae51: deadbeef deadbeef 0xae52: deadbeef - 00000000 TPL1_UNKNOWN_B600: 0 + 00000000 TPL1_DBG_ECO_CNTL: 0 00000001 TPL1_ADDR_MODE_CNTL: ADDR_64B 00000004 TPL1_NC_MODE_CNTL: { LOWER_BIT = 2 | UPPER_BIT = 0 } 00000000 TPL1_UNKNOWN_B605: 0 @@ -3435,7 +3435,7 @@ shader-blocks: size: 2048 :0:0000:0000[0600e824x_a018c54ax] no match: 0600e824a018c54a :7:0001:0001[edc6145bx_11fa09c3x] no match: edc6145b11fa09c3 - :2:0002:0002[41440087x_008c504ax] ceil.f hr33.w, (neg)hc18.z ; dontcare bits in ceil.f: 40000008c0000 + :2:0002:0002[41440087x_008c504ax] ceil.f hr33.w, (neg)hc18.z ; dontcare bits in ceil.f: 00040000008c0000 :0:0003:0003[14183488x_d5c04509x] no match: 14183488d5c04509 :5:0004:0004[a52373bdx_8ff7c071x] no match: a52373bd8ff7c071 :1:0005:0005[39301c43x_1d826d16x] no match: 39301c431d826d16 @@ -4114,11 +4114,11 @@ shader-blocks: - type: A6XX_HLSQ_INST_RAM - bank: 0 size: 2048 - :2:0000:0000[40846422x_d81251c5x] (sat)(ul)sign.f r8.z, (neg)hc113.y ; dontcare bits in sign.f: 40000d8120000 + :2:0000:0000[40846422x_d81251c5x] (sat)(ul)sign.f r8.z, (neg)hc113.y ; dontcare bits in sign.f: 00040000d8120000 :4:0001:0001[938a16e2x_520c369ax] no match: 938a16e2520c369a :1:0002:0002[200a00c1x_094864d2x] no match: 200a00c1094864d2 :2:0003:0003[44109084x_4a201507x] no match: 441090844a201507 - :4:0004:0004[882fadabx_14a391b1x] (jp)(sat)(rpt1)(ul)rsq hr42.w, (abs)(r)hc108.y ; dontcare bits in rsq: f800014a30000 + :4:0004:0004[882fadabx_14a391b1x] (jp)(sat)(rpt1)(ul)rsq hr42.w, (abs)(r)hc108.y ; dontcare bits in rsq: 000f800014a30000 :3:0005:0006[6060f068x_7106601ax] no match: 6060f0687106601a ----------------------------------------------- 8192 (0x2000) bytes @@ -4638,12 +4638,12 @@ shader-blocks: size: 2048 :0:0000:0000[00000000x_00003002x] nop :0:0001:0001[00000000x_00000000x] nop - :6:0002:0002[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], -34 ; dontcare bits in atomic.xor: ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 0xdf vs 0x0, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0x1 vs 0x0 - :6:0003:0003[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], -34 ; dontcare bits in atomic.xor: ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 0xdf vs 0x0, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0x1 vs 0x0 - :6:0004:0004[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], -34 ; dontcare bits in atomic.xor: ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 0xdf vs 0x0, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0x1 vs 0x0 - :6:0005:0005[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], -34 ; dontcare bits in atomic.xor: ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 0xdf vs 0x0, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0x1 vs 0x0 - :6:0006:0006[deadbeefx_deadbeefx] 
(sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], -34 ; dontcare bits in atomic.xor: ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 0xdf vs 0x0, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0x1 vs 0x0 - :6:0007:0007[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], -34 ; dontcare bits in atomic.xor: ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 0xdf vs 0x0, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0x1 vs 0x0 + :6:0002:0002[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], 222 ; dontcare bits in atomic.xor: 00000000000000ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 00000000000000df vs 0000000000000000, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0000000000000001 vs 0000000000000000 + :6:0003:0003[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], 222 ; dontcare bits in atomic.xor: 00000000000000ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 00000000000000df vs 0000000000000000, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0000000000000001 vs 0000000000000000 + :6:0004:0004[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], 222 ; dontcare bits in atomic.xor: 00000000000000ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 00000000000000df vs 0000000000000000, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0000000000000001 vs 0000000000000000 + :6:0005:0005[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], 222 ; dontcare bits in atomic.xor: 00000000000000ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 00000000000000df vs 0000000000000000, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0000000000000001 vs 0000000000000000 + :6:0006:0006[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], 222 ; dontcare bits in atomic.xor: 00000000000000ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 00000000000000df vs 0000000000000000, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0000000000000001 vs 0000000000000000 + :6:0007:0007[deadbeefx_deadbeefx] (sy)(jp)atomic.xor.typed.4d.u8.4.l r59.w, l[r45.z], 222 ; dontcare bits in atomic.xor: 00000000000000ee, WARNING: unexpected bits[41:48] in #instruction-cat6-a3xx-atomic: 00000000000000df vs 0000000000000000, WARNING: unexpected bits[53:53] in #instruction-cat6-a3xx-atomic: 0000000000000001 vs 0000000000000000 ----------------------------------------------- 8192 (0x2000) bytes 000000: 00003002 00000000 00000000 00000000 |.0..............| @@ -5382,7 +5382,7 @@ clusters: 00000000 GRAS_CL_Z_CLAMP[0xe].MAX: 0.000000 00000000 GRAS_CL_Z_CLAMP[0xf].MIN: 0.000000 00000000 GRAS_CL_Z_CLAMP[0xf].MAX: 0.000000 - 00000010 GRAS_SU_CNTL: { LINEHALFWIDTH = 0.500000 } + 00000010 GRAS_SU_CNTL: { LINEHALFWIDTH = 0.500000 | LINE_MODE = BRESENHAM } 00000000 GRAS_SU_POINT_MINMAX: { MIN = 0.000000 | MAX = 0.000000 } 00000000 GRAS_SU_POINT_SIZE: 0.000000 00000000 GRAS_SU_DEPTH_PLANE_CNTL: { Z_MODE = A6XX_EARLY_Z } @@ -5471,7 +5471,7 @@ clusters: 00000000 GRAS_SC_WINDOW_SCISSOR_BR: { X = 0 | Y = 0 } 00000000 GRAS_LRZ_CNTL: { 0 } 00000000 GRAS_LRZ_PS_INPUT_CNTL: { FRAGCOORDSAMPLEMODE = FRAGCOORD_CENTER } - 00000000 GRAS_2D_BLIT_INFO: { COLOR_FORMAT = 0 } + 00000000 GRAS_LRZ_MRT_BUF_INFO_0: { COLOR_FORMAT = 0 } 00000000 GRAS_LRZ_BUFFER_BASE: 0 00000000 
GRAS_LRZ_BUFFER_BASE_HI: 0 00000000 GRAS_LRZ_BUFFER_PITCH: { PITCH = 0 | ARRAY_PITCH = 0 } @@ -5627,7 +5627,7 @@ clusters: 00000000 GRAS_CL_Z_CLAMP[0xe].MAX: 0.000000 00000000 GRAS_CL_Z_CLAMP[0xf].MIN: 0.000000 00000000 GRAS_CL_Z_CLAMP[0xf].MAX: 0.000000 - 00000010 GRAS_SU_CNTL: { LINEHALFWIDTH = 0.500000 } + 00000010 GRAS_SU_CNTL: { LINEHALFWIDTH = 0.500000 | LINE_MODE = BRESENHAM } 00000000 GRAS_SU_POINT_MINMAX: { MIN = 0.000000 | MAX = 0.000000 } 00000000 GRAS_SU_POINT_SIZE: 0.000000 00000000 GRAS_SU_DEPTH_PLANE_CNTL: { Z_MODE = A6XX_EARLY_Z } @@ -5716,7 +5716,7 @@ clusters: 00000000 GRAS_SC_WINDOW_SCISSOR_BR: { X = 0 | Y = 0 } 00000000 GRAS_LRZ_CNTL: { 0 } 00000000 GRAS_LRZ_PS_INPUT_CNTL: { FRAGCOORDSAMPLEMODE = FRAGCOORD_CENTER } - 00000000 GRAS_2D_BLIT_INFO: { COLOR_FORMAT = 0 } + 00000000 GRAS_LRZ_MRT_BUF_INFO_0: { COLOR_FORMAT = 0 } 00000000 GRAS_LRZ_BUFFER_BASE: 0 00000000 GRAS_LRZ_BUFFER_BASE_HI: 0 00000000 GRAS_LRZ_BUFFER_PITCH: { PITCH = 0 | ARRAY_PITCH = 0 } diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-GLES2.functional.texture.specification.basic_teximage2d.rgba16f_2d.log b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-GLES2.functional.texture.specification.basic_teximage2d.rgba16f_2d.log index e8a201d2e7..e1163bc36d 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-GLES2.functional.texture.specification.basic_teximage2d.rgba16f_2d.log +++ b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-GLES2.functional.texture.specification.basic_teximage2d.rgba16f_2d.log @@ -1,4 +1,3 @@ -Reading src/freedreno/.gitlab-ci/traces/dEQP-GLES2.functional.texture.specification.basic_teximage2d.rgba16f_2d.rd.gz... gpu_id: 201 cmd: deqp-gles2/185: fence=1250 ############################################################ diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log index cd110261a5..528661eeed 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log +++ b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.log @@ -1,4 +1,3 @@ -Reading src/freedreno/.gitlab-ci/traces/dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list.rd.gz... 
gpu_id: 640 cmd: deqp-vk/74711: fence=247337 ############################################################ @@ -30,8 +29,8 @@ t4 write SP_PERFCTR_ENABLE (ae0f) t4 write TPL1_UNKNOWN_B605 (b605) TPL1_UNKNOWN_B605: 68 000000000105803c: 0000: 40b60501 00000044 -t4 write TPL1_UNKNOWN_B600 (b600) - TPL1_UNKNOWN_B600: 0x100000 +t4 write TPL1_DBG_ECO_CNTL (b600) + TPL1_DBG_ECO_CNTL: 0x100000 0000000001058044: 0000: 40b60001 00100000 t4 write HLSQ_UNKNOWN_BE00 (be00) HLSQ_UNKNOWN_BE00: 0x80 @@ -359,7 +358,7 @@ t7 opcode: CP_BLIT (2c) (2 dwords) !+ 01011000 SP_TP_BORDER_COLOR_BASE_ADDR: 0x1011000 + 00000000 SP_TP_BORDER_COLOR_BASE_ADDR_HI: 0 !+ 000000a2 SP_TP_MODE_CNTL: { ISAMMODE = ISAMMODE_GL | UNK3 = 0x28 } -!+ 00100000 TPL1_UNKNOWN_B600: 0x100000 +!+ 00100000 TPL1_DBG_ECO_CNTL: 0x100000 !+ 00000044 TPL1_UNKNOWN_B605: 68 !+ 000000fc HLSQ_CONTROL_5_REG: { LINELENGTHREGID = r63.x | FOVEATIONQUALITYREGID = r0.x } !+ 000fffff HLSQ_INVALIDATE_CMD: { VS_STATE | HS_STATE | DS_STATE | GS_STATE | FS_STATE | CS_STATE | CS_IBO | GFX_IBO | CS_SHARED_CONST | GFX_SHARED_CONST | CS_BINDLESS = 0x1f | GFX_BINDLESS = 0x1f } @@ -1215,7 +1214,7 @@ t4 write GRAS_SC_SCREEN_SCISSOR[0].TL (80b0) enable_mask: 0x7 0000000001054730: 0000: 40809001 00000814 t4 write GRAS_SU_CNTL (8090) - GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | POLY_OFFSET } + GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | POLY_OFFSET | LINE_MODE = BRESENHAM } 0000000001054730: 0000: 40809001 00000814 group_id: 22 count: 4 @@ -1324,7 +1323,7 @@ t7 opcode: CP_DRAW_INDIRECT_MULTI (2a) (12 dwords) !+ 3f800000 GRAS_CL_VPORT[0].ZSCALE: 1.000000 + 00000000 GRAS_CL_Z_CLAMP[0].MIN: 0.000000 !+ 3f800000 GRAS_CL_Z_CLAMP[0].MAX: 1.000000 -!+ 00000814 GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | POLY_OFFSET } +!+ 00000814 GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | POLY_OFFSET | LINE_MODE = BRESENHAM } !+ ffc00001 GRAS_SU_POINT_MINMAX: { MIN = 0.062500 | MAX = 4092.000000 } !+ 00000010 GRAS_SU_POINT_SIZE: 1.000000 + 00000000 GRAS_SU_DEPTH_PLANE_CNTL: { Z_MODE = A6XX_EARLY_Z } diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/es2gears-a320.log b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/es2gears-a320.log index f40871704f..a8f46dc8b8 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/es2gears-a320.log +++ b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/es2gears-a320.log @@ -1,4 +1,3 @@ -Reading src/freedreno/.gitlab-ci/traces/es2gears-a320.rd.gz... gpu_id: 330 cmd: es2gears/628: fence=276 ############################################################ diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/fd-clouds.log b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/fd-clouds.log index 128166461c..a5d53c3f24 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/fd-clouds.log +++ b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/fd-clouds.log @@ -1,4 +1,3 @@ -Reading src/freedreno/.gitlab-ci/traces/fd-clouds.rd.gz... 
gpu_id: 630 cmd: null_platform_t/2995: fence=1855 ############################################################ @@ -27,8 +26,8 @@ t4 write SP_PERFCTR_ENABLE (ae0f) t4 write TPL1_UNKNOWN_B605 (b605) TPL1_UNKNOWN_B605: 68 0000000001d91034: 0000: 40b60501 00000044 -t4 write TPL1_UNKNOWN_B600 (b600) - TPL1_UNKNOWN_B600: 0x100000 +t4 write TPL1_DBG_ECO_CNTL (b600) + TPL1_DBG_ECO_CNTL: 0x100000 0000000001d9103c: 0000: 40b60001 00100000 t4 write HLSQ_UNKNOWN_BE00 (be00) HLSQ_UNKNOWN_BE00: 0x80 @@ -820,7 +819,7 @@ t4 write GRAS_CL_CNTL (8000) GRAS_VS_CL_CNTL: { CLIP_MASK = 0 | CULL_MASK = 0 } 0000000001123000: 0000: 40800002 00000080 00000000 t4 write GRAS_SU_CNTL (8090) - GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 } + GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | LINE_MODE = BRESENHAM } 000000000112300c: 0000: 40809001 00000014 t4 write GRAS_SU_POINT_MINMAX (8091) GRAS_SU_POINT_MINMAX: { MIN = 1.000000 | MAX = 1.000000 } @@ -931,7 +930,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords) !+ 44340000 GRAS_CL_VPORT[0].YSCALE: 720.000000 !+ 3f000000 GRAS_CL_VPORT[0].ZOFFSET: 0.500000 !+ 3f000000 GRAS_CL_VPORT[0].ZSCALE: 0.500000 -!+ 00000014 GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 } +!+ 00000014 GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | LINE_MODE = BRESENHAM } !+ 00100010 GRAS_SU_POINT_MINMAX: { MIN = 1.000000 | MAX = 1.000000 } !+ 00000010 GRAS_SU_POINT_SIZE: 1.000000 + 00000000 GRAS_SU_DEPTH_PLANE_CNTL: { Z_MODE = A6XX_EARLY_Z } @@ -1133,7 +1132,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords) + 00000000 SP_TP_SAMPLE_CONFIG: { 0 } + 00000000 SP_TP_WINDOW_OFFSET: { X = 0 | Y = 0 } !+ 000000a2 SP_TP_MODE_CNTL: { ISAMMODE = ISAMMODE_GL | UNK3 = 0x28 } -!+ 00100000 TPL1_UNKNOWN_B600: 0x100000 +!+ 00100000 TPL1_DBG_ECO_CNTL: 0x100000 !+ 00000044 TPL1_UNKNOWN_B605: 68 !+ 00000100 HLSQ_VS_CNTL: { CONSTLEN = 0 | ENABLED } + 00000000 HLSQ_HS_CNTL: { CONSTLEN = 0 } @@ -1996,10 +1995,10 @@ t4 write VPC_VS_LAYER_CNTL (9104) VPC_VS_LAYER_CNTL: { LAYERLOC = 255 | VIEWLOC = 255 } 00000000011200d0: 0000: 48910401 0000ffff t4 write GRAS_CNTL (8005) - GRAS_CNTL: { SIZE | COORD_MASK = 0xf } + GRAS_CNTL: { IJ_LINEAR_PIXEL | COORD_MASK = 0xf } 00000000011200d8: 0000: 40800501 000003c8 t4 write RB_RENDER_CONTROL0 (8809) - RB_RENDER_CONTROL0: { SIZE | COORD_MASK = 0xf } + RB_RENDER_CONTROL0: { IJ_LINEAR_PIXEL | COORD_MASK = 0xf } RB_RENDER_CONTROL1: { FRAGCOORDSAMPLEMODE = FRAGCOORD_CENTER } 00000000011200e0: 0000: 48880902 000003c8 00000000 t4 write RB_SAMPLE_CNTL (8810) @@ -5061,7 +5060,7 @@ t4 write GRAS_CL_CNTL (8000) GRAS_VS_CL_CNTL: { CLIP_MASK = 0 | CULL_MASK = 0 } 0000000001123000: 0000: 40800002 00000080 00000000 t4 write GRAS_SU_CNTL (8090) - GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 } + GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | LINE_MODE = BRESENHAM } 000000000112300c: 0000: 40809001 00000014 t4 write GRAS_SU_POINT_MINMAX (8091) GRAS_SU_POINT_MINMAX: { MIN = 1.000000 | MAX = 1.000000 } @@ -5174,7 +5173,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords) :0,1,17,2 + 00000080 GRAS_CL_CNTL: { VP_CLIP_CODE_IGNORE } + 00000000 GRAS_VS_CL_CNTL: { CLIP_MASK = 0 | CULL_MASK = 0 } -!+ 000003c8 GRAS_CNTL: { SIZE | COORD_MASK = 0xf } +!+ 000003c8 GRAS_CNTL: { IJ_LINEAR_PIXEL | COORD_MASK = 0xf } + 00057537 GRAS_CL_GUARDBAND_CLIP_ADJ: { HORZ = 311 | VERT = 349 } + 44870000 GRAS_CL_VPORT[0].XOFFSET: 1080.000000 + 44870000 GRAS_CL_VPORT[0].XSCALE: 1080.000000 @@ -5182,7 +5181,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords) + 44340000 
GRAS_CL_VPORT[0].YSCALE: 720.000000 + 3f000000 GRAS_CL_VPORT[0].ZOFFSET: 0.500000 + 3f000000 GRAS_CL_VPORT[0].ZSCALE: 0.500000 - + 00000014 GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 } + + 00000014 GRAS_SU_CNTL: { FRONT_CW | LINEHALFWIDTH = 0.500000 | LINE_MODE = BRESENHAM } + 00100010 GRAS_SU_POINT_MINMAX: { MIN = 1.000000 | MAX = 1.000000 } + 00000010 GRAS_SU_POINT_SIZE: 1.000000 + 00000000 GRAS_SU_DEPTH_PLANE_CNTL: { Z_MODE = A6XX_EARLY_Z } @@ -5195,7 +5194,7 @@ t7 opcode: CP_DRAW_INDX_OFFSET (38) (4 dwords) + 059f086f GRAS_SC_VIEWPORT_SCISSOR[0].BR: { X = 2159 | Y = 1439 } + 00000000 GRAS_LRZ_PS_INPUT_CNTL: { FRAGCOORDSAMPLEMODE = FRAGCOORD_CENTER } + 00000000 GRAS_SAMPLE_CNTL: { 0 } -!+ 000003c8 RB_RENDER_CONTROL0: { SIZE | COORD_MASK = 0xf } +!+ 000003c8 RB_RENDER_CONTROL0: { IJ_LINEAR_PIXEL | COORD_MASK = 0xf } + 00000000 RB_RENDER_CONTROL1: { FRAGCOORDSAMPLEMODE = FRAGCOORD_CENTER } + 00000000 RB_FS_OUTPUT_CNTL0: { 0 } !+ 00000001 RB_FS_OUTPUT_CNTL1: { MRT = 1 } diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/glxgears-a420.log b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/glxgears-a420.log index 91664ba757..10c32649b6 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/glxgears-a420.log +++ b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/glxgears-a420.log @@ -1,4 +1,3 @@ -Reading src/freedreno/.gitlab-ci/traces/glxgears-a420.rd.gz... gpu_id: 420 cmd: X/23360: fence=1029603 cmd: glxgears/23375: fence=1029604 diff --git a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/shadow.log b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/shadow.log index 47c7f2e2e5..2b619e5bfd 100644 --- a/mesa 3D driver/src/freedreno/.gitlab-ci/reference/shadow.log +++ b/mesa 3D driver/src/freedreno/.gitlab-ci/reference/shadow.log @@ -1,6 +1,4 @@ Analyzing Data... -Reading src/freedreno/.gitlab-ci/traces/shadow.rd.gz... 
-Parsing src/freedreno/.gitlab-ci/traces/shadow.rd.gz Blit: ----- diff --git a/mesa 3D driver/src/freedreno/afuc/disasm.c b/mesa 3D driver/src/freedreno/afuc/disasm.c index d7b322a102..5be0670d90 100644 --- a/mesa 3D driver/src/freedreno/afuc/disasm.c +++ b/mesa 3D driver/src/freedreno/afuc/disasm.c @@ -402,7 +402,7 @@ disasm_instr(uint32_t *instrs, unsigned pc) printf(" << %u", instr->movi.shift); if ((instr->movi.dst == REG_ADDR) && (instr->movi.shift >= 16)) { - uint32_t val = instr->movi.uimm << instr->movi.shift; + uint32_t val = (uint32_t)instr->movi.uimm << (uint32_t)instr->movi.shift; val &= ~0x40000; /* b18 seems to be a flag */ if ((val & 0x00ffffff) == 0) { @@ -439,7 +439,7 @@ disasm_instr(uint32_t *instrs, unsigned pc) } } - print_gpu_reg(instr->movi.uimm << instr->movi.shift); + print_gpu_reg((uint32_t)instr->movi.uimm << (uint32_t)instr->movi.shift); break; } @@ -893,9 +893,10 @@ main(int argc, char **argv) uint32_t gpu_id = 0; size_t sz; int c, ret; + bool unit_test = false; /* Argument parsing: */ - while ((c = getopt(argc, argv, "g:vce")) != -1) { + while ((c = getopt(argc, argv, "g:vceu")) != -1) { switch (c) { case 'g': gpu_id = atoi(optarg); @@ -910,6 +911,9 @@ main(int argc, char **argv) emulator = true; verbose = true; break; + case 'u': + unit_test = true; + break; default: usage(); } @@ -956,7 +960,8 @@ main(int argc, char **argv) buf = (uint32_t *)os_read_file(file, &sz); - printf("; Disassembling microcode: %s\n", file); + if (!unit_test) + printf("; Disassembling microcode: %s\n", file); printf("; Version: %08x\n\n", buf[1]); if (gpuver < 6) { diff --git a/mesa 3D driver/src/freedreno/afuc/meson.build b/mesa 3D driver/src/freedreno/afuc/meson.build index 878d4e74d8..b779b2dd39 100644 --- a/mesa 3D driver/src/freedreno/afuc/meson.build +++ b/mesa 3D driver/src/freedreno/afuc/meson.build @@ -18,6 +18,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
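(Editor's note on the disasm.c hunk above.) The new (uint32_t) casts on instr->movi.uimm and instr->movi.shift guard against integer promotion: a sub-int bitfield is promoted to signed int before <<, so shifting a 16-bit immediate with its top bit set by 16 or more overflows signed int, which is undefined behavior. Below is a standalone C sketch of the hazard, not part of the patch; the 16-bit/5-bit field widths are assumed for illustration only.

/* Sketch of the promotion hazard the (uint32_t) casts in disasm.c avoid. */
#include <stdint.h>
#include <stdio.h>

struct movi_instr {
   uint32_t uimm  : 16;   /* assumed field widths, illustrative only */
   uint32_t shift : 5;
};

int main(void)
{
   struct movi_instr movi = { .uimm = 0x8000, .shift = 16 };
   /* movi.uimm << movi.shift would be evaluated as int << int,
    * overflowing signed int: undefined behavior. */
   uint32_t val = (uint32_t)movi.uimm << (uint32_t)movi.shift;
   printf("0x%08x\n", val);   /* prints 0x80000000 */
   return 0;
}

Casting the left operand to uint32_t first makes the whole shift unsigned and well-defined; the cast on the shift count does not change semantics and is there for consistency.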
+if with_tests + diff = find_program('diff') +endif + afuc_parser = custom_target( 'parser.[ch]', input: 'parser.y', @@ -55,31 +59,60 @@ asm = executable( build_by_default : with_tools.contains('freedreno'), install: install_fd_decode_tools, ) +if with_tests + asm_fw = custom_target('afuc_test.fw', + output: 'afuc_test.fw', + command: [asm, '-g', '6', files('../.gitlab-ci/traces/afuc_test.asm'), '@OUTPUT@'], + ) + test('afuc-asm', + diff, + args: ['-u', files('../.gitlab-ci/reference/afuc_test.fw'), asm_fw], + suite: 'freedreno', + workdir: meson.source_root() + ) +endif -disasm = executable( - 'afuc-disasm', - [ - 'disasm.c', - 'emu.c', - 'emu.h', - 'emu-ds.c', - 'emu-regs.c', - 'emu-ui.c', - 'util.c', - 'util.h', - ], - include_directories: [ - inc_freedreno, - inc_freedreno_rnn, - inc_include, - inc_src, - inc_util, - ], - link_with: [ - libfreedreno_rnn, - ], - dependencies: [ - ], - build_by_default : with_tools.contains('freedreno'), - install: install_fd_decode_tools, -) +# Disasm requires mmaping >4GB +if cc.sizeof('size_t') > 4 + disasm = executable( + 'afuc-disasm', + [ + 'disasm.c', + 'emu.c', + 'emu.h', + 'emu-ds.c', + 'emu-regs.c', + 'emu-ui.c', + 'util.c', + 'util.h', + ], + include_directories: [ + inc_freedreno, + inc_freedreno_rnn, + inc_include, + inc_src, + inc_util, + ], + link_with: [ + libfreedreno_rnn, + ], + dependencies: [ + ], + build_by_default : with_tools.contains('freedreno'), + install: install_fd_decode_tools, + ) + + if with_tests + disasm_fw = custom_target('afuc_test.asm', + output: 'afuc_test.asm', + command: [disasm, '-u', files('../.gitlab-ci/reference/afuc_test.fw'), '-g', '630'], + capture: true + ) + test('afuc-disasm', + diff, + args: ['-u', files('../.gitlab-ci/reference/afuc_test.asm'), disasm_fw], + suite: 'freedreno', + workdir: meson.source_root() + ) + endif +endif diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a307.toml b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a307.toml index 6494583917..00d266e831 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a307.toml +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a307.toml @@ -7,6 +7,8 @@ deqp_args = [ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", "--deqp-gl-config-name=rgba8888d24s8ms0", ] +version_check = "GL ES 3.0.*git" +renderer_check = "FD307" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-fails.txt b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-fails.txt index fa46db2ce2..8821c94ae8 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-fails.txt +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-fails.txt @@ -129,30 +129,14 @@ KHR-GLES2.core.internalformat.copy_tex_image.alpha,Fail KHR-GLES3.core.internalformat.copy_tex_image.alpha,Fail KHR-GLES31.core.internalformat.copy_tex_image.alpha,Fail -# "../src/freedreno/ir3/ir3_ra.c:132: interval_insert: Assertion `(interval->reg->flags & IR3_REG_HALF) == (right->reg->flags & IR3_REG_HALF)' failed." -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing1,Crash -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing2,Crash -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing3,Crash -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing4,Crash -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing5,Crash -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing6,Crash - # "Invalid array size was returned. 
at es31cArrayOfArraysTests.cpp:4779" # msm 900000.mdss: [drm:a5xx_irq] *ERROR* gpu fault ring 0 fence 2c54ef status E40801C1 rb 0162/0162 ib1 000000000104B000/0000 ib2 000000000104C000/0000 KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls1,Fail KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2,Fail -# "gl_NumWorkGroups: Invalid data at index 0" -KHR-GLES31.core.compute_shader.built-in-variables,Fail - -# "Got red: 1, expected 0.00392157, at (1, 0)" -KHR-GLES31.core.compute_shader.resource-image,Fail - # "../src/gallium/drivers/freedreno/a5xx/fd5_emit.c:82: fd5_emit_const_bo: Assertion `dst_off % 4 == 0' failed." -KHR-GLES31.core.draw_indirect.advanced-twoPass-transformFeedback-arrays,Crash -KHR-GLES31.core.draw_indirect.advanced-twoPass-transformFeedback-elements,Crash -KHR-GLES31.core.draw_indirect.basic-drawArrays-vertexIds,Crash -KHR-GLES31.core.draw_indirect.basic-drawElements-vertexIds,Crash +KHR-GLES31.core.draw_indirect.advanced-twoPass-transformFeedback-arrays,Fail +KHR-GLES31.core.draw_indirect.advanced-twoPass-transformFeedback-elements,Fail # "drawTestCompute failed expected: RGBA(4, 3, 2, 1) actual: RGBA(0, 255, 0, 255)" KHR-GLES31.core.layout_binding.sampler2DArray_layout_binding_texture_ComputeShader,Fail diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-flakes.txt b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-flakes.txt index ac194d19c5..976a556ce9 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-flakes.txt +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-flakes.txt @@ -13,6 +13,9 @@ dEQP-GLES3.functional.transform_feedback.* dEQP-GLES3.functional.fragment_ops.interaction.basic_shader.70 +# First appeared 2021-06-11 on an unrelated MR. +dEQP-GLES31.functional.image_load_store.3d.load_store.*_single_layer + # These are in the xfails list (they usually do), but the random # behavior occasionally results in UnexpectedPass results. dEQP-GLES31.functional.separate_shader.random.99 @@ -27,6 +30,23 @@ dEQP-GLES31.functional.separate_shader.interface.same_name_vertex_centroid_fragm dEQP-GLES31.functional.texture.border_clamp.* KHR-GLES31.core.texture_border_clamp.* +# Occasionally passes +KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls1 +KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2 + +# Occasionally passes +KHR-GLES31.core.layout_binding.buffer_layout_binding_atomicAdd_ComputeShader +KHR-GLES31.core.layout_binding.buffer_layout_binding_atomicAdd_FragmentShader + +# " Counter value is 1024 should be 896." +# 1. Create atomic counter buffers and init them with start values. +# 2. Increment (decrement) buffer values in the shader. +# 3. Map buffers with MapBufferRange command. Increment (decrement) buffer values manually. +# 4. Unmap buffers with UnmapBuffer command. +# 5. Again increment (decrement) buffer values in the shader. +# Verify that this scenario works as expected and final values in the buffer objects are correct. 
+KHR-GLES31.core.shader_atomic_counters.advanced-usage-draw-update-draw + KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicCS KHR-GLES31.core.shader_image_load_store.basic-glsl-misc-cs KHR-GLES31.core.shader_storage_buffer_object.advanced-switchBuffers-cs diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-skips.txt b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-skips.txt index ee4cddf7df..5fb16813de 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-skips.txt +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530-skips.txt @@ -17,3 +17,7 @@ dEQP-GLES31.functional.shaders.builtin_functions.precision.tanh.lowp_compute.vec # Takes more than a few minutes, time is spent in batch_draw_tracking(). KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs + +# Sometimes times out in CI. I think it's spending time in the compiler, and so +# then the flake-detect run usually passes once shaders are cached. +KHR-GLES31.core.arrays_of_arrays.SizedDeclarationsPrimitive diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530.toml b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530.toml index d55953bce3..5b40bde35a 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530.toml +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a530.toml @@ -7,6 +7,8 @@ deqp_args = [ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", "--deqp-gl-config-name=rgba8888d24s8ms0", ] +version_check = "GL ES 3.1.*git" +renderer_check = "FD530" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-fails.txt b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-fails.txt index f835921141..b40a18abb0 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-fails.txt +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-fails.txt @@ -14,35 +14,17 @@ KHR-GL33.transform_feedback.query_vertex_separate_test,Fail # "*** Color comparison failed" KHR-GLES3.packed_depth_stencil.verify_read_pixels.depth24_stencil8,Fail -# "MESA: error: ir3_ra() failed!" -KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing5,Fail -KHR-GLES31.core.arrays_of_arrays.InteractionArgumentAliasing6,Fail - # "The values of resultStd[i] & 0xFFFFFFFE and resultFma[i] & 0xFFFFFFFE and resultCPU[i] & 0xFFFFFFFE are not bitwise equal for i = 0..99 " KHR-GLES31.core.gpu_shader5.fma_precision_float,Fail KHR-GLES31.core.gpu_shader5.fma_precision_vec2,Fail KHR-GLES31.core.gpu_shader5.fma_precision_vec3,Fail KHR-GLES31.core.gpu_shader5.fma_precision_vec4,Fail -# "gl_NumWorkGroups: Invalid data at index 2" -KHR-GLES31.core.compute_shader.built-in-variables,Fail - -# "Got red: 1, expected 0.00392157, at (1, 0)" -KHR-GLES31.core.compute_shader.resource-image,Fail - -# "(x,y)= (0,0). Color RGBA(0,0,0,1) is different than expected RGBA(0.1,0.2,0.3,1)" -KHR-GLES31.core.draw_indirect.advanced-twoPass-transformFeedback-arrays,Fail -KHR-GLES31.core.draw_indirect.advanced-twoPass-transformFeedback-elements,Fail - # Lots of errors like "[279] Check failed. Received: [3,0,0,2] instead of: [5,0,0,2]" KHR-GLES31.core.geometry_shader.layered_framebuffer.depth_support,Fail KHR-GLES31.core.geometry_shader.layered_framebuffer.stencil_support,Fail -# " [31] Check failed. 
Received: [3,0,0,2] instead of: [5,0,0,2]" -KHR-GLES31.core.shader_image_load_store.basic-glsl-misc-fs,Fail - # " Pixel data comparison failed; expected: (0.1, 0.2, 0.3, 0.4) rendered: (0, 0, 0, 0) epsilon: 0.00392157 # Pixel data comparison failed at esextcTessellationShaderPoints.cpp:597" KHR-GLES31.core.tessellation_shader.tessellation_shader_point_mode.point_rendering,Fail @@ -53,8 +35,6 @@ KHR-GLES31.core.tessellation_shader.tessellation_shader_tc_barriers.barrier_guar # no debug info in the qpa KHR-GLES31.core.texture_cube_map_array.color_depth_attachments,Fail -# failures in GS,TCS,TES texturing -KHR-GLES31.core.texture_cube_map_array.sampling,Fail # rendering errors in ~4x4 blocks around the bottom side of the diagonal for the quad bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.colorburn,Fail @@ -73,279 +53,10 @@ bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.overlay,Fail bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.screen,Fail bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.softlight,Fail -# "Fail (createInstance returned VK_ERROR_INITIALIZATION_FAILED)" -# happens inside the loader on anholt's debian system, and there are various -# likely-looking fixes in later versions of the loader. -dEQP-VK.api.device_init.create_instance_device_intentional_alloc_fail,Fail - -# optimalTilingFeatures missing: VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT|VK_FORMAT_FEATURE_TRANSFER_SRC_BIT|VK_FORMAT_FEATURE_TRANSFER_DST_BIT|VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT -dEQP-VK.api.info.format_properties.g8_b8_r8_3plane_420_unorm,Fail -dEQP-VK.api.info.format_properties.g8_b8r8_2plane_420_unorm,Fail - -dEQP-VK.api.info.image_format_properties.2d.optimal.g8_b8_r8_3plane_420_unorm,Fail - -# ERROR: VK_FORMAT_G8_B8R8_2PLANE_420_UNORM must support VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT|VK_FORMAT_FEATURE_TRANSFER_SRC_BIT|VK_FORMAT_FEATURE_TRANSFER_DST_BIT|VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT -dEQP-VK.api.info.image_format_properties.2d.optimal.g8_b8r8_2plane_420_unorm,Fail - -# "Mismatch between VkPhysicalDeviceProtectedMemoryProperties at vktApiFeatureInfo.cpp:4208" -dEQP-VK.api.info.get_physical_device_properties2.properties,Fail - -# LEAK 1: REALLOCATION: original=0x0000000000000000, size=400, alignment=8, scope=3, returnedPtr=0x0000aaaaf6b61310 -# ERROR: Found 1 memory leaks! -dEQP-VK.api.object_management.alloc_callback_fail.device,Fail -dEQP-VK.api.object_management.alloc_callback_fail.device_group,Fail - -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite,Fail -dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store,Fail - -# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3019 -# should be fixed by https://gerrit.khronos.org/c/vk-gl-cts/+/7745 -dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail -dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail -dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail -dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail - -# "MESA: error: ir3_ra() failed! -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.spirv_assembly.instruction.compute.opcopymemory.array,Fail - -# "deqp-vk: ../src/freedreno/vulkan/tu_cs.h:186: tu_cs_reserve: Assertion `tu_cs_get_space(cs) >= reserved_size' failed." 
-# https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8841 -dEQP-VK.spirv_assembly.instruction.compute.opphi.wide,Crash - -# " deqp-vk: ../src/freedreno/ir3/ir3_ra.c:526: ra_file_insert: Assertion `interval->physreg_end <= file->size' failed." -# also https://gitlab.freedesktop.org/mesa/mesa/-/issues/5163 -dEQP-VK.spirv_assembly.instruction.compute.spirv_ids_abuse.lots_ids,Crash -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_frag,Crash -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_geom,Crash -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_tessc,Crash -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_tesse,Crash -dEQP-VK.spirv_assembly.instruction.graphics.spirv_ids_abuse.lots_ids_vert,Crash - # Fails when TU_DEBUG=forcebin is set -dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_single_buffer_geom,Fail dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail -# "MESA: error: ir3_ra() failed!" -# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -# Needs spilling, or maybe some scheduling (though throwing a bit of nir_move/sink -# at it didn't help). -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_inner_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_outer_stride,Fail -dEQP-VK.spirv_assembly.instruction.spirv1p4.opcopylogical.nested_arrays_different_strides,Fail - -dEQP-VK.texture.filtering.2d.formats.d24_unorm_s8_uint_stencil.nearest,Fail -dEQP-VK.texture.filtering.2d_array.formats.d24_unorm_s8_uint_stencil.d24_unorm_s8_uint_stencil_nearest,Fail -dEQP-VK.texture.filtering.cube.formats.d24_unorm_s8_uint_stencil.nearest,Fail -dEQP-VK.texture.filtering.unnormal.formats.d24_unorm_s8_uint_stencil.nearest,Fail - -# Broken on all drivers: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4582 -dEQP-VK.wsi.display_control.register_device_event,Fail - -# "MESA: error: ir3_ra() failed!" 
-# https://gitlab.freedesktop.org/mesa/mesa/-/issues/33 -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.2_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_comp_access_store_cols,Fail 
-dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3_store_cols,Fail 
-dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_comp_access_store_cols,Fail 
-dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat2x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x2_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.3_level_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.scalar.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access,Fail 
-dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std140.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat3x4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_comp_access_store_cols,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3,Fail -dEQP-VK.ssbo.layout.basic_unsized_array.std430.row_major_mat4x3_store_cols,Fail -dEQP-VK.ssbo.layout.random.all_shared_buffer.5,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.0,Fail -dEQP-VK.ssbo.layout.random.nested_structs_arrays.17,Fail -dEQP-VK.ssbo.layout.random.scalar.19,Fail - -bypass-dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.7,Fail -bypass-dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.7,Fail -bypass-dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.7,Fail -bypass-dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.7,Fail +# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3052 +# fixed by https://gerrit.khronos.org/c/vk-gl-cts/+/7837 bypass-dEQP-VK.renderpass.suballocation.subpass_dependencies.separate_channels.r8g8b8a8_unorm,Fail diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-flakes.txt b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-flakes.txt index aeb26d9ddc..c46da455bb 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-flakes.txt +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-flakes.txt @@ -91,11 +91,6 @@ dEQP-GLES31.functional.layout_binding.ssbo.fragment_binding_array dEQP-GLES3.functional.fbo.blit.conversion.rg8i_to_r16i dEQP-GLES3.functional.fbo.blit.conversion.rg8_to_r16f -# Started appearing after disabling cpufreq, devfreq and disabling runtime PM -dEQP-GLES3.functional.fbo.blit.depth_stencil.depth32f_stencil8_basic -dEQP-GLES3.functional.fbo.blit.depth_stencil.depth32f_stencil8_scale -dEQP-GLES3.functional.fbo.blit.depth_stencil.depth32f_stencil8_stencil_only - # Could trip hangcheck timeout dEQP-VK.api.command_buffers.record_many_draws_primary_2 dEQP-VK.api.command_buffers.record_many_draws_secondary_2 @@ -114,6 +109,13 @@ KHR-GLES31.core.tessellation_shader.tessellation_shader_tc_barriers.barrier_guar # looks like a cache flushing issue, and it does sometimes pass. bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.* +# Testcase was mostly fixed in 23f7e06cd8d40569f8bfabde9c01d1597573abef, but has +# flaked in CI since then: +# " [775] Check failed. Received: [3,0,0,2] instead of: [5,0,0,2] +# [806] Check failed. Received: [3,0,0,2] instead of: [5,0,0,2] +# ..." 
+KHR-GLES31.core.shader_image_load_store.basic-glsl-misc-fs + # Flakes, all seen since merge of: # https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12258 # Failures seen so far in different flakes: @@ -126,3 +128,4 @@ bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.* # test does a single point draw in a FS taking the length of the first 7 SSBOs and writing them to # ints in SSBO 7, then glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT), then mapping it. KHR-GLES31.core.shader_storage_buffer_object.advanced-unsizedArrayLength-fs-std430-matC-pad +KHR-GLES31.core.shader_storage_buffer_object.advanced-unsizedArrayLength-fs-std430-vec diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-skips.txt b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-skips.txt index b3531a2c6a..542314d041 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-skips.txt +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-skips.txt @@ -5,9 +5,6 @@ # Note normal pre-merge CI also includes -premerge-skips.txt, and that's where # "it's slow but would pass/fail/crash within a couple of minutes" skips should go. -# Crashes likely caused by https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2701 -dEQP-VK.synchronization.cross_instance.*binary_semaphore_fence_fd - # Timeouts in CI even after 5 minutes dEQP-VK.tessellation.invariance.outer_edge_division.quads_equal_spacing dEQP-VK.tessellation.invariance.outer_edge_division.quads_fractional_even_spacing @@ -25,3 +22,8 @@ dEQP-VK.ubo.random.all_shared_buffer.48 # Still running after 3 hours, time is spent in batch_draw_tracking(). KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs + +# causes a hangcheck timeout on a630: +# msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A630: hangcheck detected gpu lockup rb 0! +dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite +spill-dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk-full.toml b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk-full.toml index 35a2280b15..7106cb1494 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk-full.toml +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk-full.toml @@ -3,6 +3,7 @@ deqp = "/deqp/external/vulkancts/modules/vulkan/deqp-vk" caselists = ["/deqp/mustpass/vk-master.txt"] timeout = 300 +renderer_check = "Turnip Adreno .* 630" [deqp.env] # Force binning in the main run, which makes sure we render at # least 2 bins. This is the path that impacts the most different diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk.toml b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk.toml index 72becc3dca..a952787cc0 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk.toml +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630-vk.toml @@ -4,6 +4,7 @@ deqp = "/deqp/external/vulkancts/modules/vulkan/deqp-vk" caselists = ["/deqp/mustpass/vk-master.txt"] skips = ["install/deqp-freedreno-a630-premerge-skips.txt"] fraction = 3 +renderer_check = "Turnip Adreno .* 630" [deqp.env] # Force binning in the main run, which makes sure we render at # least 2 bins. 
This is the path that impacts the most different @@ -20,3 +21,12 @@ prefix = "bypass-" fraction = 15 [deqp.env] TU_DEBUG = "sysmem" + +[[deqp]] +deqp = "/deqp/external/vulkancts/modules/vulkan/deqp-vk" +caselists = ["/deqp/mustpass/vk-master.txt"] +skips = ["install/deqp-freedreno-a630-premerge-skips.txt"] +include = ["dEQP-VK.graphicsfuzz.*"] +prefix = "spill-" +[deqp.env] +IR3_SHADER_DEBUG = "spillall" diff --git a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630.toml b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630.toml index b6f782f0bd..4229229cd4 100644 --- a/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630.toml +++ b/mesa 3D driver/src/freedreno/ci/deqp-freedreno-a630.toml @@ -8,6 +8,8 @@ deqp_args = [ "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", "--deqp-gl-config-name=rgba8888d24s8ms0", ] +version_check = "GL ES 3.2.*git" +renderer_check = "FD630" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" @@ -143,3 +145,18 @@ deqp_args = [ "--deqp-gl-config-name=rgba8888d24s8ms4", ] prefix = "multisample-" + +# spilling testing +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +skips = ["install/deqp-freedreno-a630-premerge-skips.txt"] +include = ["functional.shaders"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +prefix = "spill-" +[deqp.env] +IR3_SHADER_DEBUG = "spillall" diff --git a/mesa 3D driver/src/freedreno/ci/gitlab-ci.yml b/mesa 3D driver/src/freedreno/ci/gitlab-ci.yml index c5a6870f8a..25f52b5818 100644 --- a/mesa 3D driver/src/freedreno/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/freedreno/ci/gitlab-ci.yml @@ -44,14 +44,12 @@ BM_KERNEL: /baremetal-files/Image.gz BM_DTB: /baremetal-files/apq8016-sbc.dtb GPU_VERSION: freedreno-a307 - DEQP_EXPECTED_RENDERER: FD307 a306_gl: extends: - .baremetal-deqp-test - .a306-test variables: - DEQP_VER: gles2 DEQP_SUITE: freedreno-a307 parallel: 5 @@ -66,11 +64,11 @@ a306-traces: .a530-test: extends: - .freedreno-test + - .test-manual-mr variables: BM_KERNEL: /baremetal-files/Image.gz BM_DTB: /baremetal-files/apq8096-db820c.dtb GPU_VERSION: freedreno-a530 - DEQP_EXPECTED_RENDERER: FD530 tags: - google-freedreno-db820c @@ -79,7 +77,6 @@ a530_gl: - .baremetal-deqp-test - .a530-test variables: - DEQP_VER: gles2 DEQP_SUITE: freedreno-a530 parallel: 5 @@ -119,11 +116,10 @@ a530-traces: extends: - .freedreno-test variables: - DEQP_PARALLEL: 10 + FDO_CI_CONCURRENT: 10 BM_KERNEL: /baremetal-files/cheza-kernel BM_CMDLINE: "ip=dhcp console=ttyMSM0,115200n8 root=/dev/nfs rw nfsrootdebug nfsroot=,tcp,nfsvers=4.2 init=/init" GPU_VERSION: freedreno-a630 - DEQP_EXPECTED_RENDERER: FD630 tags: - google-freedreno-cheza script: @@ -135,7 +131,6 @@ a630_gl: - .a630-test parallel: 4 variables: - DEQP_VER: gles2 # for renderer check DEQP_SUITE: freedreno-a630 # Robustness tests may be disruptive to other tests, so we run EGL's robustness @@ -158,15 +153,15 @@ a630_gles_asan: variables: DEQP_VER: gles31 DEQP_FRACTION: 10 + DEQP_EXPECTED_RENDERER: FD630 GPU_VERSION: freedreno-a630-asan a630_vk: extends: - - .baremetal-deqp-test-freedreno-vk - .a630-test + - .baremetal-deqp-test-freedreno-vk parallel: 3 variables: - DEQP_VER: vk DEQP_SUITE: freedreno-a630-vk a630_vk_full: @@ -182,13 +177,14 @@ a630_vk_full: # Clicking play can show you some useful areas for fixing turnip, though. 
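The renderer_check/version_check lines added above are there so a run aborts early if the board silently ended up on the wrong driver (for example a fallback to llvmpipe) rather than producing thousands of misleading results. A rough sketch of the idea (fetch the renderer string, then match it against the configured pattern) using POSIX extended regex; the real check lives in deqp-runner, so the names and flow here are illustrative assumptions only:

#include <regex.h>
#include <stdio.h>

/* Return non-zero iff the reported renderer matches the expected pattern. */
static int
renderer_ok(const char *renderer, const char *pattern)
{
   regex_t re;
   int ok;

   if (regcomp(&re, pattern, REG_EXTENDED | REG_NOSUB) != 0)
      return 0;
   ok = (regexec(&re, renderer, 0, NULL, 0) == 0);
   regfree(&re);
   return ok;
}

int
main(void)
{
   /* Renderer string as a turnip device would report it; the pattern is
    * the one from the .toml above: */
   const char *renderer = "Turnip Adreno (TM) 630";

   if (!renderer_ok(renderer, "Turnip Adreno .* 630")) {
      fprintf(stderr, "wrong renderer: %s\n", renderer);
      return 1;
   }
   printf("renderer check passed: %s\n", renderer);
   return 0;
}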
a630_vk_asan: extends: - - .baremetal-deqp-test-freedreno-vk - .a630-test + - .baremetal-deqp-test-freedreno-vk - .baremetal-arm64-asan-test - .test-manual variables: + DEQP_EXPECTED_RENDERER: "Turnip Adreno (TM) 630" DEQP_FRACTION: 100 - DEQP_PARALLEL: 4 # We get OOMkills if we go too wide with asan enabled + FDO_CI_CONCURRENT: 4 # We get OOMkills if we go too wide with asan enabled TU_DEBUG: forcebin # Disable the leak checks, since the library gets dlclose()d and thus get # totally useless leak reports. We can still catch buffer overflows. @@ -238,5 +234,5 @@ a630-traces-performance: # So we aren't capped by VSync by the X server EGL_PLATFORM: surfaceless GIT_STRATEGY: none - HWCI_FREQ_MAX: 1 + HWCI_FREQ_MAX: "true" allow_failure: true diff --git a/mesa 3D driver/src/freedreno/ci/restricted-traces-freedreno.yml b/mesa 3D driver/src/freedreno/ci/restricted-traces-freedreno.yml index 4808f98003..44ddb8aacf 100644 --- a/mesa 3D driver/src/freedreno/ci/restricted-traces-freedreno.yml +++ b/mesa 3D driver/src/freedreno/ci/restricted-traces-freedreno.yml @@ -14,7 +14,7 @@ traces: - path: golf-with-your-friends/GolfWithYourFriends-trim--f1070-v20201203.trace expectations: - device: freedreno-a630 - checksum: 7b08782ff6a54a3d088729363080c5b6 + checksum: 1531665cf86c7e7502dcd9701def5b17 - path: hollow-knight/HollowKnight-trim--f2020-v20201203.trace expectations: - device: freedreno-a630 @@ -27,7 +27,7 @@ traces: - path: overcooked2/Overcooked2-trim--f3301-v20201203.trace expectations: - device: freedreno-a630 - checksum: 358c723a5f1807fafea2d68bcd32040e + checksum: f5cf383154f328e626baf7c4515e170b # Crashes # - path: plaugue-inc-evolved/PlagueIncEvolved-trim--f1200-v20201203.trace # expectations: diff --git a/mesa 3D driver/src/freedreno/ci/traces-freedreno.yml b/mesa 3D driver/src/freedreno/ci/traces-freedreno.yml index 30114f5d6b..eb1a71273e 100644 --- a/mesa 3D driver/src/freedreno/ci/traces-freedreno.yml +++ b/mesa 3D driver/src/freedreno/ci/traces-freedreno.yml @@ -45,9 +45,9 @@ traces: # checksum: 4b707f385256b380c936186db8c251cb # 1 minute - device: freedreno-a530 - checksum: a71d62bb2c0fabeca41468628777b441 + checksum: 130dbeac42683b46fed4b268c5aad984 - device: freedreno-a630 - checksum: 339dce29ae08569652438116829510c7 + checksum: 139861e52f9425b4adb7c0b90b885f91 - path: xonotic/xonotic-keybench-high.trace expectations: # Skipped since it's long on a530. @@ -90,14 +90,18 @@ traces: # checksum: 14e78caf29b6a3341081c8f2e678355f - device: freedreno-a630 checksum: c8608d54cc6298476a2b60686d152dbc - - path: minetest/minetest.trace - expectations: - - device: freedreno-a306 - checksum: bd6e158327d68e69ecf5edfacc368a7b - - device: freedreno-a530 - checksum: 1b72313340c37a96acd82e3332513a3e - - device: freedreno-a630 - checksum: eea608db257a1caa21517f0b13807952 + # Disabled on all devices due to: + # https://gitlab.freedesktop.org/mesa/mesa/-/issues/4595 + # - path: minetest/minetest.trace + # expectations: + # # Started flaking and sometimes not drawing the left side of the crosshair around 2021-06-25. + # # Note that the crosshair is drawn some time in the middle of the frame. + # # - device: freedreno-a306 + # # checksum: bd6e158327d68e69ecf5edfacc368a7b + # - device: freedreno-a530 + # checksum: 599b00e7a443a90b0edcd06fccd1a400 + # - device: freedreno-a630 + # checksum: a71da1e8c855209d79fa8a0b83a46775 - path: neverball/neverball.trace expectations: # Skipped since it's long on a530. 
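One detail in the a630_vk_asan job above is worth unpacking: leak detection is turned off because the driver shared object gets dlclose()d before process exit, and LeakSanitizer cannot attribute allocations to code that has been unmapped, so every such leak report becomes noise. A tiny hypothetical reproducer of that pattern (the library name is a placeholder, not a real CI artifact):

#include <dlfcn.h>
#include <stdio.h>

int
main(void)
{
   /* Stand-in for the loader opening a driver; any library that
    * allocates from its constructors or caches state will do. */
   void *handle = dlopen("./libexample-driver.so", RTLD_NOW | RTLD_LOCAL);
   if (!handle) {
      fprintf(stderr, "dlopen failed: %s\n", dlerror());
      return 1;
   }

   /* ... exercise the library; it may stash allocations in globals ... */

   /* After this the module is unmapped, so any LeakSanitizer stack
    * trace pointing into it can no longer be symbolized: */
   dlclose(handle);
   return 0;
}

Built with -fsanitize=address and run with ASAN_OPTIONS=detect_leaks=0, the job keeps buffer-overflow detection while skipping the unusable leak reports, which is exactly the trade-off the comment describes.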
@@ -323,9 +327,9 @@ traces: #- device: freedreno-a306 # checksum: 0c57ccc3989b75a940b28ea1cc09cb0d - device: freedreno-a530 - checksum: bc19f0f58935fdb348f401396e6845e1 + checksum: 4715d72a7958f2fd5a387c16b3a01579 - device: freedreno-a630 - checksum: f546f840e916ab0f11f8df0e4eee584d + checksum: 1e397c5c34c9c50350a8db1a060a6bbb - path: glmark2/shading:shading=blinn-phong-inf.trace expectations: - device: freedreno-a306 @@ -411,14 +415,14 @@ traces: # - device: freedreno-a306 # checksum: 751e0e784ba2f003cfc456fe8699f1fa - device: freedreno-a530 - checksum: a8580a2a85f37600c15fb897cd874432 + checksum: ba53d1ffbe911171546a93259fb2e57c - device: freedreno-a630 - checksum: 66d7cfb1aedfe40048fe2cdf8032071c + checksum: 313ef615f0f5a11eeaf95a2a87769a32 # Note: Requires GL3.3 - path: gputest/gimark.trace expectations: - device: freedreno-a630 - checksum: 286cab0d6d6562d5dcc969d778cfa666 + checksum: dd8fb768033d09f6edc98b4cfff02c6f - path: gputest/pixmark-julia-fp32.trace expectations: - device: freedreno-a630 @@ -448,16 +452,16 @@ traces: - path: gputest/plot3d.trace expectations: - device: freedreno-a306 - checksum: f6ecd9b8afc692b0cdb459b9b30db8d4 + checksum: 302943895dbdd7730958fb0175f23b7f - device: freedreno-a530 - checksum: 4faafe5fab0d8ec6d7b549c94f663c92 + checksum: 755aa5b521237ddf9fea3181d2ba2b75 - device: freedreno-a630 - checksum: 0a6a16c394a413f02ec2ebcc3251e366 + checksum: 302aec1ced68e22182460b617b0f2aef # Note: Requires GL4 for tess. - path: gputest/tessmark.trace expectations: - device: freedreno-a630 - checksum: 93d7cb8c775a7b60c499695045edc07f + checksum: af356a98c4d55fb10613a11fbe687adb - path: gputest/triangle.trace expectations: - device: freedreno-a306 @@ -469,9 +473,9 @@ traces: - path: humus/AmbientAperture.trace expectations: - device: freedreno-a306 - checksum: 8d4c52f0af9c09710d358f24c73fae3c + checksum: 3d9243cbd0659cb58b16cade2be3f2c2 - device: freedreno-a530 - checksum: aab5c853e383e1cda56663d65f6925ad + checksum: c55c1ba5683306980956b5f89563f343 - device: freedreno-a630 checksum: 83fd7bce0fc1e1f30bd143b7d30ca890 - path: humus/CelShading.trace @@ -532,7 +536,7 @@ traces: expectations: # a306/a630 would need higher GL version to run - device: freedreno-a630 - checksum: 0e32ca8fc815a7250f38a07faeafb21b + checksum: e93cf9682c9ca5ed6a6effe5b7fdd386 - path: pathfinder/canvas_text_v2.trace expectations: # a306/a630 would need higher GL version to run diff --git a/mesa 3D driver/src/freedreno/common/freedreno_dev_info.h b/mesa 3D driver/src/freedreno/common/freedreno_dev_info.h index d005bcd483..844a28d511 100644 --- a/mesa 3D driver/src/freedreno/common/freedreno_dev_info.h +++ b/mesa 3D driver/src/freedreno/common/freedreno_dev_info.h @@ -69,6 +69,9 @@ struct fd_dev_info { bool tess_use_shared; + /* Does the hw support GL_QCOM_shading_rate? */ + bool has_shading_rate; + /* newer a6xx allows using 16-bit descriptor for both 16-bit * and 32-bit access */ @@ -102,9 +105,27 @@ struct fd_dev_info { bool has_8bpp_ubwc; + /* a650 seems to be affected by a bug where flushing CCU color into + * depth or vice-versa requires a WFI. In particular, clearing a + * depth attachment (which writes to it as a color attachment) then + * using it as a normal depth attachment requires a WFI in addition + * to the expected CCU_FLUSH_COLOR + CCU_INVALIDATE_DEPTH, even + * though all those operations happen in the same stage. As this is + * usually the only scenario where a CCU flush doesn't require a WFI + * we just insert a WFI after every CCU flush. 
+ * + * Tests affected include + * dEQP-VK.renderpass.suballocation.formats.d16_unorm.* in sysmem + * mode (a few tests flake when the entire series is run). + */ + bool has_ccu_flush_bug; + + bool has_lpac; + struct { uint32_t RB_UNKNOWN_8E04_blit; uint32_t PC_POWER_CNTL; + uint32_t TPL1_DBG_ECO_CNTL; } magic; } a6xx; }; diff --git a/mesa 3D driver/src/freedreno/common/freedreno_devices.py b/mesa 3D driver/src/freedreno/common/freedreno_devices.py index e1eca08e59..e13b5c6c2c 100644 --- a/mesa 3D driver/src/freedreno/common/freedreno_devices.py +++ b/mesa 3D driver/src/freedreno/common/freedreno_devices.py @@ -126,6 +126,9 @@ class A6xxGPUInfo(GPUInfo): self.a6xx = Struct() self.a6xx.magic = Struct() + for name, val in template["magic"].items(): + setattr(self.a6xx.magic, name, val) + # Various "magic" register values: self.a6xx.magic.RB_UNKNOWN_8E04_blit = RB_UNKNOWN_8E04_blit self.a6xx.magic.PC_POWER_CNTL = PC_POWER_CNTL @@ -136,6 +139,8 @@ class A6xxGPUInfo(GPUInfo): self.a6xx.has_8bpp_ubwc = True for name, val in template.items(): + if name == "magic": # handled above + continue setattr(self.a6xx, name, val) # a2xx is really two sub-generations, a20x and a22x, but we don't currently @@ -179,7 +184,10 @@ add_gpus([ )) add_gpus([ + GPUId(508), + GPUId(509), GPUId(510), + GPUId(512), GPUId(530), GPUId(540), ], GPUInfo( @@ -201,6 +209,9 @@ a6xx_gen1 = dict( ccu_cntl_gmem_unk2 = True, indirect_draw_wfm_quirk = True, depth_bounds_require_depth_test_quirk = True, + magic = dict( + TPL1_DBG_ECO_CNTL = 0x100000, + ) ) # a640, a680: @@ -211,6 +222,9 @@ a6xx_gen2 = dict( has_z24uint_s8uint = True, indirect_draw_wfm_quirk = True, depth_bounds_require_depth_test_quirk = True, # TODO: check if true + magic = dict( + TPL1_DBG_ECO_CNTL = 0, + ), ) # a650: @@ -223,9 +237,15 @@ a6xx_gen3 = dict( storage_16bit = True, has_tex_filter_cubic = True, has_sample_locations = True, + has_ccu_flush_bug = True, + has_8bpp_ubwc = False, + magic = dict( + # this seems to be a chicken bit that fixes cubic filtering: + TPL1_DBG_ECO_CNTL = 0x1000000, + ), ) -# a635, a650: +# a635, a660: a6xx_gen4 = dict( fibers_per_sp = 128 * 2 * 16, reg_size_vec4 = 64, @@ -237,6 +257,11 @@ a6xx_gen4 = dict( has_sample_locations = True, has_cp_reg_write = False, has_8bpp_ubwc = False, + has_lpac = True, + has_shading_rate = True, + magic = dict( + TPL1_DBG_ECO_CNTL = 0x5008000, + ), ) add_gpus([ diff --git a/mesa 3D driver/src/freedreno/computerator/a4xx.c b/mesa 3D driver/src/freedreno/computerator/a4xx.c new file mode 100644 index 0000000000..0dbb307127 --- /dev/null +++ b/mesa 3D driver/src/freedreno/computerator/a4xx.c @@ -0,0 +1,348 @@ +/* + * Copyright © 2021 Ilia Mirkin + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3/ir3_compiler.h" + +#include "util/u_math.h" +#include "util/u_queue.h" +#include "util/half_float.h" + +#include "adreno_pm4.xml.h" +#include "adreno_common.xml.h" +#include "a4xx.xml.h" + +#include "ir3_asm.h" +#include "main.h" + +struct a4xx_backend { + struct backend base; + + struct ir3_compiler *compiler; + struct fd_device *dev; +}; +define_cast(backend, a4xx_backend); + +/* + * Backend implementation: + */ + +static struct kernel * +a4xx_assemble(struct backend *b, FILE *in) +{ + struct a4xx_backend *a4xx_backend = to_a4xx_backend(b); + struct ir3_kernel *ir3_kernel = ir3_asm_assemble(a4xx_backend->compiler, in); + ir3_kernel->backend = b; + return &ir3_kernel->base; +} + +static void +a4xx_disassemble(struct kernel *kernel, FILE *out) +{ + ir3_asm_disassemble(to_ir3_kernel(kernel), out); +} + +static void +cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel) +{ + struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel); + struct ir3_shader_variant *v = ir3_kernel->v; + const struct ir3_info *i = &v->info; + enum a3xx_threadsize thrsz = i->double_threadsize ? FOUR_QUADS : TWO_QUADS; + + OUT_PKT0(ring, REG_A4XX_UCHE_INVALIDATE0, 2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000012); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1); + OUT_RING(ring, 0x0000001e); + + OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1); + OUT_RING(ring, 0x00000038); + + OUT_PKT0(ring, REG_A4XX_TPL1_TP_FS_TEX_COUNT, 1); + OUT_RING(ring, 0x00000000); + + OUT_WFI(ring); + + OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1); + OUT_RING(ring, 0x00000003); + + OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 1); + OUT_RING(ring, 0x080005f0); + + OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1); + OUT_RING(ring, 0x00000038); + + OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1); + OUT_RING(ring, 0x00860010); + // OUT_RING(ring, 0x00920000); + + OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1); + OUT_RING(ring, 0x000004ff); + // OUT_RING(ring, 0x00000260); + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG1, 1); + OUT_RING(ring, 0x80000000); + + OUT_PKT0(ring, REG_A4XX_SP_CS_CTRL_REG0, 1); + OUT_RING(ring, + A4XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) | + A4XX_SP_CS_CTRL_REG0_SUPERTHREADMODE | + A4XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) | + A4XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1)); + + OUT_PKT0(ring, REG_A4XX_HLSQ_CS_CONTROL_REG, 1); + OUT_RING(ring, A4XX_HLSQ_CS_CONTROL_REG_CONSTOBJECTOFFSET(0) | + A4XX_HLSQ_CS_CONTROL_REG_SHADEROBJOFFSET(0) | + A4XX_HLSQ_CS_CONTROL_REG_ENABLED | + A4XX_HLSQ_CS_CONTROL_REG_INSTRLENGTH(1) | + COND(v->has_ssbo, A4XX_HLSQ_CS_CONTROL_REG_SSBO_ENABLE) | + A4XX_HLSQ_CS_CONTROL_REG_CONSTLENGTH(v->constlen / 4)); + + OUT_PKT0(ring, REG_A4XX_SP_CS_OBJ_START, 1); + OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START */ + + OUT_PKT0(ring, REG_A4XX_SP_CS_LENGTH_REG, 1); + OUT_RING(ring, v->instrlen); + + uint32_t local_invocation_id, work_group_id, num_wg_id; + local_invocation_id = + ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); + work_group_id = ir3_kernel->info.wgid; + num_wg_id = ir3_kernel->info.numwg; + + OUT_PKT0(ring, REG_A4XX_HLSQ_CL_CONTROL_0, 2); + OUT_RING(ring, A4XX_HLSQ_CL_CONTROL_0_WGIDCONSTID(work_group_id) | + 
A4XX_HLSQ_CL_CONTROL_0_UNK12CONSTID(regid(63, 0)) | + A4XX_HLSQ_CL_CONTROL_0_LOCALIDREGID(local_invocation_id)); + OUT_RING(ring, A4XX_HLSQ_CL_CONTROL_1_UNK0CONSTID(regid(63, 0)) | + A4XX_HLSQ_CL_CONTROL_1_UNK12CONSTID(regid(63, 0))); + + OUT_PKT0(ring, REG_A4XX_HLSQ_CL_KERNEL_CONST, 1); + OUT_RING(ring, A4XX_HLSQ_CL_KERNEL_CONST_UNK0CONSTID(regid(63, 0)) | + A4XX_HLSQ_CL_KERNEL_CONST_NUMWGCONSTID(num_wg_id)); + + OUT_PKT0(ring, REG_A4XX_HLSQ_CL_WG_OFFSET, 1); + OUT_RING(ring, A4XX_HLSQ_CL_WG_OFFSET_UNK0CONSTID(regid(63, 0))); + + OUT_PKT3(ring, CP_LOAD_STATE4, 2); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_INDIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SHADER) | + CP_LOAD_STATE4_0_NUM_UNIT(v->instrlen)); + OUT_RELOC(ring, v->bo, 0, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0); +} + +static void +emit_const(struct fd_ringbuffer *ring, struct kernel *kernel, uint32_t constid, uint32_t sizedwords, + const uint32_t *dwords) +{ + uint32_t align_sz; + + debug_assert((constid % 4) == 0); + + /* Overwrite appropriate entries with buffer addresses */ + struct fd_bo **replacements = calloc(sizedwords, sizeof(struct fd_bo *)); + for (int i = 0; i < MAX_BUFS; i++) { + if (kernel->buf_addr_regs[i] != INVALID_REG) { + int idx = kernel->buf_addr_regs[i]; + assert(idx < sizedwords); + + replacements[idx] = kernel->bufs[i]; + } + } + + align_sz = align(sizedwords, 4); + + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + align_sz); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(constid / 4) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SHADER) | + CP_LOAD_STATE4_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4))); + OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); + for (unsigned i = 0; i < sizedwords; i++) { + if (replacements[i]) + OUT_RELOC(ring, replacements[i], 0, 0, 0); + else + OUT_RING(ring, dwords[i]); + } + + /* Zero-pad to multiple of 4 dwords */ + for (uint32_t i = sizedwords; i < align_sz; i++) { + OUT_RING(ring, 0); + } + + free(replacements); +} + +static void +cs_const_emit(struct fd_ringbuffer *ring, struct kernel *kernel, + uint32_t grid[3]) +{ + struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel); + struct ir3_shader_variant *v = ir3_kernel->v; + + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t base = const_state->offsets.immediate; + int size = DIV_ROUND_UP(const_state->immediates_count, 4); + + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, v->constlen) - base; + + /* convert out of vec4: */ + base *= 4; + size *= 4; + + if (size > 0) { + emit_const(ring, kernel, base, size, const_state->immediates); + } +} + +static void +cs_ibo_emit(struct fd_ringbuffer *ring, struct fd_submit *submit, + struct kernel *kernel) +{ + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (4 * kernel->num_bufs)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SSBO) | + CP_LOAD_STATE4_0_NUM_UNIT(kernel->num_bufs)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + for (unsigned i = 0; i < kernel->num_bufs; i++) { + OUT_RELOC(ring, kernel->bufs[i], 0, 0, 0); +#if 1 + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); +#else + OUT_RING(ring, kernel->buf_sizes[i]); + OUT_RING(ring, kernel->buf_sizes[i]); + OUT_RING(ring, 0x00000004); +#endif + } + + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * kernel->num_bufs)); + 
OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(SB4_CS_SSBO) | + CP_LOAD_STATE4_0_NUM_UNIT(kernel->num_bufs)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + for (unsigned i = 0; i < kernel->num_bufs; i++) { + unsigned sz = kernel->buf_sizes[i]; + + /* width is in dwords, overflows into height: */ + sz /= 4; + +#if 1 + OUT_RING(ring, A4XX_SSBO_1_0_WIDTH(sz)); + OUT_RING(ring, A4XX_SSBO_1_1_HEIGHT(sz >> 16)); +#else + OUT_RING(ring, A4XX_SSBO_1_0_WIDTH(sz) | + A4XX_SSBO_1_0_FMT(RB4_R32_UINT) | + A4XX_SSBO_1_0_CPP(4)); + OUT_RING(ring, A4XX_SSBO_1_1_HEIGHT(DIV_ROUND_UP(sz, 1 << 16)) | + A4XX_SSBO_1_1_DEPTH(1)); +#endif + } +} + +static void +a4xx_emit_grid(struct kernel *kernel, uint32_t grid[3], + struct fd_submit *submit) +{ + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + submit, 0, FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE); + + cs_program_emit(ring, kernel); + cs_const_emit(ring, kernel, grid); + cs_ibo_emit(ring, submit, kernel); + + const unsigned *local_size = kernel->local_size; + const unsigned *num_groups = grid; + + unsigned work_dim = 0; + for (int i = 0; i < 3; i++) { + if (!grid[i]) + break; + work_dim++; + } + + OUT_PKT0(ring, REG_A4XX_HLSQ_CL_NDRANGE_0, 7); + OUT_RING(ring, A4XX_HLSQ_CL_NDRANGE_0_KERNELDIM(work_dim) | + A4XX_HLSQ_CL_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) | + A4XX_HLSQ_CL_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | + A4XX_HLSQ_CL_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1)); + OUT_RING(ring, + A4XX_HLSQ_CL_NDRANGE_1_SIZE_X(local_size[0] * num_groups[0])); + OUT_RING(ring, 0); /* HLSQ_CL_NDRANGE_2_GLOBALOFF_X */ + OUT_RING(ring, + A4XX_HLSQ_CL_NDRANGE_3_SIZE_Y(local_size[1] * num_groups[1])); + OUT_RING(ring, 0); /* HLSQ_CL_NDRANGE_4_GLOBALOFF_Y */ + OUT_RING(ring, + A4XX_HLSQ_CL_NDRANGE_5_SIZE_Z(local_size[2] * num_groups[2])); + OUT_RING(ring, 0); /* HLSQ_CL_NDRANGE_6_GLOBALOFF_Z */ + +#if 1 + OUT_PKT3(ring, CP_EXEC_CS, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(grid[0])); + OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(grid[1])); + OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(grid[2])); +#else + OUT_PKT0(ring, REG_A4XX_HLSQ_CL_KERNEL_GROUP_X, 3); + OUT_RING(ring, grid[0]); /* HLSQ_CL_KERNEL_GROUP_X */ + OUT_RING(ring, grid[1]); /* HLSQ_CL_KERNEL_GROUP_Y */ + OUT_RING(ring, grid[2]); /* HLSQ_CL_KERNEL_GROUP_Z */ + + OUT_PKT3(ring, CP_RUN_OPENCL, 1); + OUT_RING(ring, 0); +#endif + + OUT_WFI(ring); + + /* TODO: cache_flush */ +} + +struct backend * +a4xx_init(struct fd_device *dev, const struct fd_dev_id *dev_id) +{ + struct a4xx_backend *a4xx_backend = calloc(1, sizeof(*a4xx_backend)); + + a4xx_backend->base = (struct backend){ + .assemble = a4xx_assemble, + .disassemble = a4xx_disassemble, + .emit_grid = a4xx_emit_grid, + }; + + a4xx_backend->compiler = ir3_compiler_create(dev, dev_id, false); + a4xx_backend->dev = dev; + + return &a4xx_backend->base; +} diff --git a/mesa 3D driver/src/freedreno/computerator/a6xx.c b/mesa 3D driver/src/freedreno/computerator/a6xx.c index 0dc103816f..67104a6db7 100644 --- a/mesa 3D driver/src/freedreno/computerator/a6xx.c +++ b/mesa 3D driver/src/freedreno/computerator/a6xx.c @@ -29,6 +29,8 @@ #include "adreno_common.xml.h" #include "a6xx.xml.h" +#include "common/freedreno_dev_info.h" + #include "ir3_asm.h" #include "main.h" @@ -38,6 +40,8 @@ struct a6xx_backend { struct ir3_compiler *compiler; struct fd_device *dev; + const struct fd_dev_info *info; + unsigned seqno; struct 
fd_bo *control_mem; @@ -109,6 +113,7 @@ static void cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel) { struct ir3_kernel *ir3_kernel = to_ir3_kernel(kernel); + struct a6xx_backend *a6xx_backend = to_a6xx_backend(ir3_kernel->backend); struct ir3_shader_variant *v = ir3_kernel->v; const struct ir3_info *i = &v->info; enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64; @@ -182,6 +187,24 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel) CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) | CP_LOAD_STATE6_0_NUM_UNIT(v->instrlen)); OUT_RELOC(ring, v->bo, 0, 0, 0); + + if (v->pvtmem_size > 0) { + uint32_t per_fiber_size = ALIGN(v->pvtmem_size, 512); + uint32_t per_sp_size = + ALIGN(per_fiber_size * a6xx_backend->info->a6xx.fibers_per_sp, 1 << 12); + uint32_t total_size = per_sp_size * a6xx_backend->info->num_sp_cores; + + struct fd_bo *pvtmem = fd_bo_new(a6xx_backend->dev, total_size, 0, "pvtmem"); + OUT_PKT4(ring, REG_A6XX_SP_CS_PVT_MEM_PARAM, 4); + OUT_RING(ring, A6XX_SP_CS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size)); + OUT_RELOC(ring, pvtmem, 0, 0, 0); + OUT_RING(ring, A6XX_SP_CS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) | + COND(v->pvtmem_per_wave, + A6XX_SP_CS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); + + OUT_PKT4(ring, REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, 1); + OUT_RING(ring, A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET_OFFSET(per_sp_size)); + } } static void @@ -499,6 +522,8 @@ a6xx_init(struct fd_device *dev, const struct fd_dev_id *dev_id) a6xx_backend->compiler = ir3_compiler_create(dev, dev_id, false); a6xx_backend->dev = dev; + a6xx_backend->info = fd_dev_info(dev_id); + a6xx_backend->control_mem = fd_bo_new(dev, 0x1000, 0, "control"); diff --git a/mesa 3D driver/src/freedreno/computerator/examples/pvtmem.asm b/mesa 3D driver/src/freedreno/computerator/examples/pvtmem.asm new file mode 100644 index 0000000000..15b493d8be --- /dev/null +++ b/mesa 3D driver/src/freedreno/computerator/examples/pvtmem.asm @@ -0,0 +1,14 @@ +@localsize 1, 1, 1 +@buf 4 ; g[0] +@pvtmem 4 +mov.u32u32 r1.x, 0x12345678 +mov.u32u32 r0.x, 0 +(rpt5)nop +stp.u32 p[r0.x + 0], r1.x, 1 +ldp.u32 r0.x, p[r0.x + 0], 1 +mov.u32u32 r0.y, 0x00000000 +(sy)(rpt5)nop +stib.b.untyped.1d.u32.1.imm r0.x, r0.y, 0 +end +nop + diff --git a/mesa 3D driver/src/freedreno/computerator/ir3_asm.c b/mesa 3D driver/src/freedreno/computerator/ir3_asm.c index b9c295adff..1b4fdc1016 100644 --- a/mesa 3D driver/src/freedreno/computerator/ir3_asm.c +++ b/mesa 3D driver/src/freedreno/computerator/ir3_asm.c @@ -35,8 +35,6 @@ ir3_asm_assemble(struct ir3_compiler *c, FILE *in) errx(-1, "assembler failed"); struct ir3_shader_variant *v = shader->variants; - v->mergedregs = true; - kernel->v = v; kernel->bin = v->bin; diff --git a/mesa 3D driver/src/freedreno/computerator/main.c b/mesa 3D driver/src/freedreno/computerator/main.c index 7d613fdde0..0468380beb 100644 --- a/mesa 3D driver/src/freedreno/computerator/main.c +++ b/mesa 3D driver/src/freedreno/computerator/main.c @@ -249,6 +249,9 @@ main(int argc, char **argv) struct backend *backend; switch (fd_dev_gen(dev_id)) { + case 4: + backend = a4xx_init(dev, dev_id); + break; case 6: backend = a6xx_init(dev, dev_id); break; diff --git a/mesa 3D driver/src/freedreno/computerator/main.h b/mesa 3D driver/src/freedreno/computerator/main.h index 62bc5a3343..795aad8424 100644 --- a/mesa 3D driver/src/freedreno/computerator/main.h +++ b/mesa 3D driver/src/freedreno/computerator/main.h @@ -80,6 +80,7 @@ struct backend { return (struct _to *)f; \ } +struct backend 
*a4xx_init(struct fd_device *dev, const struct fd_dev_id *dev_id); struct backend *a6xx_init(struct fd_device *dev, const struct fd_dev_id *dev_id); /* for conditionally setting boolean flag(s): */ diff --git a/mesa 3D driver/src/freedreno/computerator/meson.build b/mesa 3D driver/src/freedreno/computerator/meson.build index b320426e3d..9302201444 100644 --- a/mesa 3D driver/src/freedreno/computerator/meson.build +++ b/mesa 3D driver/src/freedreno/computerator/meson.build @@ -19,6 +19,7 @@ # SOFTWARE. computerator_files = [ + 'a4xx.c', 'a6xx.c', 'ir3_asm.c', 'main.c', @@ -42,6 +43,7 @@ computerator = executable( libfreedreno_drm, libfreedreno_ir3, libfreedreno_perfcntrs, + libfreedreno_common, ], dependencies : [ dep_libdrm, diff --git a/mesa 3D driver/src/freedreno/decode/buffers.c b/mesa 3D driver/src/freedreno/decode/buffers.c index 6a73ef276d..068a4f1f4f 100644 --- a/mesa 3D driver/src/freedreno/decode/buffers.c +++ b/mesa 3D driver/src/freedreno/decode/buffers.c @@ -55,7 +55,12 @@ buffer_insert_cmp(const struct rb_node *n1, const struct rb_node *n2) { const struct buffer *buf1 = (const struct buffer *)n1; const struct buffer *buf2 = (const struct buffer *)n2; - return buf1->gpuaddr - buf2->gpuaddr; + /* Note that gpuaddr comparisons can overflow an int: */ + if (buf1->gpuaddr > buf2->gpuaddr) + return 1; + else if (buf1->gpuaddr < buf2->gpuaddr) + return -1; + return 0; } static int @@ -88,8 +93,7 @@ buffer_contains_hostptr(struct buffer *buf, void *hostptr) uint64_t gpuaddr(void *hostptr) { - rb_tree_foreach(struct buffer, buf, &buffers, node) - { + rb_tree_foreach (struct buffer, buf, &buffers, node) { if (buffer_contains_hostptr(buf, hostptr)) return buf->gpuaddr + (hostptr - buf->hostptr); } @@ -165,8 +169,7 @@ has_dumped(uint64_t gpuaddr, unsigned enable_mask) void reset_buffers(void) { - rb_tree_foreach_safe(struct buffer, buf, &buffers, node) - { + rb_tree_foreach_safe (struct buffer, buf, &buffers, node) { rb_tree_remove(&buffers, &buf->node); free(buf->hostptr); free(buf); diff --git a/mesa 3D driver/src/freedreno/decode/cffdec.c b/mesa 3D driver/src/freedreno/decode/cffdec.c index b112be7a67..0c4dc1e905 100644 --- a/mesa 3D driver/src/freedreno/decode/cffdec.c +++ b/mesa 3D driver/src/freedreno/decode/cffdec.c @@ -2616,7 +2616,10 @@ cp_set_ctxswitch_ib(uint32_t *dwords, uint32_t sizedwords, int level) addr = dwords[0] | ((uint64_t)dwords[1] << 32); - printf("addr=%" PRIx64 "\n", addr); + if (!quiet(3)) { + printf("%saddr=%" PRIx64 "\n", levels[level], addr); + } + ptr = hostptr(addr); if (ptr) { dump_commands(ptr, size, level + 1); diff --git a/mesa 3D driver/src/freedreno/decode/cffdec.h b/mesa 3D driver/src/freedreno/decode/cffdec.h index 1d0a05d8bd..a21cf9f069 100644 --- a/mesa 3D driver/src/freedreno/decode/cffdec.h +++ b/mesa 3D driver/src/freedreno/decode/cffdec.h @@ -64,6 +64,11 @@ struct cffdec_options { */ int once; + /* In unit_test mode, suppress pathnames in output so that we can have references + * independent of the build dir. + */ + int unit_test; + /* for crashdec, where we know CP_IBx_REM_SIZE, we can use this * to highlight the cmdstream not parsed yet, to make it easier * to see how far along the CP is. 
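The buffer_insert_cmp change above is the classic 64-bit comparator pitfall: the subtraction happens in uint64_t but is squeezed through an int return value, so addresses that differ only above bit 31 can compare as equal (or with an inverted sign) and misorder the rb-tree. A self-contained sketch, using a hypothetical pared-down struct rather than the decoder's real types:

#include <stdint.h>
#include <stdio.h>

struct buf {
   uint64_t gpuaddr;
};

/* Broken: the 64-bit difference is truncated to int, losing high bits. */
static int
cmp_broken(const struct buf *a, const struct buf *b)
{
   return a->gpuaddr - b->gpuaddr;
}

/* Fixed, as in the patch: an explicit three-way comparison. */
static int
cmp_fixed(const struct buf *a, const struct buf *b)
{
   if (a->gpuaddr > b->gpuaddr)
      return 1;
   else if (a->gpuaddr < b->gpuaddr)
      return -1;
   return 0;
}

int
main(void)
{
   struct buf a = { .gpuaddr = 0x200000000ull }; /* 8 GiB */
   struct buf b = { .gpuaddr = 0x100000000ull }; /* 4 GiB */

   /* a - b == 0x100000000, which truncates to 0 in a 32-bit int,
    * so the broken comparator calls two distinct buffers equal: */
   printf("broken: %d (expected > 0)\n", cmp_broken(&a, &b));
   printf("fixed:  %d\n", cmp_fixed(&a, &b));
   return 0;
}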
diff --git a/mesa 3D driver/src/freedreno/decode/cffdump.c b/mesa 3D driver/src/freedreno/decode/cffdump.c index 098ba4b790..0ed1989798 100644 --- a/mesa 3D driver/src/freedreno/decode/cffdump.c +++ b/mesa 3D driver/src/freedreno/decode/cffdump.c @@ -106,6 +106,7 @@ print_usage(const char *name) "\t which can be useful when looking at state that does\n" "\t not change per tile\n" "\t--not-once - decode cmdstream for each IB (default)\n" + "\t--unit-test - make reproducible output for unit testing\n" "\t-h, --help - show this message\n" , name); /* clang-format on */ @@ -128,6 +129,7 @@ static const struct option opts[] = { { "query-compare", no_argument, &options.query_compare, 1 }, { "once", no_argument, &options.once, 1 }, { "not-once", no_argument, &options.once, 0 }, + { "unit-test", no_argument, &options.unit_test, 1 }, /* Long opts with short alias: */ { "verbose", no_argument, 0, 'v' }, @@ -263,7 +265,8 @@ handle_file(const char *filename, int start, int end, int draw) cffdec_init(&options); - printf("Reading %s...\n", filename); + if (!options.unit_test) + printf("Reading %s...\n", filename); script_start_cmdstream(filename); diff --git a/mesa 3D driver/src/freedreno/decode/crashdec.c b/mesa 3D driver/src/freedreno/decode/crashdec.c index 14313ca270..5e54f7fa06 100644 --- a/mesa 3D driver/src/freedreno/decode/crashdec.c +++ b/mesa 3D driver/src/freedreno/decode/crashdec.c @@ -345,6 +345,11 @@ dump_cmdstream(void) */ unsigned ringszdw = ringbuffers[id].size >> 2; /* in dwords */ + if (verbose) { + dump_commands(ringbuffers[id].buf, ringszdw, 0); + return; + } + /* helper macro to deal with modulo size math: */ #define mod_add(b, v) ((ringszdw + (int)(b) + (int)(v)) % ringszdw) diff --git a/mesa 3D driver/src/freedreno/decode/meson.build b/mesa 3D driver/src/freedreno/decode/meson.build index b1feea2ada..7f7a0801c4 100644 --- a/mesa 3D driver/src/freedreno/decode/meson.build +++ b/mesa 3D driver/src/freedreno/decode/meson.build @@ -18,6 +18,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+if with_tests + diff = find_program('diff') +endif + # Shared cmdstream decoding: libfreedreno_cffdec = static_library( 'freedreno_cffdec', @@ -93,6 +97,37 @@ if dep_lua.found() and dep_libarchive.found() build_by_default: with_tools.contains('freedreno'), install: install_fd_decode_tools, ) + + if with_tests + # dump only a single frame, and single tile pass, to keep the + # reference output size manageable + cffdump_tests = [ + ['fd-clouds', ['--frame', '0', '--once']], + ['es2gears-a320', ['--frame', '0', '--once']], + ['glxgears-a420', ['--frame', '1', '--once']], + ['dEQP-GLES2.functional.texture.specification.basic_teximage2d.rgba16f_2d', ['--once']], + ['dEQP-VK.draw.indirect_draw.indexed.indirect_draw_count.triangle_list', ['--frame', '0', '--once']], + # Test a lua script to ensure we don't break scripting API + ['shadow', ['--script', files(join_paths(meson.current_source_dir(), 'scripts', 'parse-submits.lua'))]], + ] + foreach cffdump_test: cffdump_tests + name = cffdump_test[0] + args = cffdump_test[1] + + log = custom_target(name + '.log', + output: name + '.log', + command: [cffdump, '--unit-test', args, files('../.gitlab-ci/traces/' + name + '.rd.gz')], + capture: true, + ) + test('cffdump-' + name, + diff, + args: ['-u', files('../.gitlab-ci/reference/' + name + '.log'), log], + suite: 'freedreno', + workdir: meson.source_root() + ) + + endforeach + endif endif crashdec = executable( @@ -113,6 +148,20 @@ crashdec = executable( install: install_fd_decode_tools, ) +if with_tests + crashdec_output = custom_target('crashdec.txt', + output: 'crashdec.txt', + command: [crashdec, '-sf', files('../.gitlab-ci/traces/crash.devcore')], + capture: true + ) + test('crashdec', + diff, + args: ['-u', files('../.gitlab-ci/reference/crash.log'), crashdec_output], + suite: 'freedreno', + workdir: meson.source_root() + ) +endif + if dep_libarchive.found() pgmdump = executable( 'pgmdump', diff --git a/mesa 3D driver/src/freedreno/decode/pgmdump2.c b/mesa 3D driver/src/freedreno/decode/pgmdump2.c index 227fd22c13..94d6c3401e 100644 --- a/mesa 3D driver/src/freedreno/decode/pgmdump2.c +++ b/mesa 3D driver/src/freedreno/decode/pgmdump2.c @@ -101,7 +101,7 @@ struct state { #define F(s, field) \ do { \ OFF(s->field); \ - printf("%s%12s:\t%f (0x%0x)\n", tab(state->lvl), #field, d2f(s->field), \ + printf("%s%12s:\t%f (0x%0x)\n", tab(state->lvl), #field, uif(s->field), \ s->field); \ } while (0) @@ -161,7 +161,7 @@ dump_unknown(struct state *state, void *buf, unsigned start, unsigned n) uint8_t c = *(ascii++); printf("%c", (isascii(c) && !iscntrl(c)) ? c : '.'); } - printf("|\t%f", d2f(d)); + printf("|\t%f", uif(d)); /* TODO maybe scan for first non-null and non-ascii char starting from * end of shader binary to (roughly) establish the start of the string diff --git a/mesa 3D driver/src/freedreno/decode/redump.h b/mesa 3D driver/src/freedreno/decode/redump.h index c7121e8650..5c4b8f0a41 100644 --- a/mesa 3D driver/src/freedreno/decode/redump.h +++ b/mesa 3D driver/src/freedreno/decode/redump.h @@ -24,6 +24,8 @@ #ifndef REDUMP_H_ #define REDUMP_H_ +#include "util/u_math.h" + enum rd_sect_type { RD_NONE, RD_TEST, /* ascii text */ @@ -79,12 +81,6 @@ void rd_write_section(enum rd_sect_type type, const void *buf, int sz) rd_write_section(t, b, s); \ } while (0) -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) -#endif -#undef ALIGN -#define ALIGN(v, a) (((v) + (a)-1) & ~((a)-1)) - #define min(a, b) (((a) < (b)) ? (a) : (b)) #define max(a, b) (((a) > (b)) ? 
(a) : (b)) diff --git a/mesa 3D driver/src/freedreno/decode/scripts/parse-submits.lua b/mesa 3D driver/src/freedreno/decode/scripts/parse-submits.lua index 96fa66957d..cd20b680d3 100644 --- a/mesa 3D driver/src/freedreno/decode/scripts/parse-submits.lua +++ b/mesa 3D driver/src/freedreno/decode/scripts/parse-submits.lua @@ -64,10 +64,6 @@ local depthwrite local stenciltest local stencilwrite -function start_cmdstream(name) - printf("Parsing %s\n", name) -end - function reset() dbg("reset\n") mrts = {} diff --git a/mesa 3D driver/src/freedreno/decode/util.h b/mesa 3D driver/src/freedreno/decode/util.h index 497f561500..fbc5a78c36 100644 --- a/mesa 3D driver/src/freedreno/decode/util.h +++ b/mesa 3D driver/src/freedreno/decode/util.h @@ -28,6 +28,8 @@ #include #include +#include "util/u_math.h" + /* old-style program binary XOR'd ascii w/ 0xff */ #ifndef ASCII_XOR #define ASCII_XOR 0 @@ -40,19 +42,6 @@ tab(int lvl) return &TAB[strlen(TAB) - lvl]; } -/* convert float to dword */ -static inline float -d2f(uint32_t d) -{ - union { - float f; - uint32_t d; - } u = { - .d = d, - }; - return u.f; -} - static inline void dump_hex(const void *buf, int sz) { @@ -68,7 +57,7 @@ dump_hex(const void *buf, int sz) d |= *(ptr++) << 0; d |= *(ptr++) << 8; d |= *(ptr++) << 16; - d |= *(ptr++) << 24; + d |= (uint32_t)*(ptr++) << 24; printf("%08x", d); @@ -99,9 +88,9 @@ dump_float(const void *buf, int sz) d |= *(ptr++) << 0; d |= *(ptr++) << 8; d |= *(ptr++) << 16; - d |= *(ptr++) << 24; + d |= (uint32_t)*(ptr++) << 24; - printf("%8f", d2f(d)); + printf("%8f", uif(d)); if ((i % 8) == 7) { printf("\n"); @@ -171,7 +160,7 @@ dump_hex_ascii(const void *buf, int sz, int level) d |= *(ptr++) << 0; d |= *(ptr++) << 8; d |= *(ptr++) << 16; - d |= *(ptr++) << 24; + d |= (uint32_t)*(ptr++) << 24; printf("%08x", d); diff --git a/mesa 3D driver/src/freedreno/drm/freedreno_bo.c b/mesa 3D driver/src/freedreno/drm/freedreno_bo.c index 7be0c7839a..15491b965d 100644 --- a/mesa 3D driver/src/freedreno/drm/freedreno_bo.c +++ b/mesa 3D driver/src/freedreno/drm/freedreno_bo.c @@ -77,7 +77,7 @@ bo_from_handle(struct fd_device *dev, uint32_t size, uint32_t handle) bo->size = size; bo->handle = handle; bo->iova = bo->funcs->iova(bo); - bo->flags = FD_RELOC_FLAGS_INIT; + bo->reloc_flags = FD_RELOC_FLAGS_INIT; p_atomic_set(&bo->refcnt, 1); list_inithead(&bo->list); @@ -94,6 +94,10 @@ bo_new(struct fd_device *dev, uint32_t size, uint32_t flags, uint32_t handle; int ret; + /* demote cached-coherent to WC if not supported: */ + if ((flags & FD_BO_CACHED_COHERENT) && !dev->has_cached_coherent) + flags &= ~FD_BO_CACHED_COHERENT; + bo = fd_bo_cache_alloc(cache, &size, flags); if (bo) return bo; @@ -106,6 +110,7 @@ bo_new(struct fd_device *dev, uint32_t size, uint32_t flags, bo = bo_from_handle(dev, size, handle); simple_mtx_unlock(&table_lock); + bo->alloc_flags = flags; bo->max_fences = 1; bo->fences = &bo->_inline_fence; @@ -137,11 +142,11 @@ _fd_bo_set_name(struct fd_bo *bo, const char *fmt, va_list ap) struct fd_bo * fd_bo_new_ring(struct fd_device *dev, uint32_t size) { - uint32_t flags = FD_BO_GPUREADONLY; + uint32_t flags = FD_BO_GPUREADONLY | FD_BO_CACHED_COHERENT; struct fd_bo *bo = bo_new(dev, size, flags, &dev->ring_cache); if (bo) { bo->bo_reuse = RING_CACHE; - bo->flags |= FD_RELOC_DUMP; + bo->reloc_flags |= FD_RELOC_DUMP; fd_bo_set_name(bo, "cmdstream"); } return bo; @@ -239,7 +244,7 @@ fd_bo_from_name(struct fd_device *dev, uint32_t name) void fd_bo_mark_for_dump(struct fd_bo *bo) { - bo->flags |= FD_RELOC_DUMP; + 
bo->reloc_flags |= FD_RELOC_DUMP; } uint64_t @@ -315,7 +320,8 @@ cleanup_fences(struct fd_bo *bo, bool expired) if (expired && fd_fence_before(f->pipe->control->fence, f->fence)) continue; - fd_pipe_del_locked(f->pipe); + struct fd_pipe *pipe = f->pipe; + bo->nr_fences--; if (bo->nr_fences > 0) { @@ -323,6 +329,8 @@ cleanup_fences(struct fd_bo *bo, bool expired) bo->fences[i] = bo->fences[bo->nr_fences]; i--; } + + fd_pipe_del_locked(pipe); } } @@ -427,6 +435,12 @@ fd_bo_size(struct fd_bo *bo) return bo->size; } +bool +fd_bo_is_cached(struct fd_bo *bo) +{ + return !!(bo->alloc_flags & FD_BO_CACHED_COHERENT); +} + void * fd_bo_map(struct fd_bo *bo) { diff --git a/mesa 3D driver/src/freedreno/drm/freedreno_bo_cache.c b/mesa 3D driver/src/freedreno/drm/freedreno_bo_cache.c index 6a028d9ae4..0a70cb531f 100644 --- a/mesa 3D driver/src/freedreno/drm/freedreno_bo_cache.c +++ b/mesa 3D driver/src/freedreno/drm/freedreno_bo_cache.c @@ -81,6 +81,8 @@ fd_bo_cache_cleanup(struct fd_bo_cache *cache, time_t time) { int i; + simple_mtx_assert_locked(&table_lock); + if (cache->time == time) return; @@ -135,13 +137,13 @@ find_in_bucket(struct fd_bo_bucket *bucket, uint32_t flags) * (MRU, since likely to be in GPU cache), rather than head (LRU).. */ simple_mtx_lock(&table_lock); - if (!list_is_empty(&bucket->list)) { - bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list); - /* TODO check for compatible flags? */ - if (fd_bo_state(bo) == FD_BO_STATE_IDLE) { + list_for_each_entry (struct fd_bo, entry, &bucket->list, list) { + if (fd_bo_state(entry) != FD_BO_STATE_IDLE) + break; + if (entry->alloc_flags == flags) { + bo = entry; list_del(&bo->list); - } else { - bo = NULL; + break; } } simple_mtx_unlock(&table_lock); @@ -174,7 +176,7 @@ fd_bo_cache_alloc(struct fd_bo_cache *cache, uint32_t *size, uint32_t flags) goto retry; } p_atomic_set(&bo->refcnt, 1); - bo->flags = FD_RELOC_FLAGS_INIT; + bo->reloc_flags = FD_RELOC_FLAGS_INIT; return bo; } } @@ -185,6 +187,11 @@ fd_bo_cache_alloc(struct fd_bo_cache *cache, uint32_t *size, uint32_t flags) int fd_bo_cache_free(struct fd_bo_cache *cache, struct fd_bo *bo) { + simple_mtx_assert_locked(&table_lock); + + if (bo->nosync || bo->shared) + return -1; + struct fd_bo_bucket *bucket = get_bucket(cache, bo->size); /* see if we can be green and recycle: */ diff --git a/mesa 3D driver/src/freedreno/drm/freedreno_device.c b/mesa 3D driver/src/freedreno/drm/freedreno_device.c index dc01f65abd..31c306fb04 100644 --- a/mesa 3D driver/src/freedreno/drm/freedreno_device.c +++ b/mesa 3D driver/src/freedreno/drm/freedreno_device.c @@ -86,6 +86,7 @@ fd_device_new(int fd) list_inithead(&dev->deferred_submits); simple_mtx_init(&dev->submit_lock, mtx_plain); + simple_mtx_init(&dev->suballoc_lock, mtx_plain); return dev; } @@ -130,6 +131,9 @@ fd_device_del_impl(struct fd_device *dev) assert(list_is_empty(&dev->deferred_submits)); + if (dev->suballoc_bo) + fd_bo_del_locked(dev->suballoc_bo); + fd_bo_cache_cleanup(&dev->bo_cache, 0); fd_bo_cache_cleanup(&dev->ring_cache, 0); _mesa_hash_table_destroy(dev->handle_table, NULL); diff --git a/mesa 3D driver/src/freedreno/drm/freedreno_drmif.h b/mesa 3D driver/src/freedreno/drm/freedreno_drmif.h index 67b784acbe..5c2b1d9ad1 100644 --- a/mesa 3D driver/src/freedreno/drm/freedreno_drmif.h +++ b/mesa 3D driver/src/freedreno/drm/freedreno_drmif.h @@ -97,9 +97,10 @@ struct fd_fence { }; /* bo flags: */ -#define FD_BO_GPUREADONLY BITSET_BIT(1) -#define FD_BO_SCANOUT BITSET_BIT(2) -/* Default caching is WRITECOMBINE, we can add new bo flags later 
for cached/etc */ +#define FD_BO_GPUREADONLY BITSET_BIT(1) +#define FD_BO_SCANOUT BITSET_BIT(2) +#define FD_BO_CACHED_COHERENT BITSET_BIT(3) +/* Default caching is WRITECOMBINE */ /* bo access flags: (keep aligned to MSM_PREP_x) */ #define FD_BO_PREP_READ BITSET_BIT(0) @@ -128,6 +129,8 @@ enum fd_version { FD_VERSION_SOFTPIN = 4, /* adds softpin, bo name, and dump flag */ FD_VERSION_ROBUSTNESS = 5, /* adds FD_NR_FAULTS and FD_PP_PGTABLE */ FD_VERSION_MEMORY_FD = 2, /* supports shared memory objects */ + FD_VERSION_SUSPENDS = 7, /* Adds MSM_PARAM_SUSPENDS to detect device suspend */ + FD_VERSION_CACHED_COHERENT = 8, /* Adds cached-coherent support (a6xx+) */ }; enum fd_version fd_device_version(struct fd_device *dev); @@ -206,6 +209,7 @@ uint32_t fd_bo_size(struct fd_bo *bo); void *fd_bo_map(struct fd_bo *bo); int fd_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op); void fd_bo_cpu_fini(struct fd_bo *bo); +bool fd_bo_is_cached(struct fd_bo *bo); #ifdef __cplusplus } /* end of extern "C" */ diff --git a/mesa 3D driver/src/freedreno/drm/freedreno_pipe.c b/mesa 3D driver/src/freedreno/drm/freedreno_pipe.c index 53fd808821..83052b98d1 100644 --- a/mesa 3D driver/src/freedreno/drm/freedreno_pipe.c +++ b/mesa 3D driver/src/freedreno/drm/freedreno_pipe.c @@ -64,7 +64,8 @@ fd_pipe_new2(struct fd_device *dev, enum fd_pipe_id id, uint32_t prio) pipe->dev_id.chip_id = val; pipe->control_mem = fd_bo_new(dev, sizeof(*pipe->control), - 0, "pipe-control"); + FD_BO_CACHED_COHERENT, + "pipe-control"); pipe->control = fd_bo_map(pipe->control_mem); /* We could be getting a bo from the bo-cache, make sure the fence value diff --git a/mesa 3D driver/src/freedreno/drm/freedreno_priv.h b/mesa 3D driver/src/freedreno/drm/freedreno_priv.h index 6e3c610ee0..bb165b8728 100644 --- a/mesa 3D driver/src/freedreno/drm/freedreno_priv.h +++ b/mesa 3D driver/src/freedreno/drm/freedreno_priv.h @@ -130,7 +130,9 @@ struct fd_device { struct fd_bo_cache bo_cache; struct fd_bo_cache ring_cache; - int closefd; /* call close(fd) upon destruction */ + bool has_cached_coherent; + + bool closefd; /* call close(fd) upon destruction */ /* just for valgrind: */ int bo_size; @@ -146,6 +148,23 @@ struct fd_device { struct list_head deferred_submits; unsigned deferred_cmds; simple_mtx_t submit_lock; + + /** + * BO for suballocating long-lived state objects. + * + * Note: one would be tempted to put this in fd_pipe to avoid locking. + * But that is a bad idea for a couple of reasons: + * + * 1) With TC, stateobj allocation can happen in either frontend thread + * (ie. most CSOs), and also driver thread (a6xx cached tex state) + * 2) It is best for fd_pipe to not hold a reference to a BO that can + * be free'd to bo cache, as that can cause unexpected re-entrancy + * (fd_bo_cache_alloc() -> find_in_bucket() -> fd_bo_state() -> + * cleanup_fences() -> drop pipe ref which free's bo's). + */ + struct fd_bo *suballoc_bo; + uint32_t suballoc_offset; + simple_mtx_t suballoc_lock; }; #define foreach_submit(name, list) \ @@ -284,7 +303,8 @@ struct fd_bo { uint32_t handle; uint32_t name; int32_t refcnt; - uint32_t flags; /* flags like FD_RELOC_DUMP to use for relocs to this BO */ + uint32_t reloc_flags; /* flags like FD_RELOC_DUMP to use for relocs to this BO */ + uint32_t alloc_flags; /* flags that control allocation/mapping, ie. 
FD_BO_x */ uint64_t iova; void *map; const struct fd_bo_funcs *funcs; diff --git a/mesa 3D driver/src/freedreno/drm/freedreno_ringbuffer.h b/mesa 3D driver/src/freedreno/drm/freedreno_ringbuffer.h index 6fc49f72b8..ae985801ac 100644 --- a/mesa 3D driver/src/freedreno/drm/freedreno_ringbuffer.h +++ b/mesa 3D driver/src/freedreno/drm/freedreno_ringbuffer.h @@ -256,6 +256,13 @@ fd_ringbuffer_size(struct fd_ringbuffer *ring) return offset_bytes(ring->cur, ring->start); } +static inline bool +fd_ringbuffer_empty(struct fd_ringbuffer *ring) +{ + return (fd_ringbuffer_cmd_count(ring) == 1) && + (offset_bytes(ring->cur, ring->start) == 0); +} + #define LOG_DWORDS 0 static inline void diff --git a/mesa 3D driver/src/freedreno/drm/msm_bo.c b/mesa 3D driver/src/freedreno/drm/msm_bo.c index 3f43793648..5963cac86b 100644 --- a/mesa 3D driver/src/freedreno/drm/msm_bo.c +++ b/mesa 3D driver/src/freedreno/drm/msm_bo.c @@ -170,7 +170,6 @@ msm_bo_new_handle(struct fd_device *dev, uint32_t size, uint32_t flags, { struct drm_msm_gem_new req = { .size = size, - .flags = MSM_BO_WC, // TODO figure out proper flags.. }; int ret; @@ -180,6 +179,11 @@ msm_bo_new_handle(struct fd_device *dev, uint32_t size, uint32_t flags, if (flags & FD_BO_GPUREADONLY) req.flags |= MSM_BO_GPU_READONLY; + if (flags & FD_BO_CACHED_COHERENT) + req.flags |= MSM_BO_CACHED_COHERENT; + else + req.flags |= MSM_BO_WC; + ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW, &req, sizeof(req)); if (ret) return ret; diff --git a/mesa 3D driver/src/freedreno/drm/msm_device.c b/mesa 3D driver/src/freedreno/drm/msm_device.c index 93abf4844b..a221fdc4ca 100644 --- a/mesa 3D driver/src/freedreno/drm/msm_device.c +++ b/mesa 3D driver/src/freedreno/drm/msm_device.c @@ -73,6 +73,27 @@ msm_device_new(int fd, drmVersionPtr version) util_queue_init(&msm_dev->submit_queue, "sq", 8, 1, 0, NULL); } + if (version->version_minor >= FD_VERSION_CACHED_COHERENT) { + struct drm_msm_gem_new new_req = { + .size = 0x1000, + .flags = MSM_BO_CACHED_COHERENT, + }; + + /* The kernel is new enough to support MSM_BO_CACHED_COHERENT, + * but that is not a guarantee that the device we are running + * on supports it. So do a test allocation to find out. 
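+ *
+ * (Illustrative aside, not part of the original change: once this probe
+ * sets dev->has_cached_coherent, callers can opt in per allocation, e.g.
+ *
+ *    uint32_t flags = dev->has_cached_coherent ? FD_BO_CACHED_COHERENT : 0;
+ *    struct fd_bo *bo = fd_bo_new(dev, size, flags, "some-use");
+ *
+ * where "some-use" and size are assumed values; msm_bo_new_handle() maps
+ * the flag to MSM_BO_CACHED_COHERENT and falls back to MSM_BO_WC.)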
+ */ + if (!drmCommandWriteRead(fd, DRM_MSM_GEM_NEW, + &new_req, sizeof(new_req))) { + struct drm_gem_close close_req = { + .handle = new_req.handle, + }; + drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close_req); + + dev->has_cached_coherent = true; + } + } + dev->bo_size = sizeof(struct msm_bo); return dev; diff --git a/mesa 3D driver/src/freedreno/drm/msm_pipe.c b/mesa 3D driver/src/freedreno/drm/msm_pipe.c index 0c35063c35..4a71a9f940 100644 --- a/mesa 3D driver/src/freedreno/drm/msm_pipe.c +++ b/mesa 3D driver/src/freedreno/drm/msm_pipe.c @@ -172,9 +172,6 @@ msm_pipe_destroy(struct fd_pipe *pipe) { struct msm_pipe *msm_pipe = to_msm_pipe(pipe); - if (msm_pipe->suballoc_bo) - fd_bo_del_locked(msm_pipe->suballoc_bo); - close_submitqueue(pipe, msm_pipe->queue_id); msm_pipe_sp_ringpool_fini(msm_pipe); free(msm_pipe); } diff --git a/mesa 3D driver/src/freedreno/drm/msm_priv.h b/mesa 3D driver/src/freedreno/drm/msm_priv.h index 53e61e5896..9e0211d922 100644 --- a/mesa 3D driver/src/freedreno/drm/msm_priv.h +++ b/mesa 3D driver/src/freedreno/drm/msm_priv.h @@ -30,6 +30,9 @@ #include "freedreno_priv.h" #include "util/slab.h" +#include "util/timespec.h" + +#include "pipe/p_defines.h" #ifndef __user #define __user @@ -39,7 +42,6 @@ struct msm_device { struct fd_device base; - struct fd_bo_cache ring_cache; struct util_queue submit_queue; }; FD_DEFINE_CAST(fd_device, msm_device); @@ -56,10 +58,6 @@ struct msm_pipe { uint32_t queue_id; struct slab_parent_pool ring_pool; - /* BO for suballocating long-lived objects on the pipe. */ - struct fd_bo *suballoc_bo; - uint32_t suballoc_offset; - /** * The last fence seqno that was flushed to kernel (doesn't mean that it * is complete, just that the kernel knows about it) @@ -132,9 +130,17 @@ static inline void get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns) { struct timespec t; + + if (ns == PIPE_TIMEOUT_INFINITE) + ns = 3600ULL * NSEC_PER_SEC; /* 1 hour timeout is almost infinite */ + clock_gettime(CLOCK_MONOTONIC, &t); - tv->tv_sec = t.tv_sec + ns / 1000000000; - tv->tv_nsec = t.tv_nsec + ns % 1000000000; + tv->tv_sec = t.tv_sec + ns / NSEC_PER_SEC; + tv->tv_nsec = t.tv_nsec + ns % NSEC_PER_SEC; + if (tv->tv_nsec >= NSEC_PER_SEC) { /* handle nsec overflow */ + tv->tv_nsec -= NSEC_PER_SEC; + tv->tv_sec++; + } } #endif /* MSM_PRIV_H_ */ diff --git a/mesa 3D driver/src/freedreno/drm/msm_ringbuffer.c b/mesa 3D driver/src/freedreno/drm/msm_ringbuffer.c index 3520b886e3..43bd83937e 100644 --- a/mesa 3D driver/src/freedreno/drm/msm_ringbuffer.c +++ b/mesa 3D driver/src/freedreno/drm/msm_ringbuffer.c @@ -149,7 +149,7 @@ append_bo(struct msm_submit *submit, struct fd_bo *bo) idx = APPEND( submit, submit_bos, (struct drm_msm_gem_submit_bo){ - .flags = bo->flags & (MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE), + .flags = bo->reloc_flags & (MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE), .handle = bo->handle, .presumed = 0, }); diff --git a/mesa 3D driver/src/freedreno/drm/msm_ringbuffer_sp.c b/mesa 3D driver/src/freedreno/drm/msm_ringbuffer_sp.c index 3604e3d647..e84e37d894 100644 --- a/mesa 3D driver/src/freedreno/drm/msm_ringbuffer_sp.c +++ b/mesa 3D driver/src/freedreno/drm/msm_ringbuffer_sp.c @@ -366,7 +366,7 @@ flush_submit_list(struct list_head *submit_list) } for (unsigned i = 0; i < msm_submit->nr_bos; i++) { - submit_bos[i].flags = msm_submit->bos[i]->flags; + submit_bos[i].flags = msm_submit->bos[i]->reloc_flags; submit_bos[i].handle = msm_submit->bos[i]->handle; submit_bos[i].presumed = 0; } @@ -825,34 +825,33 @@ msm_ringbuffer_sp_init(struct msm_ringbuffer_sp
*msm_ring, uint32_t size, struct fd_ringbuffer * msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size) { - struct msm_pipe *msm_pipe = to_msm_pipe(pipe); + struct fd_device *dev = pipe->dev; struct msm_ringbuffer_sp *msm_ring = malloc(sizeof(*msm_ring)); /* Lock access to the msm_pipe->suballoc_* since ringbuffer object allocation * can happen both on the frontend (most CSOs) and the driver thread (a6xx * cached tex state, for example) */ - static simple_mtx_t suballoc_lock = _SIMPLE_MTX_INITIALIZER_NP; - simple_mtx_lock(&suballoc_lock); + simple_mtx_lock(&dev->suballoc_lock); /* Maximum known alignment requirement is a6xx's TEX_CONST at 16 dwords */ - msm_ring->offset = align(msm_pipe->suballoc_offset, 64); - if (!msm_pipe->suballoc_bo || - msm_ring->offset + size > fd_bo_size(msm_pipe->suballoc_bo)) { - if (msm_pipe->suballoc_bo) - fd_bo_del(msm_pipe->suballoc_bo); - msm_pipe->suballoc_bo = - fd_bo_new_ring(pipe->dev, MAX2(SUBALLOC_SIZE, align(size, 4096))); + msm_ring->offset = align(dev->suballoc_offset, 64); + if (!dev->suballoc_bo || + msm_ring->offset + size > fd_bo_size(dev->suballoc_bo)) { + if (dev->suballoc_bo) + fd_bo_del(dev->suballoc_bo); + dev->suballoc_bo = + fd_bo_new_ring(dev, MAX2(SUBALLOC_SIZE, align(size, 4096))); msm_ring->offset = 0; } msm_ring->u.pipe = pipe; - msm_ring->ring_bo = fd_bo_ref(msm_pipe->suballoc_bo); + msm_ring->ring_bo = fd_bo_ref(dev->suballoc_bo); msm_ring->base.refcnt = 1; - msm_pipe->suballoc_offset = msm_ring->offset + size; + dev->suballoc_offset = msm_ring->offset + size; - simple_mtx_unlock(&suballoc_lock); + simple_mtx_unlock(&dev->suballoc_lock); return msm_ringbuffer_sp_init(msm_ring, size, _FD_RINGBUFFER_OBJECT); } diff --git a/mesa 3D driver/src/freedreno/ds/meson.build b/mesa 3D driver/src/freedreno/ds/meson.build index 89d2b7e6eb..50f099a9e0 100644 --- a/mesa 3D driver/src/freedreno/ds/meson.build +++ b/mesa 3D driver/src/freedreno/ds/meson.build @@ -19,6 +19,7 @@ pps_freedreno_lib = static_library( dependencies: [ dep_libdrm, dep_perfetto, + dep_valgrind, ], cpp_args: '-std=c++17' ) diff --git a/mesa 3D driver/src/freedreno/fdl/fd6_format_table.c b/mesa 3D driver/src/freedreno/fdl/fd6_format_table.c new file mode 100644 index 0000000000..d99d0299c6 --- /dev/null +++ b/mesa 3D driver/src/freedreno/fdl/fd6_format_table.c @@ -0,0 +1,467 @@ +/* + * Copyright (C) 2016 Rob Clark + * Copyright © 2018 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Rob Clark + */ + +#include "pipe/p_defines.h" +#include "util/format/u_format.h" + +#include "fd6_format_table.h" + +/* Specifies the table of all the formats and their features. Also supplies + * the helpers that look up various data in those tables. + */ + +struct fd6_format { + enum a6xx_format vtx; + enum a6xx_format tex; + enum a6xx_format rb; + enum a3xx_color_swap swap; + boolean present; +}; + +#define FMT(pipe, vtxfmt, texfmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = FMT6_##vtxfmt, \ + .tex = FMT6_##texfmt, \ + .rb = FMT6_##rbfmt, \ + .swap = swapfmt} + +/* vertex + texture + color */ +#define VTC(pipe, fmt, swapfmt) FMT(pipe, fmt, fmt, fmt, swapfmt) + +#define _TC(pipe, fmt, swapfmt) FMT(pipe, NONE, fmt, fmt, swapfmt) +#define _T_(pipe, fmt, swapfmt) FMT(pipe, NONE, fmt, NONE, swapfmt) +#define VT_(pipe, fmt, swapfmt) FMT(pipe, fmt, fmt, NONE, swapfmt) +#define V__(pipe, fmt, swapfmt) FMT(pipe, fmt, NONE, NONE, swapfmt) + +/* clang-format off */ +static const struct fd6_format formats[PIPE_FORMAT_COUNT] = { + /* 8-bit */ + VTC(R8_UNORM, 8_UNORM, WZYX), + VTC(R8_SNORM, 8_SNORM, WZYX), + VTC(R8_UINT, 8_UINT, WZYX), + VTC(R8_SINT, 8_SINT, WZYX), + V__(R8_USCALED, 8_UINT, WZYX), + V__(R8_SSCALED, 8_SINT, WZYX), + _TC(R8_SRGB, 8_UNORM, WZYX), + _TC(Y8_UNORM, NV12_Y, WZYX), + + FMT(A8_UNORM, NONE, 8_UNORM, A8_UNORM, WZYX), + _TC(L8_UNORM, 8_UNORM, WZYX), + _T_(I8_UNORM, 8_UNORM, WZYX), + + _T_(A8_UINT, 8_UINT, WZYX), + _T_(A8_SINT, 8_SINT, WZYX), + _T_(L8_UINT, 8_UINT, WZYX), + _T_(L8_SINT, 8_SINT, WZYX), + _T_(I8_UINT, 8_UINT, WZYX), + _T_(I8_SINT, 8_SINT, WZYX), + + _TC(S8_UINT, 8_UINT, WZYX), + + /* 16-bit */ + VTC(R16_UNORM, 16_UNORM, WZYX), + VTC(R16_SNORM, 16_SNORM, WZYX), + VTC(R16_UINT, 16_UINT, WZYX), + VTC(R16_SINT, 16_SINT, WZYX), + V__(R16_USCALED, 16_UINT, WZYX), + V__(R16_SSCALED, 16_SINT, WZYX), + VTC(R16_FLOAT, 16_FLOAT, WZYX), + _TC(Z16_UNORM, 16_UNORM, WZYX), + + _T_(A16_UNORM, 16_UNORM, WZYX), + _T_(A16_SNORM, 16_SNORM, WZYX), + _T_(A16_UINT, 16_UINT, WZYX), + _T_(A16_SINT, 16_SINT, WZYX), + _T_(L16_UNORM, 16_UNORM, WZYX), + _T_(L16_SNORM, 16_SNORM, WZYX), + _T_(L16_UINT, 16_UINT, WZYX), + _T_(L16_SINT, 16_SINT, WZYX), + _T_(I16_UNORM, 16_UNORM, WZYX), + _T_(I16_SNORM, 16_SNORM, WZYX), + _T_(I16_UINT, 16_UINT, WZYX), + _T_(I16_SINT, 16_SINT, WZYX), + + VTC(R8G8_UNORM, 8_8_UNORM, WZYX), + VTC(R8G8_SNORM, 8_8_SNORM, WZYX), + VTC(R8G8_UINT, 8_8_UINT, WZYX), + VTC(R8G8_SINT, 8_8_SINT, WZYX), + V__(R8G8_USCALED, 8_8_UINT, WZYX), + V__(R8G8_SSCALED, 8_8_SINT, WZYX), + _TC(R8G8_SRGB, 8_8_UNORM, WZYX), + + _T_(L8A8_UINT, 8_8_UINT, WZYX), + _T_(L8A8_SINT, 8_8_SINT, WZYX), + + _TC(R5G6B5_UNORM, 5_6_5_UNORM, WZYX), + _TC(B5G6R5_UNORM, 5_6_5_UNORM, WXYZ), + + _TC(B5G5R5A1_UNORM, 5_5_5_1_UNORM, WXYZ), + _TC(B5G5R5X1_UNORM, 5_5_5_1_UNORM, WXYZ), + _TC(A1R5G5B5_UNORM, 5_5_5_1_UNORM, ZYXW), + _TC(A1B5G5R5_UNORM, 5_5_5_1_UNORM, XYZW), + + _TC(R4G4B4A4_UNORM, 4_4_4_4_UNORM, WZYX), + _TC(B4G4R4A4_UNORM, 4_4_4_4_UNORM, WXYZ), + _TC(A4R4G4B4_UNORM, 4_4_4_4_UNORM, ZYXW), + _TC(A4B4G4R4_UNORM, 4_4_4_4_UNORM, XYZW), + + /* 24-bit */ + V__(R8G8B8_UNORM, 8_8_8_UNORM, WZYX), + V__(R8G8B8_SNORM, 8_8_8_SNORM, WZYX), + V__(R8G8B8_UINT, 8_8_8_UINT, WZYX), + V__(R8G8B8_SINT, 8_8_8_SINT, WZYX), + V__(R8G8B8_USCALED, 8_8_8_UINT, WZYX), + V__(R8G8B8_SSCALED, 8_8_8_SINT, WZYX), + + /* 32-bit */ + V__(R32_UNORM, 32_UNORM, WZYX), + V__(R32_SNORM, 32_SNORM, WZYX), + VTC(R32_UINT, 32_UINT, WZYX), + VTC(R32_SINT, 32_SINT, WZYX), + V__(R32_USCALED, 32_UINT, 
WZYX), + V__(R32_SSCALED, 32_SINT, WZYX), + VTC(R32_FLOAT, 32_FLOAT, WZYX), + V__(R32_FIXED, 32_FIXED, WZYX), + + _T_(A32_UINT, 32_UINT, WZYX), + _T_(A32_SINT, 32_SINT, WZYX), + _T_(L32_UINT, 32_UINT, WZYX), + _T_(L32_SINT, 32_SINT, WZYX), + _T_(I32_UINT, 32_UINT, WZYX), + _T_(I32_SINT, 32_SINT, WZYX), + + VTC(R16G16_UNORM, 16_16_UNORM, WZYX), + VTC(R16G16_SNORM, 16_16_SNORM, WZYX), + VTC(R16G16_UINT, 16_16_UINT, WZYX), + VTC(R16G16_SINT, 16_16_SINT, WZYX), + V__(R16G16_USCALED, 16_16_UINT, WZYX), + V__(R16G16_SSCALED, 16_16_SINT, WZYX), + VTC(R16G16_FLOAT, 16_16_FLOAT, WZYX), + + _T_(L16A16_UNORM, 16_16_UNORM, WZYX), + _T_(L16A16_SNORM, 16_16_SNORM, WZYX), + _T_(L16A16_UINT, 16_16_UINT, WZYX), + _T_(L16A16_SINT, 16_16_SINT, WZYX), + + VTC(R8G8B8A8_UNORM, 8_8_8_8_UNORM, WZYX), + _TC(R8G8B8X8_UNORM, 8_8_8_8_UNORM, WZYX), + _TC(R8G8B8A8_SRGB, 8_8_8_8_UNORM, WZYX), + _TC(R8G8B8X8_SRGB, 8_8_8_8_UNORM, WZYX), + VTC(R8G8B8A8_SNORM, 8_8_8_8_SNORM, WZYX), + VTC(R8G8B8A8_UINT, 8_8_8_8_UINT, WZYX), + VTC(R8G8B8A8_SINT, 8_8_8_8_SINT, WZYX), + V__(R8G8B8A8_USCALED, 8_8_8_8_UINT, WZYX), + V__(R8G8B8A8_SSCALED, 8_8_8_8_SINT, WZYX), + + VTC(B8G8R8A8_UNORM, 8_8_8_8_UNORM, WXYZ), + _TC(B8G8R8X8_UNORM, 8_8_8_8_UNORM, WXYZ), + _TC(B8G8R8A8_SRGB, 8_8_8_8_UNORM, WXYZ), + _TC(B8G8R8X8_SRGB, 8_8_8_8_UNORM, WXYZ), + VTC(B8G8R8A8_SNORM, 8_8_8_8_SNORM, WXYZ), + VTC(B8G8R8A8_UINT, 8_8_8_8_UINT, WXYZ), + VTC(B8G8R8A8_SINT, 8_8_8_8_SINT, WXYZ), + V__(B8G8R8A8_USCALED, 8_8_8_8_UINT, WXYZ), + V__(B8G8R8A8_SSCALED, 8_8_8_8_SINT, WXYZ), + + VTC(A8B8G8R8_UNORM, 8_8_8_8_UNORM, XYZW), + _TC(X8B8G8R8_UNORM, 8_8_8_8_UNORM, XYZW), + _TC(A8B8G8R8_SRGB, 8_8_8_8_UNORM, XYZW), + _TC(X8B8G8R8_SRGB, 8_8_8_8_UNORM, XYZW), + + VTC(A8R8G8B8_UNORM, 8_8_8_8_UNORM, ZYXW), + _TC(X8R8G8B8_UNORM, 8_8_8_8_UNORM, ZYXW), + _TC(A8R8G8B8_SRGB, 8_8_8_8_UNORM, ZYXW), + _TC(X8R8G8B8_SRGB, 8_8_8_8_UNORM, ZYXW), + + FMT(R10G10B10A2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM_DEST, WZYX), + FMT(B10G10R10A2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM_DEST, WXYZ), + FMT(B10G10R10X2_UNORM, NONE, 10_10_10_2_UNORM, 10_10_10_2_UNORM_DEST, WXYZ), + V__(R10G10B10A2_SNORM, 10_10_10_2_SNORM, WZYX), + V__(B10G10R10A2_SNORM, 10_10_10_2_SNORM, WXYZ), + VTC(R10G10B10A2_UINT, 10_10_10_2_UINT, WZYX), + V__(R10G10B10A2_SINT, 10_10_10_2_SINT, WZYX), + VTC(B10G10R10A2_UINT, 10_10_10_2_UINT, WXYZ), + V__(B10G10R10A2_SINT, 10_10_10_2_SINT, WXYZ), + V__(R10G10B10A2_USCALED, 10_10_10_2_UINT, WZYX), + V__(B10G10R10A2_USCALED, 10_10_10_2_UINT, WXYZ), + V__(R10G10B10A2_SSCALED, 10_10_10_2_SINT, WZYX), + V__(B10G10R10A2_SSCALED, 10_10_10_2_SINT, WXYZ), + + VTC(R11G11B10_FLOAT, 11_11_10_FLOAT, WZYX), + _T_(R9G9B9E5_FLOAT, 9_9_9_E5_FLOAT, WZYX), + + _TC(Z24X8_UNORM, Z24_UNORM_S8_UINT, WZYX), + _TC(X24S8_UINT, 8_8_8_8_UINT, WZYX), + _TC(Z24_UNORM_S8_UINT, Z24_UNORM_S8_UINT, WZYX), + _TC(Z32_FLOAT, 32_FLOAT, WZYX), + _TC(Z32_FLOAT_S8X24_UINT, 32_FLOAT, WZYX), + _TC(X32_S8X24_UINT, 8_UINT, WZYX), + + /* special format for blits: */ + _TC(Z24_UNORM_S8_UINT_AS_R8G8B8A8, Z24_UNORM_S8_UINT_AS_R8G8B8A8, WZYX), + + /* 48-bit */ + V__(R16G16B16_UNORM, 16_16_16_UNORM, WZYX), + V__(R16G16B16_SNORM, 16_16_16_SNORM, WZYX), + V__(R16G16B16_UINT, 16_16_16_UINT, WZYX), + V__(R16G16B16_SINT, 16_16_16_SINT, WZYX), + V__(R16G16B16_USCALED, 16_16_16_UINT, WZYX), + V__(R16G16B16_SSCALED, 16_16_16_SINT, WZYX), + V__(R16G16B16_FLOAT, 16_16_16_FLOAT, WZYX), + + /* 64-bit */ + VTC(R16G16B16A16_UNORM, 16_16_16_16_UNORM, WZYX), + VTC(R16G16B16X16_UNORM, 16_16_16_16_UNORM, 
WZYX), + VTC(R16G16B16A16_SNORM, 16_16_16_16_SNORM, WZYX), + VTC(R16G16B16X16_SNORM, 16_16_16_16_SNORM, WZYX), + VTC(R16G16B16A16_UINT, 16_16_16_16_UINT, WZYX), + VTC(R16G16B16X16_UINT, 16_16_16_16_UINT, WZYX), + VTC(R16G16B16A16_SINT, 16_16_16_16_SINT, WZYX), + VTC(R16G16B16X16_SINT, 16_16_16_16_SINT, WZYX), + V__(R16G16B16A16_USCALED, 16_16_16_16_UINT, WZYX), + V__(R16G16B16A16_SSCALED, 16_16_16_16_SINT, WZYX), + VTC(R16G16B16A16_FLOAT, 16_16_16_16_FLOAT, WZYX), + VTC(R16G16B16X16_FLOAT, 16_16_16_16_FLOAT, WZYX), + + V__(R32G32_UNORM, 32_32_UNORM, WZYX), + V__(R32G32_SNORM, 32_32_SNORM, WZYX), + VTC(R32G32_UINT, 32_32_UINT, WZYX), + VTC(R32G32_SINT, 32_32_SINT, WZYX), + V__(R32G32_USCALED, 32_32_UINT, WZYX), + V__(R32G32_SSCALED, 32_32_SINT, WZYX), + VTC(R32G32_FLOAT, 32_32_FLOAT, WZYX), + V__(R32G32_FIXED, 32_32_FIXED, WZYX), + + _T_(L32A32_UINT, 32_32_UINT, WZYX), + _T_(L32A32_SINT, 32_32_SINT, WZYX), + + /* 96-bit */ + V__(R32G32B32_UNORM, 32_32_32_UNORM, WZYX), + V__(R32G32B32_SNORM, 32_32_32_SNORM, WZYX), + VT_(R32G32B32_UINT, 32_32_32_UINT, WZYX), + VT_(R32G32B32_SINT, 32_32_32_SINT, WZYX), + V__(R32G32B32_USCALED, 32_32_32_UINT, WZYX), + V__(R32G32B32_SSCALED, 32_32_32_SINT, WZYX), + VT_(R32G32B32_FLOAT, 32_32_32_FLOAT, WZYX), + V__(R32G32B32_FIXED, 32_32_32_FIXED, WZYX), + + /* 128-bit */ + V__(R32G32B32A32_UNORM, 32_32_32_32_UNORM, WZYX), + V__(R32G32B32A32_SNORM, 32_32_32_32_SNORM, WZYX), + VTC(R32G32B32A32_UINT, 32_32_32_32_UINT, WZYX), + _TC(R32G32B32X32_UINT, 32_32_32_32_UINT, WZYX), + VTC(R32G32B32A32_SINT, 32_32_32_32_SINT, WZYX), + _TC(R32G32B32X32_SINT, 32_32_32_32_SINT, WZYX), + V__(R32G32B32A32_USCALED, 32_32_32_32_UINT, WZYX), + V__(R32G32B32A32_SSCALED, 32_32_32_32_SINT, WZYX), + VTC(R32G32B32A32_FLOAT, 32_32_32_32_FLOAT, WZYX), + _TC(R32G32B32X32_FLOAT, 32_32_32_32_FLOAT, WZYX), + V__(R32G32B32A32_FIXED, 32_32_32_32_FIXED, WZYX), + + /* compressed */ + _T_(ETC1_RGB8, ETC1, WZYX), + _T_(ETC2_RGB8, ETC2_RGB8, WZYX), + _T_(ETC2_SRGB8, ETC2_RGB8, WZYX), + _T_(ETC2_RGB8A1, ETC2_RGB8A1, WZYX), + _T_(ETC2_SRGB8A1, ETC2_RGB8A1, WZYX), + _T_(ETC2_RGBA8, ETC2_RGBA8, WZYX), + _T_(ETC2_SRGBA8, ETC2_RGBA8, WZYX), + _T_(ETC2_R11_UNORM, ETC2_R11_UNORM, WZYX), + _T_(ETC2_R11_SNORM, ETC2_R11_SNORM, WZYX), + _T_(ETC2_RG11_UNORM, ETC2_RG11_UNORM, WZYX), + _T_(ETC2_RG11_SNORM, ETC2_RG11_SNORM, WZYX), + + _T_(DXT1_RGB, DXT1, WZYX), + _T_(DXT1_SRGB, DXT1, WZYX), + _T_(DXT1_RGBA, DXT1, WZYX), + _T_(DXT1_SRGBA, DXT1, WZYX), + _T_(DXT3_RGBA, DXT3, WZYX), + _T_(DXT3_SRGBA, DXT3, WZYX), + _T_(DXT5_RGBA, DXT5, WZYX), + _T_(DXT5_SRGBA, DXT5, WZYX), + + _T_(BPTC_RGBA_UNORM, BPTC, WZYX), + _T_(BPTC_SRGBA, BPTC, WZYX), + _T_(BPTC_RGB_FLOAT, BPTC_FLOAT, WZYX), + _T_(BPTC_RGB_UFLOAT, BPTC_UFLOAT, WZYX), + + _T_(RGTC1_UNORM, RGTC1_UNORM, WZYX), + _T_(RGTC1_SNORM, RGTC1_SNORM, WZYX), + _T_(RGTC2_UNORM, RGTC2_UNORM, WZYX), + _T_(RGTC2_SNORM, RGTC2_SNORM, WZYX), + _T_(LATC1_UNORM, RGTC1_UNORM, WZYX), + _T_(LATC1_SNORM, RGTC1_SNORM, WZYX), + _T_(LATC2_UNORM, RGTC2_UNORM, WZYX), + _T_(LATC2_SNORM, RGTC2_SNORM, WZYX), + + _T_(ASTC_4x4, ASTC_4x4, WZYX), + _T_(ASTC_5x4, ASTC_5x4, WZYX), + _T_(ASTC_5x5, ASTC_5x5, WZYX), + _T_(ASTC_6x5, ASTC_6x5, WZYX), + _T_(ASTC_6x6, ASTC_6x6, WZYX), + _T_(ASTC_8x5, ASTC_8x5, WZYX), + _T_(ASTC_8x6, ASTC_8x6, WZYX), + _T_(ASTC_8x8, ASTC_8x8, WZYX), + _T_(ASTC_10x5, ASTC_10x5, WZYX), + _T_(ASTC_10x6, ASTC_10x6, WZYX), + _T_(ASTC_10x8, ASTC_10x8, WZYX), + _T_(ASTC_10x10, ASTC_10x10, WZYX), + _T_(ASTC_12x10, ASTC_12x10, WZYX), + _T_(ASTC_12x12, ASTC_12x12, WZYX), + + 
_T_(ASTC_4x4_SRGB, ASTC_4x4, WZYX), + _T_(ASTC_5x4_SRGB, ASTC_5x4, WZYX), + _T_(ASTC_5x5_SRGB, ASTC_5x5, WZYX), + _T_(ASTC_6x5_SRGB, ASTC_6x5, WZYX), + _T_(ASTC_6x6_SRGB, ASTC_6x6, WZYX), + _T_(ASTC_8x5_SRGB, ASTC_8x5, WZYX), + _T_(ASTC_8x6_SRGB, ASTC_8x6, WZYX), + _T_(ASTC_8x8_SRGB, ASTC_8x8, WZYX), + _T_(ASTC_10x5_SRGB, ASTC_10x5, WZYX), + _T_(ASTC_10x6_SRGB, ASTC_10x6, WZYX), + _T_(ASTC_10x8_SRGB, ASTC_10x8, WZYX), + _T_(ASTC_10x10_SRGB, ASTC_10x10, WZYX), + _T_(ASTC_12x10_SRGB, ASTC_12x10, WZYX), + _T_(ASTC_12x12_SRGB, ASTC_12x12, WZYX), + + _T_(R8G8_R8B8_UNORM, R8G8R8B8_422_UNORM, WZYX), /* YUYV */ + _T_(G8R8_B8R8_UNORM, G8R8B8R8_422_UNORM, WZYX), /* UYVY */ + + _T_(R8_G8B8_420_UNORM, R8_G8B8_2PLANE_420_UNORM, WZYX), + _T_(R8_G8_B8_420_UNORM, R8_G8_B8_3PLANE_420_UNORM, WZYX), +}; +/* clang-format on */ + +static enum a3xx_color_swap +fd6_pipe2swap(enum pipe_format format, enum a6xx_tile_mode tile_mode) +{ + if (!formats[format].present) + return WZYX; + + if (tile_mode) + return WZYX; + + return formats[format].swap; +} + +/* convert pipe format to vertex buffer format: */ +enum a6xx_format +fd6_vertex_format(enum pipe_format format) +{ + if (!formats[format].present) + return FMT6_NONE; + return formats[format].vtx; +} + +enum a3xx_color_swap +fd6_vertex_swap(enum pipe_format format) +{ + return fd6_pipe2swap(format, TILE6_LINEAR); +} + +/* convert pipe format to texture sampler format: */ +enum a6xx_format +fd6_texture_format(enum pipe_format format, enum a6xx_tile_mode tile_mode) +{ + if (!formats[format].present) + return FMT6_NONE; + + if (!tile_mode) { + switch (format) { + /* Linear ARGB/ABGR1555 has a special format for sampling (tiled + * 1555/5551 formats always have the same swizzle and layout). + */ + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A1B5G5R5_UNORM: + return FMT6_1_5_5_5_UNORM; + /* note: this may be more about UBWC than tiling, but we don't support + * tiled non-UBWC NV12 + */ + case PIPE_FORMAT_Y8_UNORM: + return FMT6_8_UNORM; + default: + break; + } + } + + return formats[format].tex; +} + +enum a3xx_color_swap +fd6_texture_swap(enum pipe_format format, enum a6xx_tile_mode tile_mode) +{ + if (!tile_mode) { + switch (format) { + case PIPE_FORMAT_A1R5G5B5_UNORM: + return WZYX; + case PIPE_FORMAT_A1B5G5R5_UNORM: + return WXYZ; + default: + break; + } + } + + return fd6_pipe2swap(format, tile_mode); +} + +/* convert pipe format to MRT / copydest format used for render-target: */ +enum a6xx_format +fd6_color_format(enum pipe_format format, enum a6xx_tile_mode tile_mode) +{ + if (!formats[format].present) + return FMT6_NONE; + + if (!tile_mode && format == PIPE_FORMAT_Y8_UNORM) + return FMT6_8_UNORM; + + return formats[format].rb; +} + +enum a3xx_color_swap +fd6_color_swap(enum pipe_format format, enum a6xx_tile_mode tile_mode) +{ + return fd6_pipe2swap(format, tile_mode); +} + +enum a6xx_depth_format +fd6_pipe2depth(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return DEPTH6_16; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return DEPTH6_24_8; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return DEPTH6_32; + default: + return ~0; + } +} diff --git a/mesa 3D driver/src/freedreno/fdl/fd6_format_table.h b/mesa 3D driver/src/freedreno/fdl/fd6_format_table.h new file mode 100644 index 0000000000..dc88f18649 --- /dev/null +++ b/mesa 3D driver/src/freedreno/fdl/fd6_format_table.h @@ -0,0 +1,51 @@ +/* + * 
Copyright (C) 2016 Rob Clark + * Copyright © 2018 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#ifndef FD6_FORMAT_TABLE_H +#define FD6_FORMAT_TABLE_H + +#include "util/format/u_format.h" +#include "util/u_math.h" + +#include "adreno_pm4.xml.h" +#include "adreno_common.xml.h" +#include "a6xx.xml.h" + +enum a6xx_depth_format fd6_pipe2depth(enum pipe_format format); + +enum a6xx_format fd6_vertex_format(enum pipe_format format) ATTRIBUTE_CONST; +enum a3xx_color_swap fd6_vertex_swap(enum pipe_format format) ATTRIBUTE_CONST; +enum a6xx_format fd6_texture_format(enum pipe_format format, + enum a6xx_tile_mode tile_mode) ATTRIBUTE_CONST; +enum a3xx_color_swap fd6_texture_swap(enum pipe_format format, + enum a6xx_tile_mode tile_mode) ATTRIBUTE_CONST; +enum a6xx_format fd6_color_format(enum pipe_format format, + enum a6xx_tile_mode tile_mode) ATTRIBUTE_CONST; +enum a3xx_color_swap fd6_color_swap(enum pipe_format format, + enum a6xx_tile_mode tile_mode) ATTRIBUTE_CONST; + +#endif /* FD6_FORMAT_TABLE_H */ diff --git a/mesa 3D driver/src/freedreno/fdl/fd6_layout.c b/mesa 3D driver/src/freedreno/fdl/fd6_layout.c index 0067adb52f..44d062e832 100644 --- a/mesa 3D driver/src/freedreno/fdl/fd6_layout.c +++ b/mesa 3D driver/src/freedreno/fdl/fd6_layout.c @@ -30,15 +30,15 @@ #include "freedreno_layout.h" static bool -is_r8g8(struct fdl_layout *layout) +is_r8g8(const struct fdl_layout *layout) { return layout->cpp == 2 && util_format_get_nr_components(layout->format) == 2; } void -fdl6_get_ubwc_blockwidth(struct fdl_layout *layout, uint32_t *blockwidth, - uint32_t *blockheight) +fdl6_get_ubwc_blockwidth(const struct fdl_layout *layout, + uint32_t *blockwidth, uint32_t *blockheight) { static const struct { uint8_t width; @@ -58,6 +58,10 @@ fdl6_get_ubwc_blockwidth(struct fdl_layout *layout, uint32_t *blockwidth, *blockwidth = 16; *blockheight = 8; return; + } else if (layout->format == PIPE_FORMAT_Y8_UNORM) { + *blockwidth = 32; + *blockheight = 8; + return; } uint32_t cpp = fdl_cpp_shift(layout); @@ -107,6 +111,7 @@ fdl6_layout(struct fdl_layout *layout, enum pipe_format format, layout->width0 = width0; layout->height0 = height0; layout->depth0 = depth0; + layout->mip_levels = mip_levels; layout->cpp = util_format_get_blocksize(format); layout->cpp *= nr_samples; diff --git a/mesa 3D driver/src/freedreno/fdl/fd6_view.c b/mesa 3D driver/src/freedreno/fdl/fd6_view.c new file mode 100644 index 
0000000000..5806b1682d --- /dev/null +++ b/mesa 3D driver/src/freedreno/fdl/fd6_view.c @@ -0,0 +1,346 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * Copyright © 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include "freedreno_layout.h" +#include "fd6_format_table.h" + +static enum a6xx_tex_swiz +fdl6_swiz(unsigned char swiz) +{ + STATIC_ASSERT((unsigned) A6XX_TEX_X == (unsigned) PIPE_SWIZZLE_X); + STATIC_ASSERT((unsigned) A6XX_TEX_Y == (unsigned) PIPE_SWIZZLE_Y); + STATIC_ASSERT((unsigned) A6XX_TEX_Z == (unsigned) PIPE_SWIZZLE_Z); + STATIC_ASSERT((unsigned) A6XX_TEX_W == (unsigned) PIPE_SWIZZLE_W); + STATIC_ASSERT((unsigned) A6XX_TEX_ZERO == (unsigned) PIPE_SWIZZLE_0); + STATIC_ASSERT((unsigned) A6XX_TEX_ONE == (unsigned) PIPE_SWIZZLE_1); + return (enum a6xx_tex_swiz) swiz; +} + +static enum a6xx_tex_type +fdl6_tex_type(enum fdl_view_type type, bool storage) +{ + STATIC_ASSERT((unsigned) FDL_VIEW_TYPE_1D == (unsigned) A6XX_TEX_1D); + STATIC_ASSERT((unsigned) FDL_VIEW_TYPE_2D == (unsigned) A6XX_TEX_2D); + STATIC_ASSERT((unsigned) FDL_VIEW_TYPE_CUBE == (unsigned) A6XX_TEX_CUBE); + STATIC_ASSERT((unsigned) FDL_VIEW_TYPE_3D == (unsigned) A6XX_TEX_3D); + + return (storage && type == FDL_VIEW_TYPE_CUBE) ? + A6XX_TEX_2D : (enum a6xx_tex_type) type; +} + +static uint32_t +fdl6_texswiz(const struct fdl_view_args *args, bool has_z24uint_s8uint) +{ + unsigned char format_swiz[4] = + { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W }; + switch (args->format) { + case PIPE_FORMAT_R8G8_R8B8_UNORM: + case PIPE_FORMAT_G8R8_B8R8_UNORM: + case PIPE_FORMAT_R8_G8B8_420_UNORM: + case PIPE_FORMAT_R8_G8_B8_420_UNORM: + format_swiz[0] = PIPE_SWIZZLE_Z; + format_swiz[1] = PIPE_SWIZZLE_X; + format_swiz[2] = PIPE_SWIZZLE_Y; + break; + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_SRGB: + /* same hardware format is used for BC1_RGB / BC1_RGBA */ + format_swiz[3] = PIPE_SWIZZLE_1; + break; + case PIPE_FORMAT_X24S8_UINT: + if (!has_z24uint_s8uint) { + /* using FMT6_8_8_8_8_UINT, so need to pick out the W channel and + * swizzle (0,0,1) in the rest (see "Conversion to RGBA"). + */ + format_swiz[0] = PIPE_SWIZZLE_W; + format_swiz[1] = PIPE_SWIZZLE_0; + format_swiz[2] = PIPE_SWIZZLE_0; + format_swiz[3] = PIPE_SWIZZLE_1; + } else { + /* using FMT6_Z24_UINT_S8_UINT, which is (d, s, 0, 1), so need to + * swizzle away the d. 
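+ *
+ * (Illustrative example with assumed inputs, not from the original
+ * comment: format_swiz becomes (Y, 0, Z, W) here, so composing it with
+ * a user swizzle of (X, X, X, X) via util_format_compose_swizzles()
+ * below yields (Y, Y, Y, Y), i.e. every channel reads stencil.)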
+ */ + format_swiz[0] = PIPE_SWIZZLE_Y; + format_swiz[1] = PIPE_SWIZZLE_0; + } + break; + default: + break; + } + + unsigned char swiz[4]; + util_format_compose_swizzles(format_swiz, args->swiz, swiz); + + return A6XX_TEX_CONST_0_SWIZ_X(fdl6_swiz(swiz[0])) | + A6XX_TEX_CONST_0_SWIZ_Y(fdl6_swiz(swiz[1])) | + A6XX_TEX_CONST_0_SWIZ_Z(fdl6_swiz(swiz[2])) | + A6XX_TEX_CONST_0_SWIZ_W(fdl6_swiz(swiz[3])); +} + +#define COND(bool, val) ((bool) ? (val) : 0) + +void +fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts, + const struct fdl_view_args *args, bool has_z24uint_s8uint) +{ + const struct fdl_layout *layout = layouts[0]; + uint32_t width = u_minify(layout->width0, args->base_miplevel); + uint32_t height = u_minify(layout->height0, args->base_miplevel); + uint32_t storage_depth = args->layer_count; + if (args->type == FDL_VIEW_TYPE_3D) { + storage_depth = u_minify(layout->depth0, args->base_miplevel); + } + + uint32_t depth = storage_depth; + if (args->type == FDL_VIEW_TYPE_CUBE) { + /* Cubes are treated as 2D arrays for storage images, so only divide the + * depth by 6 for the texture descriptor. + */ + depth /= 6; + } + + uint64_t base_addr = args->iova + + fdl_surface_offset(layout, args->base_miplevel, args->base_array_layer); + uint64_t ubwc_addr = args->iova + + fdl_ubwc_offset(layout, args->base_miplevel, args->base_array_layer); + + uint32_t pitch = fdl_pitch(layout, args->base_miplevel); + uint32_t ubwc_pitch = fdl_ubwc_pitch(layout, args->base_miplevel); + uint32_t layer_size = fdl_layer_stride(layout, args->base_miplevel); + + enum a6xx_format texture_format = + fd6_texture_format(args->format, layout->tile_mode); + enum a3xx_color_swap swap = + fd6_texture_swap(args->format, layout->tile_mode); + enum a6xx_tile_mode tile_mode = fdl_tile_mode(layout, args->base_miplevel); + + bool ubwc_enabled = fdl_ubwc_enabled(layout, args->base_miplevel); + + bool is_d24s8 = (args->format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + args->format == PIPE_FORMAT_Z24X8_UNORM || + args->format == PIPE_FORMAT_X24S8_UINT); + + if (args->format == PIPE_FORMAT_X24S8_UINT && has_z24uint_s8uint) + texture_format = FMT6_Z24_UINT_S8_UINT; + + if (texture_format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !ubwc_enabled) + texture_format = FMT6_8_8_8_8_UNORM; + + enum a6xx_format storage_format = texture_format; + if (is_d24s8) { + if (ubwc_enabled) + storage_format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + else + storage_format = FMT6_8_8_8_8_UNORM; + } + + memset(view->descriptor, 0, sizeof(view->descriptor)); + + view->descriptor[0] = + A6XX_TEX_CONST_0_TILE_MODE(tile_mode) | + COND(util_format_is_srgb(args->format), A6XX_TEX_CONST_0_SRGB) | + A6XX_TEX_CONST_0_FMT(texture_format) | + A6XX_TEX_CONST_0_SAMPLES(util_logbase2(layout->nr_samples)) | + A6XX_TEX_CONST_0_SWAP(swap) | + fdl6_texswiz(args, has_z24uint_s8uint) | + A6XX_TEX_CONST_0_MIPLVLS(args->level_count - 1); + view->descriptor[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); + view->descriptor[2] = + A6XX_TEX_CONST_2_PITCHALIGN(layout->pitchalign - 6) | + A6XX_TEX_CONST_2_PITCH(pitch) | + A6XX_TEX_CONST_2_TYPE(fdl6_tex_type(args->type, false)); + view->descriptor[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(layer_size); + view->descriptor[4] = base_addr; + view->descriptor[5] = (base_addr >> 32) | A6XX_TEX_CONST_5_DEPTH(depth); + + if (layout->tile_all) + view->descriptor[3] |= A6XX_TEX_CONST_3_TILE_ALL; + + if (args->format == PIPE_FORMAT_R8_G8B8_420_UNORM || + args->format == PIPE_FORMAT_R8_G8_B8_420_UNORM) { + /* chroma 
offset re-uses MIPLVLS bits */ + assert(args->level_count == 1); + if (args->chroma_offsets[0] == FDL_CHROMA_LOCATION_MIDPOINT) + view->descriptor[0] |= A6XX_TEX_CONST_0_CHROMA_MIDPOINT_X; + if (args->chroma_offsets[1] == FDL_CHROMA_LOCATION_MIDPOINT) + view->descriptor[0] |= A6XX_TEX_CONST_0_CHROMA_MIDPOINT_Y; + + uint64_t base_addr[3]; + + view->descriptor[3] |= A6XX_TEX_CONST_3_TILE_ALL; + if (ubwc_enabled) { + view->descriptor[3] |= A6XX_TEX_CONST_3_FLAG; + /* no separate ubwc base, image must have the expected layout */ + for (uint32_t i = 0; i < 3; i++) { + base_addr[i] = args->iova + + fdl_ubwc_offset(layouts[i], args->base_miplevel, args->base_array_layer); + } + } else { + for (uint32_t i = 0; i < 3; i++) { + base_addr[i] = args->iova + + fdl_surface_offset(layouts[i], args->base_miplevel, args->base_array_layer); + } + } + + view->descriptor[4] = base_addr[0]; + view->descriptor[5] |= base_addr[0] >> 32; + view->descriptor[6] = + A6XX_TEX_CONST_6_PLANE_PITCH(fdl_pitch(layouts[1], args->base_miplevel)); + view->descriptor[7] = base_addr[1]; + view->descriptor[8] = base_addr[1] >> 32; + view->descriptor[9] = base_addr[2]; + view->descriptor[10] = base_addr[2] >> 32; + + assert(args->type != FDL_VIEW_TYPE_3D); + return; + } + + if (ubwc_enabled) { + uint32_t block_width, block_height; + fdl6_get_ubwc_blockwidth(layout, &block_width, &block_height); + + view->descriptor[3] |= A6XX_TEX_CONST_3_FLAG; + view->descriptor[7] = ubwc_addr; + view->descriptor[8] = ubwc_addr >> 32; + view->descriptor[9] |= A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(layout->ubwc_layer_size >> 2); + view->descriptor[10] |= + A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(ubwc_pitch) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGW(util_logbase2_ceil(DIV_ROUND_UP(width, block_width))) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGH(util_logbase2_ceil(DIV_ROUND_UP(height, block_height))); + } + + if (args->type == FDL_VIEW_TYPE_3D) { + view->descriptor[3] |= + A6XX_TEX_CONST_3_MIN_LAYERSZ(layout->slices[layout->mip_levels - 1].size0); + } + + bool samples_average = + layout->nr_samples > 1 && + !util_format_is_pure_integer(args->format) && + !util_format_is_depth_or_stencil(args->format); + + view->SP_PS_2D_SRC_INFO = + A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(storage_format) | + A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(tile_mode) | + A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(swap) | + COND(ubwc_enabled, A6XX_SP_PS_2D_SRC_INFO_FLAGS) | + COND(util_format_is_srgb(args->format), A6XX_SP_PS_2D_SRC_INFO_SRGB) | + A6XX_SP_PS_2D_SRC_INFO_SAMPLES(util_logbase2(layout->nr_samples)) | + COND(samples_average, A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | + A6XX_SP_PS_2D_SRC_INFO_UNK20 | + A6XX_SP_PS_2D_SRC_INFO_UNK22; + + view->SP_PS_2D_SRC_SIZE = + A6XX_SP_PS_2D_SRC_SIZE_WIDTH(width) | + A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(height); + + /* note: these have same encoding for MRT and 2D (except 2D PITCH src) */ + view->PITCH = A6XX_RB_DEPTH_BUFFER_PITCH(pitch); + view->FLAG_BUFFER_PITCH = + A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_PITCH(ubwc_pitch) | + A6XX_RB_DEPTH_FLAG_BUFFER_PITCH_ARRAY_PITCH(layout->ubwc_layer_size >> 2); + + view->base_addr = base_addr; + view->ubwc_addr = ubwc_addr; + view->layer_size = layer_size; + view->ubwc_layer_size = layout->ubwc_layer_size; + + enum a6xx_format color_format = + fd6_color_format(args->format, layout->tile_mode); + + /* Don't set fields that are only used for attachments/blit dest if COLOR + * is unsupported. 
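+ *
+ * (For illustration: in fd6_format_table.c every compressed format is
+ * declared with the _T_() macro, i.e. rb == FMT6_NONE, so e.g. an ASTC
+ * view keeps only the texture-side state set above and returns before
+ * the render-target/blit-dest fields below.)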
+ */ + if (color_format == FMT6_NONE) + return; + + enum a3xx_color_swap color_swap = + fd6_color_swap(args->format, layout->tile_mode); + + if (is_d24s8) + color_format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + + if (color_format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !ubwc_enabled) + color_format = FMT6_8_8_8_8_UNORM; + + memset(view->storage_descriptor, 0, sizeof(view->storage_descriptor)); + + view->storage_descriptor[0] = + A6XX_IBO_0_FMT(storage_format) | + A6XX_IBO_0_TILE_MODE(tile_mode); + view->storage_descriptor[1] = + A6XX_IBO_1_WIDTH(width) | + A6XX_IBO_1_HEIGHT(height); + view->storage_descriptor[2] = + A6XX_IBO_2_PITCH(pitch) | + A6XX_IBO_2_TYPE(fdl6_tex_type(args->type, true)); + view->storage_descriptor[3] = A6XX_IBO_3_ARRAY_PITCH(layer_size); + + view->storage_descriptor[4] = base_addr; + view->storage_descriptor[5] = (base_addr >> 32) | A6XX_IBO_5_DEPTH(storage_depth); + + if (ubwc_enabled) { + view->storage_descriptor[3] |= A6XX_IBO_3_FLAG | A6XX_IBO_3_UNK27; + view->storage_descriptor[7] |= ubwc_addr; + view->storage_descriptor[8] |= ubwc_addr >> 32; + view->storage_descriptor[9] = A6XX_IBO_9_FLAG_BUFFER_ARRAY_PITCH(layout->ubwc_layer_size >> 2); + view->storage_descriptor[10] = + A6XX_IBO_10_FLAG_BUFFER_PITCH(ubwc_pitch); + } + + view->width = width; + view->height = height; + view->need_y2_align = + tile_mode == TILE6_LINEAR && args->base_miplevel != layout->mip_levels - 1; + + view->ubwc_enabled = ubwc_enabled; + + view->RB_MRT_BUF_INFO = + A6XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | + A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(color_format) | + A6XX_RB_MRT_BUF_INFO_COLOR_SWAP(color_swap); + + view->SP_FS_MRT_REG = + A6XX_SP_FS_MRT_REG_COLOR_FORMAT(color_format) | + COND(util_format_is_pure_sint(args->format), A6XX_SP_FS_MRT_REG_COLOR_SINT) | + COND(util_format_is_pure_uint(args->format), A6XX_SP_FS_MRT_REG_COLOR_UINT); + + view->RB_2D_DST_INFO = + A6XX_RB_2D_DST_INFO_COLOR_FORMAT(color_format) | + A6XX_RB_2D_DST_INFO_TILE_MODE(tile_mode) | + A6XX_RB_2D_DST_INFO_COLOR_SWAP(color_swap) | + COND(ubwc_enabled, A6XX_RB_2D_DST_INFO_FLAGS) | + COND(util_format_is_srgb(args->format), A6XX_RB_2D_DST_INFO_SRGB); + + view->RB_BLIT_DST_INFO = + A6XX_RB_BLIT_DST_INFO_TILE_MODE(tile_mode) | + A6XX_RB_BLIT_DST_INFO_SAMPLES(util_logbase2(layout->nr_samples)) | + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(color_format) | + A6XX_RB_BLIT_DST_INFO_COLOR_SWAP(color_swap) | + COND(ubwc_enabled, A6XX_RB_BLIT_DST_INFO_FLAGS); +} diff --git a/mesa 3D driver/src/freedreno/fdl/freedreno_layout.h b/mesa 3D driver/src/freedreno/fdl/freedreno_layout.h index a2c81227f7..a53b9b9797 100644 --- a/mesa 3D driver/src/freedreno/fdl/freedreno_layout.h +++ b/mesa 3D driver/src/freedreno/fdl/freedreno_layout.h @@ -124,6 +124,7 @@ struct fdl_layout { uint8_t cpp_shift; uint32_t width0, height0, depth0; + uint32_t mip_levels; uint32_t nr_samples; enum pipe_format format; @@ -252,7 +253,68 @@ fdl_set_pitchalign(struct fdl_layout *layout, unsigned pitchalign) void fdl_dump_layout(struct fdl_layout *layout); -void fdl6_get_ubwc_blockwidth(struct fdl_layout *layout, uint32_t *blockwidth, - uint32_t *blockheight); +void fdl6_get_ubwc_blockwidth(const struct fdl_layout *layout, + uint32_t *blockwidth, uint32_t *blockheight); + +enum fdl_view_type { + FDL_VIEW_TYPE_1D = 0, + FDL_VIEW_TYPE_2D = 1, + FDL_VIEW_TYPE_CUBE = 2, + FDL_VIEW_TYPE_3D = 3, +}; + +enum fdl_chroma_location { + FDL_CHROMA_LOCATION_COSITED_EVEN = 0, + FDL_CHROMA_LOCATION_MIDPOINT = 1, +}; + +struct fdl_view_args { + uint64_t iova; + uint32_t 
base_array_layer, base_miplevel; + uint32_t layer_count, level_count; + unsigned char swiz[4]; + enum pipe_format format; + enum fdl_view_type type; + enum fdl_chroma_location chroma_offsets[2]; +}; + +#define FDL6_TEX_CONST_DWORDS 16 + +struct fdl6_view { + uint64_t base_addr; + uint64_t ubwc_addr; + uint32_t layer_size; + uint32_t ubwc_layer_size; + + uint32_t width, height; + bool need_y2_align; + + bool ubwc_enabled; + + uint32_t descriptor[FDL6_TEX_CONST_DWORDS]; + + /* Descriptor for use as a storage image as opposed to a sampled image. + * This has a few differences for cube maps (e.g. type). + */ + uint32_t storage_descriptor[FDL6_TEX_CONST_DWORDS]; + + /* pre-filled register values */ + uint32_t PITCH; + uint32_t FLAG_BUFFER_PITCH; + + uint32_t RB_MRT_BUF_INFO; + uint32_t SP_FS_MRT_REG; + + uint32_t SP_PS_2D_SRC_INFO; + uint32_t SP_PS_2D_SRC_SIZE; + + uint32_t RB_2D_DST_INFO; + + uint32_t RB_BLIT_DST_INFO; +}; + +void +fdl6_view_init(struct fdl6_view *view, const struct fdl_layout **layouts, + const struct fdl_view_args *args, bool has_z24uint_s8uint); #endif /* FREEDRENO_LAYOUT_H_ */ diff --git a/mesa 3D driver/src/freedreno/fdl/meson.build b/mesa 3D driver/src/freedreno/fdl/meson.build index 83becbbb33..479d3dcab1 100644 --- a/mesa 3D driver/src/freedreno/fdl/meson.build +++ b/mesa 3D driver/src/freedreno/fdl/meson.build @@ -22,8 +22,11 @@ libfreedreno_layout = static_library( 'freedreno_layout', [ 'fd5_layout.c', + 'fd6_format_table.c', 'fd6_layout.c', + 'fd6_view.c', 'freedreno_layout.c', + freedreno_xml_header_files, ], include_directories : [inc_freedreno, inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux], c_args : [no_override_init_args], diff --git a/mesa 3D driver/src/freedreno/ir3/disasm-a3xx.c b/mesa 3D driver/src/freedreno/ir3/disasm-a3xx.c index 4a0fb40bbd..89424b2b65 100644 --- a/mesa 3D driver/src/freedreno/ir3/disasm-a3xx.c +++ b/mesa 3D driver/src/freedreno/ir3/disasm-a3xx.c @@ -348,6 +348,9 @@ static const struct opc_info { OPC(6, OPC_GETSPID, getspid), OPC(6, OPC_GETWID, getwid), + OPC(6, OPC_SPILL_MACRO, spill.macro), + OPC(6, OPC_RELOAD_MACRO, reload.macro), + OPC(7, OPC_BAR, bar), OPC(7, OPC_FENCE, fence), /* clang-format on */ @@ -469,11 +472,15 @@ disasm_handle_last(struct disasm_ctx *ctx) } static void -disasm_instr_cb(void *d, unsigned n, uint64_t instr) +disasm_instr_cb(void *d, unsigned n, void *instr) { struct disasm_ctx *ctx = d; - uint32_t *dwords = (uint32_t *)&instr; - unsigned opc_cat = instr >> 61; + uint32_t *dwords = (uint32_t *)instr; + uint64_t val = dwords[1]; + val = val << 32; + val |= dwords[0]; + + unsigned opc_cat = val >> 61; /* There are some cases where we can get instr_cb called multiple * times per instruction (like when we need an extra line for branch @@ -492,9 +499,9 @@ disasm_instr_cb(void *d, unsigned n, uint64_t instr) * some hand-coded parsing: */ if (opc_cat == 1) { - unsigned opc = (instr >> 57) & 0x3; - unsigned src_type = (instr >> 50) & 0x7; - unsigned dst_type = (instr >> 46) & 0x7; + unsigned opc = (val >> 57) & 0x3; + unsigned src_type = (val >> 50) & 0x7; + unsigned dst_type = (val >> 46) & 0x7; if (opc == 0) { if (src_type == dst_type) { diff --git a/mesa 3D driver/src/freedreno/ir3/instr-a3xx.h b/mesa 3D driver/src/freedreno/ir3/instr-a3xx.h index 93162c103b..fca65c41df 100644 --- a/mesa 3D driver/src/freedreno/ir3/instr-a3xx.h +++ b/mesa 3D driver/src/freedreno/ir3/instr-a3xx.h @@ -308,6 +308,9 @@ typedef enum { OPC_LDG_A = _OPC(6, 55), OPC_STG_A = _OPC(6, 56), + OPC_SPILL_MACRO = _OPC(6, 
57), + OPC_RELOAD_MACRO = _OPC(6, 58), + /* category 7: */ OPC_BAR = _OPC(7, 0), OPC_FENCE = _OPC(7, 1), @@ -379,6 +382,32 @@ type_size(type_t type) } } +static inline type_t +type_uint_size(unsigned bit_size) +{ + switch (bit_size) { + case 8: return TYPE_U8; + case 1: /* 1b bools are treated as normal half-regs */ + case 16: return TYPE_U16; + case 32: return TYPE_U32; + default: + ir3_assert(0); /* invalid size */ + return 0; + } +} + +static inline type_t +type_float_size(unsigned bit_size) +{ + switch (bit_size) { + case 16: return TYPE_F16; + case 32: return TYPE_F32; + default: + ir3_assert(0); /* invalid size */ + return 0; + } +} + static inline int type_float(type_t type) { diff --git a/mesa 3D driver/src/freedreno/ir3/ir3.c b/mesa 3D driver/src/freedreno/ir3/ir3.c index 1491429c14..634732a6ac 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3.c @@ -125,6 +125,7 @@ ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count) } switch (v->type) { + case MESA_SHADER_KERNEL: case MESA_SHADER_COMPUTE: { unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2]; @@ -177,7 +178,8 @@ ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v, unsigned max_waves = compiler->max_waves; /* If this is a compute shader, compute the limit based on shared size */ - if (v->type == MESA_SHADER_COMPUTE) { + if ((v->type == MESA_SHADER_COMPUTE) || + (v->type == MESA_SHADER_KERNEL)) { /* Shared is allocated in chunks of 1k */ unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024); if (shared_per_wg > 0 && !v->local_size_variable) { @@ -262,11 +264,15 @@ ir3_collect_info(struct ir3_shader_variant *v) } if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) { - struct ir3_register *base = - (instr->opc == OPC_STP) ? 
instr->srcs[2] : instr->srcs[1]; - if (base->iim_val * type_size(instr->cat6.type) > 32) { + unsigned components = instr->srcs[2]->uim_val; + if (components * type_size(instr->cat6.type) > 32) { info->multi_dword_ldp_stp = true; } + + if (instr->opc == OPC_STP) + info->stp_count += components; + else + info->ldp_count += components; } if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI)) @@ -393,6 +399,22 @@ ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred) } } +void +ir3_block_remove_physical_predecessor(struct ir3_block *block, struct ir3_block *pred) +{ + for (unsigned i = 0; i < block->physical_predecessors_count; i++) { + if (block->physical_predecessors[i] == pred) { + if (i < block->physical_predecessors_count - 1) { + block->physical_predecessors[i] = + block->physical_predecessors[block->physical_predecessors_count - 1]; + } + + block->physical_predecessors_count--; + return; + } + } +} + unsigned ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred) { @@ -684,6 +706,9 @@ ir3_set_dst_type(struct ir3_instruction *instr, bool half) void ir3_fixup_src_type(struct ir3_instruction *instr) { + if (instr->srcs_count == 0) + return; + switch (opc_cat(instr->opc)) { case 1: /* move instructions */ if (instr->srcs[0]->flags & IR3_REG_HALF) { @@ -933,13 +958,18 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags) if (instr->opc == OPC_STG_A && (n == 4)) return false; + if (instr->opc == OPC_LDG && (n == 0)) + return false; + + if (instr->opc == OPC_LDG_A && (n < 2)) + return false; + /* as with atomics, these cat6 instrs can only have an immediate * for SSBO/IBO slot argument */ switch (instr->opc) { case OPC_LDIB: case OPC_STIB: - case OPC_LDC: case OPC_RESINFO: if (n != 0) return false; @@ -954,3 +984,39 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags) return true; } + +bool +ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed) +{ + if (instr->opc == OPC_MOV || is_meta(instr)) + return true; + + if (is_mem(instr)) { + switch (instr->opc) { + /* Some load/store instructions have a 13-bit offset and size which must + * always be an immediate and the rest of the sources cannot be + * immediates, so the frontend is responsible for checking the size: + */ + case OPC_LDL: + case OPC_STL: + case OPC_LDP: + case OPC_STP: + case OPC_LDG: + case OPC_STG: + case OPC_SPILL_MACRO: + case OPC_RELOAD_MACRO: + case OPC_LDG_A: + case OPC_STG_A: + case OPC_LDLW: + case OPC_STLW: + case OPC_LDLV: + return true; + default: + /* most cat6 src immediates can only encode 8 bits: */ + return !(immed & ~0xff); + } + } + + /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */ + return !(immed & ~0x1ff) || !(-immed & ~0x1ff); +} diff --git a/mesa 3D driver/src/freedreno/ir3/ir3.h b/mesa 3D driver/src/freedreno/ir3/ir3.h index 723eb6567e..07cc651b68 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3.h +++ b/mesa 3D driver/src/freedreno/ir3/ir3.h @@ -57,6 +57,8 @@ struct ir3_info { uint16_t nops_count; /* # of nop instructions, including nopN */ uint16_t mov_count; uint16_t cov_count; + uint16_t stp_count; + uint16_t ldp_count; /* NOTE: max_reg, etc, does not include registers not touched * by the shader (ie.
vertex fetched via VFD_DECODE but not * touched by shader) @@ -89,6 +91,7 @@ struct ir3_merge_set { uint16_t alignment; unsigned interval_start; + unsigned spill_slot; unsigned regs_count; struct ir3_register **regs; @@ -121,6 +124,8 @@ struct ir3_register { IR3_REG_BNOT = 0x400, /* (ei) flag, end-input? Set on last bary, presumably to signal * that the shader needs no more input: + * + * Note: Has different meaning on other instructions like add.s/u */ IR3_REG_EI = 0x2000, /* meta-flags, for intermediate stages of IR, ie. @@ -147,6 +152,8 @@ struct ir3_register { IR3_REG_UNUSED = 0x40000, } flags; + unsigned name; + /* used for cat5 instructions, but also for internal/IR level * tracking of what registers are read/written by an instruction. * wrmask may be a bad name since it is used to represent both @@ -168,7 +175,6 @@ struct ir3_register { * rN.x becomes: (N << 2) | x */ uint16_t num; - uint16_t name; union { /* immediate: */ int32_t iim_val; @@ -202,6 +208,8 @@ struct ir3_register { */ struct ir3_register *tied; + unsigned spill_slot, next_use; + unsigned merge_set_offset; struct ir3_merge_set *merge_set; unsigned interval_start, interval_end; @@ -332,7 +340,7 @@ struct ir3_instruction { * handled. */ int dst_offset; - int iim_val : 3; /* for ldgb/stgb, # of components */ + int iim_val; /* for ldgb/stgb, # of components */ unsigned d : 3; /* for ldc, component offset */ bool typed : 1; unsigned base : 3; @@ -378,9 +386,8 @@ struct ir3_instruction { } input; }; - /* When we get to the RA stage, we need instruction's position/name: */ - uint16_t ip; - uint16_t name; + /* For assigning jump offsets, we need instruction's position: */ + uint32_t ip; /* used for per-pass extra instruction data. * @@ -588,6 +595,7 @@ struct ir3_block { uint32_t dom_post_index; uint32_t loop_id; + uint32_t loop_depth; #ifdef DEBUG uint32_t serialno; @@ -615,6 +623,8 @@ void ir3_block_add_physical_predecessor(struct ir3_block *block, struct ir3_block *pred); void ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred); +void ir3_block_remove_physical_predecessor(struct ir3_block *block, + struct ir3_block *pred); unsigned ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred); @@ -708,6 +718,17 @@ ir3_instr_move_after(struct ir3_instruction *instr, list_add(&instr->node, &before->node); } +/** + * Move 'instr' to the beginning of the block: + */ +static inline void +ir3_instr_move_before_block(struct ir3_instruction *instr, + struct ir3_block *block) +{ + list_delinit(&instr->node); + list_add(&instr->node, &block->instr_list); +} + void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps); void ir3_set_dst_type(struct ir3_instruction *instr, bool half); @@ -717,6 +738,8 @@ int ir3_flut(struct ir3_register *src_reg); bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags); +bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed); + #include "util/set.h" #define foreach_ssa_use(__use, __instr) \ for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses; \ @@ -799,6 +822,8 @@ is_same_type_mov(struct ir3_instruction *instr) if (!is_same_type_reg(instr->dsts[0], instr->srcs[0])) return false; break; + case OPC_META_PHI: + return instr->srcs_count == 1; default: return false; } @@ -1126,6 +1151,15 @@ is_reg_special(const struct ir3_register *reg) (reg_num(reg) == REG_P0); } +/* Same as above but in cases where we don't have a register. r48.x and above + * are shared/special. 
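+ *
+ * (Worked example, added for illustration: register numbers encode the
+ * component in the low two bits, rN.c == (N << 2) | c, so r48.x encodes
+ * to 48 * 4 == 192 and any num >= 192 is shared/special.)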
+ */ +static inline bool +is_reg_num_special(unsigned num) +{ + return num >= 48 * 4; +} + /* returns defining instruction for reg */ /* TODO better name */ static inline struct ir3_instruction * @@ -1166,6 +1200,9 @@ half_type(type_t type) case TYPE_U16: case TYPE_S16: return type; + case TYPE_U8: + case TYPE_S8: + return type; default: assert(0); return ~0; @@ -1178,8 +1215,10 @@ full_type(type_t type) switch (type) { case TYPE_F16: return TYPE_F32; + case TYPE_U8: case TYPE_U16: return TYPE_U32; + case TYPE_S8: case TYPE_S16: return TYPE_S32; case TYPE_F32: @@ -1422,7 +1461,7 @@ ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type) return TYPE_F32; default: - return (instr->dsts[1]->flags & IR3_REG_HALF) ? half_type(base_type) + return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type) : full_type(base_type); } } @@ -1581,6 +1620,9 @@ void ir3_validate(struct ir3 *ir); void ir3_print(struct ir3 *ir); void ir3_print_instr(struct ir3_instruction *instr); +struct log_stream; +void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr); + /* delay calculation: */ int ir3_delayslots(struct ir3_instruction *assigner, struct ir3_instruction *consumer, unsigned n, bool soft); @@ -1593,6 +1635,9 @@ unsigned ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr, bool mergedregs); void ir3_remove_nops(struct ir3 *ir); +/* unreachable block elimination: */ +bool ir3_remove_unreachable(struct ir3 *ir); + /* dead code elimination: */ struct ir3_shader_variant; bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so); @@ -2147,6 +2192,9 @@ INSTR3F(G, ATOMIC_OR) INSTR3F(G, ATOMIC_XOR) #elif GPU >= 400 INSTR3(LDGB) +#if GPU >= 500 +INSTR3(LDIB) +#endif INSTR4NODST(STGB) INSTR4NODST(STIB) INSTR4F(G, ATOMIC_ADD) @@ -2167,7 +2215,112 @@ INSTR0(BAR) INSTR0(FENCE) /* ************************************************************************* */ -#include "regmask.h" +#include "bitset.h" + +#define MAX_REG 256 + +typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG); + +typedef struct { + bool mergedregs; + regmaskstate_t mask; +} regmask_t; + +static inline bool +__regmask_get(regmask_t *regmask, bool half, unsigned n) +{ + if (regmask->mergedregs) { + /* a6xx+ case, with merged register file, we track things in terms + * of half-precision registers, with a full precision register + * using two half-precision slots. + * + * Pretend that special regs (a0.x, a1.x, etc.) are full registers to + * avoid having them alias normal full regs.
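+ *
+ * (Illustrative example, not part of the original comment: a full
+ * register n occupies half-slots 2n and 2n+1 while a half register n
+ * occupies slot n, so setting full r0.x (n == 0) marks slots 0 and 1,
+ * and a later __regmask_get() of hr0.x or hr0.y reports a hit.)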
+ */ + if (half && !is_reg_num_special(n)) { + return BITSET_TEST(regmask->mask, n); + } else { + n *= 2; + return BITSET_TEST(regmask->mask, n) || + BITSET_TEST(regmask->mask, n + 1); + } + } else { + /* pre a6xx case, with separate register file for half and full + * precision: + */ + if (half) + n += MAX_REG; + return BITSET_TEST(regmask->mask, n); + } +} + +static inline void +__regmask_set(regmask_t *regmask, bool half, unsigned n) +{ + if (regmask->mergedregs) { + /* a6xx+ case, with merged register file, we track things in terms + * of half-precision registers, with a full precision register + * using two half-precision slots: + */ + if (half && !is_reg_num_special(n)) { + BITSET_SET(regmask->mask, n); + } else { + n *= 2; + BITSET_SET(regmask->mask, n); + BITSET_SET(regmask->mask, n + 1); + } + } else { + /* pre a6xx case, with separate register file for half and full + * precision: + */ + if (half) + n += MAX_REG; + BITSET_SET(regmask->mask, n); + } +} + +static inline void +__regmask_clear(regmask_t *regmask, bool half, unsigned n) +{ + if (regmask->mergedregs) { + /* a6xx+ case, with merged register file, we track things in terms + * of half-precision registers, with a full precision register + * using two half-precision slots: + */ + if (half && !is_reg_num_special(n)) { + BITSET_CLEAR(regmask->mask, n); + } else { + n *= 2; + BITSET_CLEAR(regmask->mask, n); + BITSET_CLEAR(regmask->mask, n + 1); + } + } else { + /* pre a6xx case, with separate register file for half and full + * precision: + */ + if (half) + n += MAX_REG; + BITSET_CLEAR(regmask->mask, n); + } +} + +static inline void +regmask_init(regmask_t *regmask, bool mergedregs) +{ + memset(&regmask->mask, 0, sizeof(regmask->mask)); + regmask->mergedregs = mergedregs; +} + +static inline void +regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b) +{ + assert(dst->mergedregs == a->mergedregs); + assert(dst->mergedregs == b->mergedregs); + + for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++) + dst->mask[i] = a->mask[i] | b->mask[i]; +} + static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg) diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_a4xx.c b/mesa 3D driver/src/freedreno/ir3/ir3_a4xx.c index ea93285bc7..2339fa3d67 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_a4xx.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_a4xx.c @@ -24,7 +24,8 @@ * Rob Clark */ -#define GPU 400 +/* 500 gets us LDIB but doesn't change any other a4xx instructions */ +#define GPU 500 #include "ir3_context.h" #include "ir3_image.h" @@ -47,7 +48,7 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, offset = ir3_get_src(ctx, &intr->src[2])[0]; /* src0 is uvec2(offset*4, 0), src1 is offset.. nir already *= 4: */ - src0 = ir3_collect(ctx, byte_offset, create_immed(b, 0)); + src0 = ir3_collect(b, byte_offset, create_immed(b, 0)); src1 = offset; ldgb = ir3_LDGB(b, ssbo, 0, src0, 0, src1, 0); @@ -80,9 +81,9 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* src0 is value, src1 is offset, src2 is uvec2(offset*4, 0)..
* nir already *= 4: */ - src0 = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp); + src0 = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); src1 = offset; - src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0)); + src2 = ir3_collect(b, byte_offset, create_immed(b, 0)); stgb = ir3_STGB(b, ssbo, 0, src0, 0, src1, 0, src2, 0); stgb->cat6.iim_val = ncomp; @@ -105,70 +106,69 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) * sources represent: * * 0: The SSBO buffer index. - * 1: The offset into the SSBO buffer of the variable that the atomic + * 1: The byte offset into the SSBO buffer of the variable that the atomic * operation will operate on. * 2: The data parameter to the atomic function (i.e. the value to add * in ssbo_atomic_add, etc). - * 3: For CompSwap only: the second data parameter. + * 3: CompSwap: the second data parameter. + * Non-CompSwap: The dword offset into the SSBO buffer variable. + * 4: CompSwap: The dword offset into the SSBO buffer variable. + * + * We use custom ssbo_*_ir3 intrinsics generated by ir3_nir_lower_io_offsets() + * so we can have the dword offset generated in NIR. */ static struct ir3_instruction * emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) { struct ir3_block *b = ctx->block; - struct ir3_instruction *atomic, *ssbo, *src0, *src1, *src2, *byte_offset, - *offset; + struct ir3_instruction *atomic; type_t type = TYPE_U32; - ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]); + struct ir3_instruction *ssbo = ir3_ssbo_to_ibo(ctx, intr->src[0]); - byte_offset = ir3_get_src(ctx, &intr->src[1])[0]; - offset = ir3_get_src(ctx, &intr->src[3])[0]; - - /* src0 is data (or uvec2(data, compare)) - * src1 is offset - * src2 is uvec2(offset*4, 0) (appears to be 64b byte offset) - * - * Note that nir already multiplies the offset by four - */ - src0 = ir3_get_src(ctx, &intr->src[2])[0]; - src1 = offset; - src2 = ir3_collect(ctx, byte_offset, create_immed(b, 0)); + struct ir3_instruction *data = ir3_get_src(ctx, &intr->src[2])[0]; + /* 64b byte offset */ + struct ir3_instruction *byte_offset = + ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0], create_immed(b, 0)); + /* dword offset for everything but comp_swap */ + struct ir3_instruction *src3 = ir3_get_src(ctx, &intr->src[3])[0]; switch (intr->intrinsic) { case nir_intrinsic_ssbo_atomic_add_ir3: - atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_ADD_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); break; case nir_intrinsic_ssbo_atomic_imin_ir3: - atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); type = TYPE_S32; break; case nir_intrinsic_ssbo_atomic_umin_ir3: - atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_MIN_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); break; case nir_intrinsic_ssbo_atomic_imax_ir3: - atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); type = TYPE_S32; break; case nir_intrinsic_ssbo_atomic_umax_ir3: - atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_MAX_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); break; case nir_intrinsic_ssbo_atomic_and_ir3: - atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_AND_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); break; case 
nir_intrinsic_ssbo_atomic_or_ir3: - atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_OR_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); break; case nir_intrinsic_ssbo_atomic_xor_ir3: - atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_XOR_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); break; case nir_intrinsic_ssbo_atomic_exchange_ir3: - atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + atomic = ir3_ATOMIC_XCHG_G(b, ssbo, 0, data, 0, src3, 0, byte_offset, 0); break; case nir_intrinsic_ssbo_atomic_comp_swap_ir3: /* for cmpxchg, src0 is [ui]vec2(data, compare): */ - src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[3])[0], src0); - src1 = ir3_get_src(ctx, &intr->src[4])[0]; - atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, src0, 0, src1, 0, src2, 0); + data = ir3_collect(b, src3, data); + struct ir3_instruction *dword_offset = ir3_get_src(ctx, &intr->src[4])[0]; + atomic = ir3_ATOMIC_CMPXCHG_G(b, ssbo, 0, data, 0, dword_offset, 0, + byte_offset, 0); break; default: unreachable("boo"); @@ -225,7 +225,33 @@ get_image_offset(struct ir3_context *ctx, const nir_intrinsic_instr *instr, offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0); } - return ir3_collect(ctx, offset, create_immed(b, 0)); + return ir3_collect(b, offset, create_immed(b, 0)); +} + +/* src[] = { deref, coord, sample_index }. const_index[] = {} */ +static void +emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_block *b = ctx->block; + struct ir3_instruction *const *coords = ir3_get_src(ctx, &intr->src[1]); + struct ir3_instruction *ibo = ir3_image_to_ibo(ctx, intr->src[0]); + struct ir3_instruction *offset = get_image_offset(ctx, intr, coords, true); + unsigned ncoords = ir3_get_image_coords(intr, NULL); + unsigned ncomp = + ir3_get_num_components_for_image_format(nir_intrinsic_format(intr)); + + struct ir3_instruction *ldib = ir3_LDIB( + b, ibo, 0, offset, 0, ir3_create_collect(b, coords, ncoords), 0); + ldib->dsts[0]->wrmask = MASK(intr->num_components); + ldib->cat6.iim_val = ncomp; + ldib->cat6.d = ncoords; + ldib->cat6.type = ir3_get_type_for_image_intrinsic(intr); + ldib->cat6.typed = true; + ldib->barrier_class = IR3_BARRIER_IMAGE_R; + ldib->barrier_conflict = IR3_BARRIER_IMAGE_W; + + ir3_split_dest(b, dst, ldib, 0, intr->num_components); } /* src[] = { index, coord, sample_index, value }. const_index[] = {} */ @@ -253,8 +279,8 @@ emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) * one over the other in various cases. 
*/ - stib = ir3_STIB(b, ibo, 0, ir3_create_collect(ctx, value, ncomp), 0, - ir3_create_collect(ctx, coords, ncoords), 0, offset, 0); + stib = ir3_STIB(b, ibo, 0, ir3_create_collect(b, value, ncomp), 0, + ir3_create_collect(b, coords, ncoords), 0, offset, 0); stib->cat6.iim_val = ncomp; stib->cat6.d = ncoords; stib->cat6.type = ir3_get_type_for_image_intrinsic(intr); @@ -280,7 +306,7 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) * src2 is 64b byte offset */ src0 = ir3_get_src(ctx, &intr->src[3])[0]; - src1 = ir3_create_collect(ctx, coords, ncoords); + src1 = ir3_create_collect(b, coords, ncoords); src2 = get_image_offset(ctx, intr, coords, false); switch (intr->intrinsic) { @@ -309,7 +335,7 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_image_atomic_comp_swap: /* for cmpxchg, src0 is [ui]vec2(data, compare): */ - src0 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[4])[0], src0); + src0 = ir3_collect(b, ir3_get_src(ctx, &intr->src[4])[0], src0); atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0); break; default: @@ -333,6 +359,7 @@ const struct ir3_context_funcs ir3_a4xx_funcs = { .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo, .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo, .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo, + .emit_intrinsic_load_image = emit_intrinsic_load_image, .emit_intrinsic_store_image = emit_intrinsic_store_image, .emit_intrinsic_atomic_image = emit_intrinsic_atomic_image, .emit_intrinsic_image_size = emit_intrinsic_image_size_tex, diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_a6xx.c b/mesa 3D driver/src/freedreno/ir3/ir3_a6xx.c index aca14fb14a..04efc0adfe 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_a6xx.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_a6xx.c @@ -74,7 +74,7 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* src0 is offset, src1 is value: */ - val = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp); + val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); offset = ir3_get_src(ctx, &intr->src[3])[0]; stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, val, 0); @@ -136,10 +136,10 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) if (intr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap_ir3) { src0 = ir3_get_src(ctx, &intr->src[4])[0]; struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[3])[0]; - src1 = ir3_collect(ctx, dummy, compare, data); + src1 = ir3_collect(b, dummy, compare, data); } else { src0 = ir3_get_src(ctx, &intr->src[3])[0]; - src1 = ir3_collect(ctx, dummy, data); + src1 = ir3_collect(b, dummy, data); } switch (intr->intrinsic) { @@ -207,7 +207,7 @@ emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, unsigned ncoords = ir3_get_image_coords(intr, NULL); ldib = ir3_LDIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0, - ir3_create_collect(ctx, coords, ncoords), 0); + ir3_create_collect(b, coords, ncoords), 0); ldib->dsts[0]->wrmask = MASK(intr->num_components); ldib->cat6.iim_val = intr->num_components; ldib->cat6.d = ncoords; @@ -236,8 +236,8 @@ emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* src0 is offset, src1 is value: */ stib = ir3_STIB(b, ir3_image_to_ibo(ctx, intr->src[0]), 0, - ir3_create_collect(ctx, coords, ncoords), 0, - ir3_create_collect(ctx, value, ncomp), 0); + ir3_create_collect(b, coords, ncoords), 0, + 
ir3_create_collect(b, value, ncomp), 0); stib->cat6.iim_val = ncomp; stib->cat6.d = ncoords; stib->cat6.type = ir3_get_type_for_image_intrinsic(intr); @@ -275,14 +275,14 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) * register) and then immediately extract the first component. */ dummy = create_immed(b, 0); - src0 = ir3_create_collect(ctx, coords, ncoords); + src0 = ir3_create_collect(b, coords, ncoords); if (intr->intrinsic == nir_intrinsic_image_atomic_comp_swap || intr->intrinsic == nir_intrinsic_bindless_image_atomic_comp_swap) { struct ir3_instruction *compare = ir3_get_src(ctx, &intr->src[4])[0]; - src1 = ir3_collect(ctx, dummy, compare, value); + src1 = ir3_collect(b, dummy, compare, value); } else { - src1 = ir3_collect(ctx, dummy, value); + src1 = ir3_collect(b, dummy, value); } switch (intr->intrinsic) { @@ -359,6 +359,7 @@ emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, compile_assert(ctx, intr->num_components <= 3); resinfo->dsts[0]->wrmask = MASK(3); ir3_handle_bindless_cat6(resinfo, intr->src[0]); + ir3_handle_nonuniform(resinfo, intr); ir3_split_dest(b, dst, resinfo, 0, intr->num_components); } @@ -372,15 +373,26 @@ emit_intrinsic_load_global_ir3(struct ir3_context *ctx, unsigned dest_components = nir_intrinsic_dest_components(intr); struct ir3_instruction *addr, *offset; - addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[0])[0], + addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[0])[0], ir3_get_src(ctx, &intr->src[0])[1]); - offset = ir3_get_src(ctx, &intr->src[1])[0]; + struct ir3_instruction *load; - struct ir3_instruction *load = - ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0, - create_immed(b, 0), 0, create_immed(b, dest_components), 0); - load->cat6.type = TYPE_U32; + bool const_offset_in_bounds = nir_src_is_const(intr->src[1]) && + nir_src_as_int(intr->src[1]) < (1 << 13) && + nir_src_as_int(intr->src[1]) > -(1 << 13); + + if (const_offset_in_bounds) { + load = ir3_LDG(b, addr, 0, create_immed(b, nir_src_as_int(intr->src[1])), + 0, create_immed(b, dest_components), 0); + } else { + offset = ir3_get_src(ctx, &intr->src[1])[0]; + load = + ir3_LDG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0, + create_immed(b, 0), 0, create_immed(b, dest_components), 0); + } + + load->cat6.type = type_uint_size(intr->dest.ssa.bit_size); load->dsts[0]->wrmask = MASK(dest_components); load->barrier_class = IR3_BARRIER_BUFFER_R; @@ -397,17 +409,30 @@ emit_intrinsic_store_global_ir3(struct ir3_context *ctx, struct ir3_instruction *value, *addr, *offset; unsigned ncomp = nir_intrinsic_src_components(intr, 0); - addr = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[1])[0], + addr = ir3_collect(b, ir3_get_src(ctx, &intr->src[1])[0], ir3_get_src(ctx, &intr->src[1])[1]); - offset = ir3_get_src(ctx, &intr->src[2])[0]; + value = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); - value = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), ncomp); + struct ir3_instruction *stg; - struct ir3_instruction *stg = - ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0, - create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0); - stg->cat6.type = TYPE_U32; + bool const_offset_in_bounds = nir_src_is_const(intr->src[2]) && + nir_src_as_int(intr->src[2]) < (1 << 13) && + nir_src_as_int(intr->src[2]) > -(1 << 13); + + if (const_offset_in_bounds) { + stg = ir3_STG(b, addr, 0, + create_immed(b, nir_src_as_int(intr->src[2])), 0, + value, 0, + create_immed(b, ncomp), 0); + } else { + offset = ir3_get_src(ctx, 
&intr->src[2])[0];
+      stg =
+         ir3_STG_A(b, addr, 0, offset, 0, create_immed(b, 0), 0,
+                   create_immed(b, 0), 0, value, 0, create_immed(b, ncomp), 0);
+   }
+
+   stg->cat6.type = type_uint_size(intr->src[0].ssa->bit_size);
    stg->cat6.iim_val = 1;

    array_insert(b, b->keeps, stg);
diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_assembler.c b/mesa 3D driver/src/freedreno/ir3/ir3_assembler.c
index dd46f882be..f4777c3257 100644
--- a/mesa 3D driver/src/freedreno/ir3/ir3_assembler.c
+++ b/mesa 3D driver/src/freedreno/ir3/ir3_assembler.c
@@ -42,6 +42,9 @@ ir3_parse_asm(struct ir3_compiler *c, struct ir3_kernel_info *info, FILE *in)
    v->shader = shader;
    v->const_state = rzalloc_size(v, sizeof(*v->const_state));

+   if (c->gen >= 6)
+      v->mergedregs = true;
+
    shader->variants = v;
    shader->variant_count = 1;
diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_assembler.h b/mesa 3D driver/src/freedreno/ir3/ir3_assembler.h
index 3547e74f4e..5ff28242c8 100644
--- a/mesa 3D driver/src/freedreno/ir3/ir3_assembler.h
+++ b/mesa 3D driver/src/freedreno/ir3/ir3_assembler.h
@@ -34,8 +34,9 @@ struct ir3_kernel_info {
    uint32_t buf_sizes[MAX_BUFS]; /* size in dwords */
    uint32_t buf_addr_regs[MAX_BUFS];

-   /* driver-param uniforms: */
+   /* driver-param / replaced uniforms: */
    unsigned numwg;
+   unsigned wgid;
 };

 struct ir3_shader;
diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_cf.c b/mesa 3D driver/src/freedreno/ir3/ir3_cf.c
index dc05738985..4f8d85f429 100644
--- a/mesa 3D driver/src/freedreno/ir3/ir3_cf.c
+++ b/mesa 3D driver/src/freedreno/ir3/ir3_cf.c
@@ -38,6 +38,13 @@ is_safe_conv(struct ir3_instruction *instr, type_t src_type, opc_t *src_opc)
        full_type(instr->cat1.src_type) != full_type(instr->cat1.dst_type))
       return false;

+   /* mul.s24/u24 always returns a 32b result regardless of its sources' size,
+    * hence we cannot guarantee the high 16b of dst being zero or sign extended.
+    */
+   if ((*src_opc == OPC_MUL_S24 || *src_opc == OPC_MUL_U24) &&
+       type_size(instr->cat1.src_type) == 16)
+      return false;
+
    struct ir3_register *dst = instr->dsts[0];
    struct ir3_register *src = instr->srcs[0];
diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_compiler.c b/mesa 3D driver/src/freedreno/ir3/ir3_compiler.c
index 29a59052f9..bc51e383e6 100644
--- a/mesa 3D driver/src/freedreno/ir3/ir3_compiler.c
+++ b/mesa 3D driver/src/freedreno/ir3/ir3_compiler.c
@@ -44,6 +44,7 @@ static const struct debug_named_value shader_debug_options[] = {
    {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"},
    {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"},
    {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"},
+   {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"},
 #ifdef DEBUG
    /* DEBUG-only options: */
    {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"},
@@ -125,6 +126,9 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,

       compiler->tess_use_shared =
          fd_dev_info(compiler->dev_id)->a6xx.tess_use_shared;
+
+      compiler->storage_16bit =
+         fd_dev_info(compiler->dev_id)->a6xx.storage_16bit;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
@@ -181,6 +185,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->const_upload_unit = 8;
    }

+   compiler->bool_type = (compiler->gen >= 5) ?
TYPE_U16 : TYPE_U32; + ir3_disk_cache_init(compiler); return compiler; diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_compiler.h b/mesa 3D driver/src/freedreno/ir3/ir3_compiler.h index cf2fe2ad22..0071539649 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_compiler.h +++ b/mesa 3D driver/src/freedreno/ir3/ir3_compiler.h @@ -156,6 +156,12 @@ struct ir3_compiler { /* Whether private memory is supported */ bool has_pvtmem; + + /* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */ + bool storage_16bit; + + /* Type to use for 1b nir bools: */ + type_t bool_type; }; void ir3_compiler_destroy(struct ir3_compiler *compiler); @@ -194,6 +200,7 @@ enum ir3_shader_debug { IR3_DBG_NOUBOOPT = BITFIELD_BIT(9), IR3_DBG_NOFP16 = BITFIELD_BIT(10), IR3_DBG_NOCACHE = BITFIELD_BIT(11), + IR3_DBG_SPILLALL = BITFIELD_BIT(12), /* DEBUG-only options: */ IR3_DBG_SCHEDMSGS = BITFIELD_BIT(20), @@ -224,6 +231,7 @@ shader_debug_enabled(gl_shader_stage type) case MESA_SHADER_FRAGMENT: return !!(ir3_shader_debug & IR3_DBG_SHADER_FS); case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: return !!(ir3_shader_debug & IR3_DBG_SHADER_CS); default: debug_assert(0); diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_compiler_nir.c b/mesa 3D driver/src/freedreno/ir3/ir3_compiler_nir.c index b634eb50e9..56108ef90d 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_compiler_nir.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_compiler_nir.c @@ -194,7 +194,7 @@ create_cov(struct ir3_context *ctx, struct ir3_instruction *src, case nir_op_b2i8: case nir_op_b2i16: case nir_op_b2i32: - src_type = TYPE_U32; + src_type = ctx->compiler->bool_type; break; default: @@ -293,8 +293,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) unsigned bs[info->num_inputs]; /* bit size */ struct ir3_block *b = ctx->block; unsigned dst_sz, wrmask; - type_t dst_type = - nir_dest_bit_size(alu->dest.dest) == 16 ? TYPE_U16 : TYPE_U32; + type_t dst_type = type_uint_size(nir_dest_bit_size(alu->dest.dest)); if (alu->dest.dest.is_ssa) { dst_sz = alu->dest.dest.ssa.num_components; @@ -311,7 +310,8 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) * order into each writemask channel. */ if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) || - (alu->op == nir_op_vec4)) { + (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) || + (alu->op == nir_op_vec16)) { for (int i = 0; i < info->num_inputs; i++) { nir_alu_src *asrc = &alu->src[i]; @@ -398,7 +398,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) case nir_op_f2b1: dst[0] = ir3_CMPS_F( b, src[0], 0, - create_immed_typed(b, 0, bs[0] == 16 ? TYPE_F16 : TYPE_F32), 0); + create_immed_typed(b, 0, type_float_size(bs[0])), 0); dst[0]->cat2.condition = IR3_COND_NE; break; @@ -408,7 +408,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) */ dst[0] = ir3_CMPS_S( b, src[0], 0, - create_immed_typed(b, 0, bs[0] == 16 ? 
TYPE_U16 : TYPE_U32), 0); + create_immed_typed(b, 0, type_uint_size(bs[0])), 0); dst[0]->cat2.condition = IR3_COND_NE; break; @@ -553,6 +553,14 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) case nir_op_iadd: dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0); break; + case nir_op_ihadd: + dst[0] = ir3_ADD_S(b, src[0], 0, src[1], 0); + dst[0]->dsts[0]->flags |= IR3_REG_EI; + break; + case nir_op_uhadd: + dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0); + dst[0]->dsts[0]->flags |= IR3_REG_EI; + break; case nir_op_iand: dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0); break; @@ -589,7 +597,9 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) break; case nir_op_inot: if (bs[0] == 1) { - dst[0] = ir3_SUB_U(b, create_immed(ctx->block, 1), 0, src[0], 0); + struct ir3_instruction *one = + create_immed_typed(ctx->block, 1, ctx->compiler->bool_type); + dst[0] = ir3_SUB_U(b, one, 0, src[0], 0); } else { dst[0] = ir3_NOT_B(b, src[0], 0); } @@ -654,24 +664,34 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) } compile_assert(ctx, bs[1] == bs[2]); + /* The condition's size has to match the other two arguments' size, so * convert down if necessary. + * + * Single hashtable is fine, because the conversion will either be + * 16->32 or 32->16, but never both */ - if (bs[1] == 16) { + if (is_half(src[1]) != is_half(cond)) { struct hash_entry *prev_entry = _mesa_hash_table_search(ctx->sel_cond_conversions, src[0]); if (prev_entry) { cond = prev_entry->data; } else { - cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16); + if (is_half(cond)) { + cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32); + } else { + cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16); + } _mesa_hash_table_insert(ctx->sel_cond_conversions, src[0], cond); } } - if (bs[1] != 16) - dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0); - else + if (is_half(src[1])) { dst[0] = ir3_SEL_B16(b, src[1], 0, cond, 0, src[2], 0); + } else { + dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0); + } + break; } case nir_op_bit_count: { @@ -838,7 +858,7 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, carry->cat2.condition = IR3_COND_LT; base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0); - addr = ir3_collect(ctx, addr, base_hi); + addr = ir3_collect(b, addr, base_hi); } for (int i = 0; i < intr->num_components; i++) { @@ -850,6 +870,41 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, } } +/* Load a kernel param: src[] = { address }. */ +static void +emit_intrinsic_load_kernel_input(struct ir3_context *ctx, + nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + const struct ir3_const_state *const_state = ir3_const_state(ctx->so); + struct ir3_block *b = ctx->block; + unsigned offset = nir_intrinsic_base(intr); + unsigned p = regid(const_state->offsets.kernel_params, 0); + + struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0]; + + if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) { + offset += src0->srcs[0]->iim_val; + + /* kernel param position is in bytes, but constant space is 32b registers: */ + compile_assert(ctx, !(offset & 0x3)); + + dst[0] = create_uniform(b, p + (offset / 4)); + } else { + /* kernel param position is in bytes, but constant space is 32b registers: */ + compile_assert(ctx, !(offset & 0x3)); + + /* TODO we should probably be lowering this in nir, and also handling + * non-32b inputs.. Also we probably don't want to be using + * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders.. 
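+    *
+    * For now the byte address in src0 is converted to a 32b constant-reg
+    * index below with a shift-right by 2, and then used as an a0.x
+    * relative offset into constant space.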
+ */ + src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0); + + dst[0] = create_uniform_indirect(b, offset / 4, TYPE_U32, + ir3_get_addr0(ctx, src0, 1)); + } +} + /* src[] = { block_index } */ static void emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, @@ -865,6 +920,7 @@ emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, /* resinfo has no writemask and always writes out 3 components */ resinfo->dsts[0]->wrmask = MASK(3); ir3_handle_bindless_cat6(resinfo, intr->src[0]); + ir3_handle_nonuniform(resinfo, intr); if (ctx->compiler->gen >= 6) { ir3_split_dest(b, dst, resinfo, 0, 1); @@ -918,7 +974,7 @@ emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) assert(wrmask == BITFIELD_MASK(intr->num_components)); - stl = ir3_STL(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0, + stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0, create_immed(b, ncomp), 0); stl->cat6.dst_offset = base; stl->cat6.type = utype_src(intr->src[0]); @@ -970,7 +1026,7 @@ emit_intrinsic_store_shared_ir3(struct ir3_context *ctx, offset = ir3_get_src(ctx, &intr->src[1])[0]; store = ir3_STLW(b, offset, 0, - ir3_create_collect(ctx, value, intr->num_components), 0, + ir3_create_collect(b, value, intr->num_components), 0, create_immed(b, intr->num_components), 0); /* for a650, use STL for vertex outputs used by tess ctrl shader: */ @@ -1044,7 +1100,7 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) break; case nir_intrinsic_shared_atomic_comp_swap: /* for cmpxchg, src1 is [ui]vec2(data, compare): */ - src1 = ir3_collect(ctx, ir3_get_src(ctx, &intr->src[2])[0], src1); + src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1); atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0); break; default: @@ -1063,6 +1119,28 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr) return atomic; } +static void +stp_ldp_offset(struct ir3_context *ctx, nir_src *src, + struct ir3_instruction **offset, int32_t *base) +{ + struct ir3_block *b = ctx->block; + + if (nir_src_is_const(*src)) { + unsigned src_offset = nir_src_as_uint(*src); + /* The base offset field is only 13 bits, and it's signed. Try to make the + * offset constant whenever the original offsets are similar, to avoid + * creating too many constants in the final shader. + */ + *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13); + uint32_t offset_val = src_offset - *base; + *offset = create_immed(b, offset_val); + } else { + /* TODO: match on nir_iadd with a constant that fits */ + *base = 0; + *offset = ir3_get_src(ctx, src)[0]; + } +} + /* src[] = { offset }. 
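+ * A constant offset is folded by stp_ldp_offset() above into the 13b
+ * signed base field; e.g. a constant offset of 16 becomes base 16 with
+ * an immediate zero register offset.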
*/ static void emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr, @@ -1070,10 +1148,11 @@ emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr, { struct ir3_block *b = ctx->block; struct ir3_instruction *ldp, *offset; + int32_t base; - offset = ir3_get_src(ctx, &intr->src[0])[0]; + stp_ldp_offset(ctx, &intr->src[0], &offset, &base); - ldp = ir3_LDP(b, offset, 0, create_immed(b, 0), 0, + ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0, create_immed(b, intr->num_components), 0); ldp->cat6.type = utype_dst(intr->dest); @@ -1093,18 +1172,20 @@ emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr) struct ir3_instruction *stp, *offset; struct ir3_instruction *const *value; unsigned wrmask, ncomp; + int32_t base; value = ir3_get_src(ctx, &intr->src[0]); - offset = ir3_get_src(ctx, &intr->src[1])[0]; + + stp_ldp_offset(ctx, &intr->src[1], &offset, &base); wrmask = nir_intrinsic_write_mask(intr); ncomp = ffs(~wrmask) - 1; assert(wrmask == BITFIELD_MASK(intr->num_components)); - stp = ir3_STP(b, offset, 0, ir3_create_collect(ctx, value, ncomp), 0, + stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0, create_immed(b, ncomp), 0); - stp->cat6.dst_offset = 0; + stp->cat6.dst_offset = base; stp->cat6.type = utype_src(intr->src[0]); stp->barrier_class = IR3_BARRIER_PRIVATE_W; stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W; @@ -1116,7 +1197,7 @@ struct tex_src_info { /* For prefetch */ unsigned tex_base, samp_base, tex_idx, samp_idx; /* For normal tex instructions */ - unsigned base, combined_idx, a1_val, flags; + unsigned base, a1_val, flags; struct ir3_instruction *samp_tex; }; @@ -1149,11 +1230,9 @@ get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr) if (info.tex_idx < 16) { /* Everything fits within the instruction */ info.base = info.tex_base; - info.combined_idx = info.samp_idx | (info.tex_idx << 4); } else { info.base = info.tex_base; info.a1_val = info.tex_idx << 3; - info.combined_idx = 0; info.flags |= IR3_INSTR_A1EN; } info.samp_tex = NULL; @@ -1166,7 +1245,7 @@ get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr) texture = ir3_get_src(ctx, &intr->src[0])[0]; sampler = create_immed(b, 0); - info.samp_tex = ir3_collect(ctx, texture, sampler); + info.samp_tex = ir3_collect(b, texture, sampler); } } else { info.flags |= IR3_INSTR_S2EN; @@ -1177,7 +1256,7 @@ get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr) texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16); sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16); - info.samp_tex = ir3_collect(ctx, sampler, texture); + info.samp_tex = ir3_collect(b, sampler, texture); } return info; @@ -1199,7 +1278,8 @@ emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info, } if (info.flags & IR3_INSTR_B) { sam->cat5.tex_base = info.base; - sam->cat5.samp = info.combined_idx; + sam->cat5.samp = info.samp_idx; + sam->cat5.tex = info.tex_idx; } return sam; } @@ -1209,6 +1289,14 @@ static void emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr, struct ir3_instruction **dst) { + /* Coherent accesses have to go directly to memory, rather than through + * ISAM's texture cache (which isn't coherent with image stores). 
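+    *
+    * In that case we fall back to the per-gen ldib path below, which
+    * goes through the IBO and so stays coherent with stores.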
+    */
+   if (nir_intrinsic_access(intr) & ACCESS_COHERENT && ctx->compiler->gen >= 5) {
+      ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
+      return;
+   }
+
    struct ir3_block *b = ctx->block;
    struct tex_src_info info = get_image_samp_tex_src(ctx, intr);
    struct ir3_instruction *sam;
@@ -1233,7 +1321,7 @@ emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
       coords[ncoords++] = create_immed(b, 0);

    sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
-                  ir3_create_collect(ctx, coords, ncoords), NULL);
+                  ir3_create_collect(b, coords, ncoords), NULL);

    ir3_handle_nonuniform(sam, intr);
@@ -1374,8 +1462,7 @@ emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
                                     IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
       }

-      /* TODO: check for image mode when it has a separate one */
-      if (modes & nir_var_mem_ssbo) {
+      if (modes & nir_var_image) {
          barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
          barrier->barrier_conflict |=
            IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
@@ -1496,9 +1583,9 @@ get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
       struct ir3_instruction *ij;

       ij = create_sysval_input(ctx, sysval_base + bary, 0x3);
-      ir3_split_dest(ctx->block, xy, ij, 0, 2);
+      ir3_split_dest(ctx->in_block, xy, ij, 0, 2);

-      ctx->ij[bary] = ir3_create_collect(ctx, xy, 2);
+      ctx->ij[bary] = ir3_create_collect(ctx->in_block, xy, 2);
    }

    return ctx->ij[bary];
@@ -1599,7 +1686,7 @@ get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
            ir3_MUL_F(b, xyzw[i], 0, create_immed(b, fui(1.0 / 16.0)), 0);
       }

-      ctx->frag_coord = ir3_create_collect(ctx, xyzw, 4);
+      ctx->frag_coord = ir3_create_collect(b, xyzw, 4);
    }

    ctx->so->fragcoord_compmask |= nir_ssa_def_components_read(&intr->dest.ssa);
@@ -1768,6 +1855,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    case nir_intrinsic_load_input:
       setup_input(ctx, intr);
       break;
+   case nir_intrinsic_load_kernel_input:
+      emit_intrinsic_load_kernel_input(ctx, intr, dst);
+      break;
    /* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
     * pass and replaced by an ir3-specific version that adds the
     * dword-offset in the last source.
@@ -2003,6 +2093,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    case nir_intrinsic_load_subgroup_id_shift_ir3:
       dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
       break;
+   case nir_intrinsic_load_work_dim:
+      dst[0] = create_driver_param(ctx, IR3_DP_WORK_DIM);
+      break;
    case nir_intrinsic_discard_if:
    case nir_intrinsic_discard:
    case nir_intrinsic_demote:
@@ -2019,11 +2112,13 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
          cond = src[0];
       } else {
          /* unconditional discard: */
-         cond = create_immed(b, 1);
+         cond = create_immed_typed(b, 1, ctx->compiler->bool_type);
       }

       /* NOTE: only cmps.*.* can write p0.x: */
-      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
+      struct ir3_instruction *zero =
+         create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
+      cond = ir3_CMPS_S(b, cond, 0, zero, 0);
       cond->cat2.condition = IR3_COND_NE;

       /* condition always goes in predicate register: */
@@ -2056,7 +2151,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       cond = src[0];

       /* NOTE: only cmps.*.* can write p0.x: */
-      cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
+      struct ir3_instruction *zero =
+         create_immed_typed(b, 0, is_half(cond) ?
TYPE_U16 : TYPE_U32); + cond = ir3_CMPS_S(b, cond, 0, zero, 0); cond->cat2.condition = IR3_COND_NE; /* condition always goes in predicate register: */ @@ -2154,8 +2251,12 @@ emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr) { struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components); + unsigned bit_size = ir3_bitsize(ctx, instr->def.bit_size); - if (instr->def.bit_size == 16) { + if (bit_size <= 8) { + for (int i = 0; i < instr->def.num_components; i++) + dst[i] = create_immed_typed(ctx->block, instr->value[i].u8, TYPE_U8); + } else if (bit_size <= 16) { for (int i = 0; i < instr->def.num_components; i++) dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16); } else { @@ -2169,7 +2270,7 @@ emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef) { struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components); - type_t type = (undef->def.bit_size == 16) ? TYPE_U16 : TYPE_U32; + type_t type = utype_for_size(ir3_bitsize(ctx, undef->def.bit_size)); /* backend doesn't want undefined instructions, so just plug * in 0.0.. @@ -2297,11 +2398,9 @@ get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex) info.tex_base == info.samp_base)) { /* Everything fits within the instruction */ info.base = info.tex_base; - info.combined_idx = info.samp_idx | (info.tex_idx << 4); } else { info.base = info.tex_base; info.a1_val = info.tex_idx << 3 | info.samp_base; - info.combined_idx = info.samp_idx; info.flags |= IR3_INSTR_A1EN; } info.samp_tex = NULL; @@ -2335,7 +2434,7 @@ get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex) } else { sampler = create_immed(b, 0); } - info.samp_tex = ir3_collect(ctx, texture, sampler); + info.samp_tex = ir3_collect(b, texture, sampler); } } else { info.flags |= IR3_INSTR_S2EN; @@ -2364,7 +2463,7 @@ get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex) info.samp_idx = tex->texture_index; } - info.samp_tex = ir3_collect(ctx, sampler, texture); + info.samp_tex = ir3_collect(b, sampler, texture); } return info; @@ -2629,7 +2728,7 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex) ctx->so->fb_read = true; info.samp_tex = ir3_collect( - ctx, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16), + b, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16), create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16)); info.flags = IR3_INSTR_S2EN; @@ -2638,8 +2737,8 @@ emit_tex(struct ir3_context *ctx, nir_tex_instr *tex) info = get_tex_samp_tex_src(ctx, tex); } - struct ir3_instruction *col0 = ir3_create_collect(ctx, src0, nsrc0); - struct ir3_instruction *col1 = ir3_create_collect(ctx, src1, nsrc1); + struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0); + struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1); if (opc == OPC_META_TEX_PREFETCH) { int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord); @@ -2984,6 +3083,7 @@ emit_block(struct ir3_context *ctx, nir_block *nblock) list_addtail(&ctx->block->node, &ctx->ir->block_list); ctx->block->loop_id = ctx->loop_id; + ctx->block->loop_depth = ctx->loop_depth; /* re-emit addr register in each block if needed: */ for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) { @@ -3048,7 +3148,10 @@ emit_if(struct ir3_context *ctx, nir_if *nif) struct ir3_block *last_else = get_block(ctx, nir_if_last_else_block(nif)); struct ir3_block *after_if = get_block(ctx, nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node))); - last_else->physical_successors[0] = after_if; + 
assert(last_else->physical_successors[0] &&
+          !last_else->physical_successors[1]);
+   if (after_if != last_else->physical_successors[0])
+      last_else->physical_successors[1] = after_if;
 }

 static void
@@ -3056,6 +3159,7 @@ emit_loop(struct ir3_context *ctx, nir_loop *nloop)
 {
    unsigned old_loop_id = ctx->loop_id;
    ctx->loop_id = ctx->so->loops + 1;
+   ctx->loop_depth++;

    struct nir_block *nstart = nir_loop_first_block(nloop);
    struct ir3_block *continue_blk = NULL;
@@ -3075,10 +3179,13 @@ emit_loop(struct ir3_context *ctx, nir_loop *nloop)
       struct ir3_block *start = get_block(ctx, nstart);
       continue_blk->successors[0] = start;
       continue_blk->physical_successors[0] = start;
+      continue_blk->loop_id = ctx->loop_id;
+      continue_blk->loop_depth = ctx->loop_depth;
       list_addtail(&continue_blk->node, &ctx->ir->block_list);
    }

    ctx->so->loops++;
+   ctx->loop_depth--;
    ctx->loop_id = old_loop_id;
 }

@@ -3175,8 +3282,13 @@ emit_stream_out(struct ir3_context *ctx)
    orig_end_block->successors[0] = stream_out_block;
    orig_end_block->successors[1] = new_end_block;

+   orig_end_block->physical_successors[0] = stream_out_block;
+   orig_end_block->physical_successors[1] = new_end_block;
+
    stream_out_block->successors[0] = new_end_block;

+   stream_out_block->physical_successors[0] = new_end_block;
+
    /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
    cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
    cond->dsts[0]->num = regid(REG_P0, 0);
@@ -3297,7 +3409,7 @@ setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    struct ir3_instruction *coord = NULL;

    if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
-      coord = ir3_create_collect(ctx, ir3_get_src(ctx, &intr->src[0]), 2);
+      coord = ir3_create_collect(ctx->block, ir3_get_src(ctx, &intr->src[0]), 2);

    compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
@@ -3637,6 +3749,7 @@ uses_store_output(struct ir3_shader_variant *so)
       return true;
    case MESA_SHADER_TESS_CTRL:
    case MESA_SHADER_COMPUTE:
+   case MESA_SHADER_KERNEL:
       return false;
    default:
       unreachable("unknown stage");
@@ -3784,6 +3897,15 @@ emit_instructions(struct ir3_context *ctx)
    nir_foreach_register (reg, &fxn->registers) {
       ir3_declare_array(ctx, reg);
    }
+
+   if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
+       ctx->compiler->tess_use_shared) {
+      struct ir3_instruction *barrier = ir3_BAR(ctx->block);
+      barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
+      barrier->barrier_class = IR3_BARRIER_EVERYTHING;
+      array_insert(ctx->block, ctx->block->keeps, barrier);
+   }
+
    /* And emit the body: */
    ctx->impl = fxn;
    emit_function(ctx, fxn);
@@ -3962,6 +4084,13 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,

    ir = so->ir = ctx->ir;

+   if (so->type == MESA_SHADER_COMPUTE) {
+      so->local_size[0] = ctx->s->info.workgroup_size[0];
+      so->local_size[1] = ctx->s->info.workgroup_size[1];
+      so->local_size[2] = ctx->s->info.workgroup_size[2];
+      so->local_size_variable = ctx->s->info.workgroup_size_variable;
+   }
+
    /* Vertex shaders in a tessellation or geometry pipeline treat END as a
     * NOP and have an epilogue that writes the VS outputs to local storage, to
     * be read by the HS.
Then it resets execution mask (chmask) and chains @@ -3982,7 +4111,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, unsigned n = so->outputs_count++; so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID; - struct ir3_instruction *out = ir3_collect(ctx, ctx->primitive_id); + struct ir3_instruction *out = ir3_collect(ctx->block, ctx->primitive_id); outputs[outputs_count] = out; outidxs[outputs_count] = n; if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) @@ -3995,7 +4124,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) { unsigned n = so->outputs_count++; so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3; - struct ir3_instruction *out = ir3_collect(ctx, ctx->rel_patch_id); + struct ir3_instruction *out = ir3_collect(ctx->block, ctx->rel_patch_id); outputs[outputs_count] = out; outidxs[outputs_count] = n; regids[outputs_count] = regid(0, 1); @@ -4005,7 +4134,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, if (ctx->gs_header) { unsigned n = so->outputs_count++; so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3; - struct ir3_instruction *out = ir3_collect(ctx, ctx->gs_header); + struct ir3_instruction *out = ir3_collect(ctx->block, ctx->gs_header); outputs[outputs_count] = out; outidxs[outputs_count] = n; regids[outputs_count] = regid(0, 0); @@ -4015,7 +4144,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, if (ctx->tcs_header) { unsigned n = so->outputs_count++; so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3; - struct ir3_instruction *out = ir3_collect(ctx, ctx->tcs_header); + struct ir3_instruction *out = ir3_collect(ctx->block, ctx->tcs_header); outputs[outputs_count] = out; outidxs[outputs_count] = n; regids[outputs_count] = regid(0, 0); @@ -4044,13 +4173,13 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, struct ir3_instruction *outputs[ctx->noutputs / 4]; unsigned outputs_count = 0; - struct ir3_block *old_block = ctx->block; + struct ir3_block *b = ctx->block; /* Insert these collect's in the block before the end-block if * possible, so that any moves they generate can be shuffled around to * reduce nop's: */ if (ctx->block->predecessors_count == 1) - ctx->block = ctx->block->predecessors[0]; + b = ctx->block->predecessors[0]; /* Setup IR level outputs, which are "collects" that gather * the scalar components of outputs. 
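+    * Each collect gathers the (up to 4) scalar components of one output
+    * slot into a single vecN value that the end instruction consumes.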
@@ -4077,7 +4206,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, continue; struct ir3_instruction *out = - ir3_create_collect(ctx, &ctx->outputs[i], ncomp); + ir3_create_collect(b, &ctx->outputs[i], ncomp); int outidx = i / 4; assert(outidx < so->outputs_count); @@ -4113,8 +4242,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, } } - ctx->block = old_block; - struct ir3_instruction *end = ir3_instr_create(ctx->block, OPC_END, 0, outputs_count); @@ -4135,6 +4262,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_debug_print(ir, "AFTER: nir->ir3"); ir3_validate(ir); + IR3_PASS(ir, ir3_remove_unreachable); + IR3_PASS(ir, ir3_array_to_ssa); do { @@ -4316,13 +4445,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ctx->s->info.fs.needs_quad_helper_invocations) so->need_pixlod = true; - if (so->type == MESA_SHADER_COMPUTE) { - so->local_size[0] = ctx->s->info.workgroup_size[0]; - so->local_size[1] = ctx->s->info.workgroup_size[1]; - so->local_size[2] = ctx->s->info.workgroup_size[2]; - so->local_size_variable = ctx->s->info.workgroup_size_variable; - } - out: if (ret) { if (so->ir) diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_context.c b/mesa 3D driver/src/freedreno/ir3/ir3_context.c index c215baaa05..301514bbee 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_context.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_context.c @@ -238,7 +238,7 @@ ir3_get_src(struct ir3_context *ctx, nir_src *src) void ir3_put_dst(struct ir3_context *ctx, nir_dest *dst) { - unsigned bit_size = nir_dest_bit_size(*dst); + unsigned bit_size = ir3_bitsize(ctx, nir_dest_bit_size(*dst)); /* add extra mov if dst value is shared reg.. in some cases not all * instructions can read from shared regs, in cases where they can @@ -252,8 +252,7 @@ ir3_put_dst(struct ir3_context *ctx, nir_dest *dst) } } - /* Note: 1-bit bools are stored in 32-bit regs */ - if (bit_size == 16) { + if (bit_size <= 16) { for (unsigned i = 0; i < ctx->last_dst_n; i++) { struct ir3_instruction *dst = ctx->last_dst[i]; ir3_set_dst_type(dst, true); @@ -298,10 +297,9 @@ dest_flags(struct ir3_instruction *instr) } struct ir3_instruction * -ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr, +ir3_create_collect(struct ir3_block *block, struct ir3_instruction *const *arr, unsigned arrsz) { - struct ir3_block *block = ctx->block; struct ir3_instruction *collect; if (arrsz == 0) @@ -343,7 +341,7 @@ ir3_create_collect(struct ir3_context *ctx, struct ir3_instruction *const *arr, elem = ir3_MOV(block, elem, type); } - compile_assert(ctx, dest_flags(elem) == flags); + debug_assert(dest_flags(elem) == flags); __ssa_src(collect, elem, flags); } @@ -514,7 +512,9 @@ ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src) struct ir3_instruction *cond; /* NOTE: only cmps.*.* can write p0.x: */ - cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0); + struct ir3_instruction *zero = + create_immed_typed(b, 0, is_half(src) ? 
TYPE_U16 : TYPE_U32); + cond = ir3_CMPS_S(b, src, 0, zero, 0); cond->cat2.condition = IR3_COND_NE; /* condition always goes in predicate register: */ @@ -543,10 +543,7 @@ ir3_declare_array(struct ir3_context *ctx, nir_register *reg) arr->length = reg->num_components * MAX2(1, reg->num_array_elems); compile_assert(ctx, arr->length > 0); arr->r = reg; - arr->half = reg->bit_size <= 16; - // HACK one-bit bools still end up as 32b: - if (reg->bit_size == 1) - arr->half = false; + arr->half = ir3_bitsize(ctx, reg->bit_size) <= 16; list_addtail(&arr->node, &ctx->ir->array_list); } diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_context.h b/mesa 3D driver/src/freedreno/ir3/ir3_context.h index 18dfcabb2c..0e78e9153f 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_context.h +++ b/mesa 3D driver/src/freedreno/ir3/ir3_context.h @@ -114,6 +114,7 @@ struct ir3_context { unsigned stack, max_stack; unsigned loop_id; + unsigned loop_depth; /* a common pattern for indirect addressing is to request the * same address register multiple times. To avoid generating @@ -203,7 +204,7 @@ struct ir3_instruction **ir3_get_dst(struct ir3_context *ctx, nir_dest *dst, struct ir3_instruction *const *ir3_get_src(struct ir3_context *ctx, nir_src *src); void ir3_put_dst(struct ir3_context *ctx, nir_dest *dst); -struct ir3_instruction *ir3_create_collect(struct ir3_context *ctx, +struct ir3_instruction *ir3_create_collect(struct ir3_block *block, struct ir3_instruction *const *arr, unsigned arrsz); void ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst, @@ -215,10 +216,10 @@ void emit_intrinsic_image_size_tex(struct ir3_context *ctx, nir_intrinsic_instr *intr, struct ir3_instruction **dst); -#define ir3_collect(ctx, ...) \ +#define ir3_collect(block, ...) \ ({ \ struct ir3_instruction *__arr[] = {__VA_ARGS__}; \ - ir3_create_collect(ctx, __arr, ARRAY_SIZE(__arr)); \ + ir3_create_collect(block, __arr, ARRAY_SIZE(__arr)); \ }) NORETURN void ir3_context_error(struct ir3_context *ctx, const char *format, @@ -274,4 +275,16 @@ utype_dst(nir_dest dst) return utype_for_size(nir_dest_bit_size(dst)); } +/** + * Convert nir bitsize to ir3 bitsize, handling the special case of 1b bools + * which can be 16b or 32b depending on gen. + */ +static inline unsigned +ir3_bitsize(struct ir3_context *ctx, unsigned nir_bitsize) +{ + if (nir_bitsize == 1) + return type_size(ctx->compiler->bool_type); + return nir_bitsize; +} + #endif /* IR3_CONTEXT_H_ */ diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_cp.c b/mesa 3D driver/src/freedreno/ir3/ir3_cp.c index 50c43f3303..551e50d44e 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_cp.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_cp.c @@ -420,14 +420,14 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, if (!is_cat2_float(instr->opc) && !is_cat3_float(instr->opc)) return false; } else if (src->cat1.dst_type == TYPE_U16) { - if (is_meta(instr)) - return true; /* Since we set CONSTANT_DEMOTION_ENABLE, a float reference of * what was a U16 value read from the constbuf would incorrectly * do 32f->16f conversion, when we want to read a 16f value. 
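+         * A mov with a float src_type has the same problem, hence the
+         * extra OPC_MOV check below.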
*/ if (is_cat2_float(instr->opc) || is_cat3_float(instr->opc)) return false; + if (instr->opc == OPC_MOV && type_float(instr->cat1.src_type)) + return false; } src_reg = ir3_reg_clone(instr->block->shader, src_reg); @@ -467,10 +467,8 @@ reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, if (new_flags & IR3_REG_BNOT) iim_val = ~iim_val; - /* other than category 1 (mov) we can only encode up to 10 bits: */ if (ir3_valid_flags(instr, n, new_flags) && - ((instr->opc == OPC_MOV) || is_meta(instr) || - !((iim_val & ~0x3ff) && (-iim_val & ~0x3ff)))) { + ir3_valid_immediate(instr, iim_val)) { new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT); src_reg = ir3_reg_clone(instr->block->shader, src_reg); src_reg->flags = new_flags; @@ -536,11 +534,12 @@ instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr) /* TODO non-indirect access we could figure out which register * we actually want and allow cp.. */ - if (reg->flags & IR3_REG_ARRAY) + if ((reg->flags & IR3_REG_ARRAY) && src->opc != OPC_META_PHI) continue; /* Don't CP absneg into meta instructions, that won't end well: */ - if (is_meta(instr) && (src->opc != OPC_MOV)) + if (is_meta(instr) && + (src->opc == OPC_ABSNEG_F || src->opc == OPC_ABSNEG_S)) continue; /* Don't CP mova and mova1 into their users */ @@ -622,7 +621,8 @@ instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr) struct ir3_register *samp = samp_tex->srcs[0]; struct ir3_register *tex = samp_tex->srcs[1]; - if ((samp->flags & IR3_REG_IMMED) && (tex->flags & IR3_REG_IMMED)) { + if ((samp->flags & IR3_REG_IMMED) && (tex->flags & IR3_REG_IMMED) && + (samp->iim_val < 16) && (tex->iim_val < 16)) { instr->flags &= ~IR3_INSTR_S2EN; instr->cat5.samp = samp->iim_val; instr->cat5.tex = tex->iim_val; diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_cse.c b/mesa 3D driver/src/freedreno/ir3/ir3_cse.c index 712730f309..36a5cda0e6 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_cse.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_cse.c @@ -43,12 +43,24 @@ hash_instr(const void *data) hash = HASH(hash, instr->opc); hash = HASH(hash, instr->dsts[0]->flags); foreach_src (src, (struct ir3_instruction *)instr) { - if (src->flags & IR3_REG_CONST) - hash = HASH(hash, src->num); - else if (src->flags & IR3_REG_IMMED) + if (src->flags & IR3_REG_CONST) { + if (src->flags & IR3_REG_RELATIV) + hash = HASH(hash, src->array.offset); + else + hash = HASH(hash, src->num); + } else if (src->flags & IR3_REG_IMMED) { hash = HASH(hash, src->uim_val); - else + } else { + if (src->flags & IR3_REG_ARRAY) + hash = HASH(hash, src->array.offset); hash = HASH(hash, src->def); + } + } + + if (opc_cat(instr->opc) == 1) { + hash = HASH(hash, instr->cat1.dst_type); + hash = HASH(hash, instr->cat1.src_type); + hash = HASH(hash, instr->cat1.round); } return hash; @@ -76,24 +88,43 @@ instrs_equal(const struct ir3_instruction *i1, const struct ir3_instruction *i2) return false; if (i1_reg->flags & IR3_REG_CONST) { - if (i1_reg->num != i2_reg->num) - return false; + if (i1_reg->flags & IR3_REG_RELATIV) { + if (i1_reg->array.offset != i2_reg->array.offset) + return false; + } else { + if (i1_reg->num != i2_reg->num) + return false; + } } else if (i1_reg->flags & IR3_REG_IMMED) { if (i1_reg->uim_val != i2_reg->uim_val) return false; } else { + if (i1_reg->flags & IR3_REG_ARRAY) { + if (i1_reg->array.offset != i2_reg->array.offset) + return false; + } if (i1_reg->def != i2_reg->def) return false; } } + if (opc_cat(i1->opc) == 1) { + if (i1->cat1.dst_type != i2->cat1.dst_type || + i1->cat1.src_type != 
i2->cat1.src_type || + i1->cat1.round != i2->cat1.round) + return false; + } + return true; } static bool instr_can_cse(const struct ir3_instruction *instr) { - if (instr->opc != OPC_META_COLLECT) + if (instr->opc != OPC_META_COLLECT && instr->opc != OPC_MOV) + return false; + + if (!is_dest_gpr(instr->dsts[0]) || (instr->dsts[0]->flags & IR3_REG_ARRAY)) return false; return true; diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_legalize.c b/mesa 3D driver/src/freedreno/ir3/ir3_legalize.c index b2d8ab2a3a..42145b2a11 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_legalize.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_legalize.c @@ -234,8 +234,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) ir3_NOP(block); - if (ctx->compiler->samgq_workaround && ctx->type == MESA_SHADER_VERTEX && - n->opc == OPC_SAMGQ) { + if (ctx->compiler->samgq_workaround && + ctx->type != MESA_SHADER_FRAGMENT && + ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) { struct ir3_instruction *samgp; list_delinit(&n->node); @@ -292,8 +293,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) */ if (is_tex(n) || is_sfu(n) || is_mem(n)) { foreach_src (reg, n) { - if (reg_gpr(reg)) - regmask_set(&state->needs_ss_war, reg); + regmask_set(&state->needs_ss_war, reg); } } @@ -510,6 +510,16 @@ retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target) cur_block->successors[1] = new_target; } + /* also update physical_successors.. we don't really need them at + * this stage, but it keeps ir3_validate happy: + */ + if (cur_block->physical_successors[0] == old_target) { + cur_block->physical_successors[0] = new_target; + } else { + debug_assert(cur_block->physical_successors[1] == old_target); + cur_block->physical_successors[1] = new_target; + } + /* update new target's predecessors: */ ir3_block_add_predecessor(new_target, cur_block); diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_lexer.l b/mesa 3D driver/src/freedreno/ir3/ir3_lexer.l index cdee9ab9c2..2d5582e5bd 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_lexer.l +++ b/mesa 3D driver/src/freedreno/ir3/ir3_lexer.l @@ -116,6 +116,7 @@ static int parse_w(const char *str) "@in" return TOKEN(T_A_IN); "@out" return TOKEN(T_A_OUT); "@tex" return TOKEN(T_A_TEX); +"@pvtmem" return TOKEN(T_A_PVTMEM); "(sy)" return TOKEN(T_SY); "(ss)" return TOKEN(T_SS); "(absneg)" return TOKEN(T_ABSNEG); diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_liveness.c b/mesa 3D driver/src/freedreno/ir3/ir3_liveness.c index 4cdf5fb3c9..c343728800 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_liveness.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_liveness.c @@ -115,9 +115,9 @@ compute_block_liveness(struct ir3_liveness *live, struct ir3_block *block, } struct ir3_liveness * -ir3_calc_liveness(struct ir3_shader_variant *v) +ir3_calc_liveness(void *mem_ctx, struct ir3 *ir) { - struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness); + struct ir3_liveness *live = rzalloc(mem_ctx, struct ir3_liveness); /* Reserve name 0 to mean "doesn't have a name yet" to make the debug * output nicer. 
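+    * Each SSA definition then gets a sequential name, and the per-block
+    * live-in/live-out sets below are bitsets indexed by those names.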
@@ -126,7 +126,7 @@ ir3_calc_liveness(struct ir3_shader_variant *v) /* Build definition <-> name mapping */ unsigned block_count = 0; - foreach_block (block, &v->ir->block_list) { + foreach_block (block, &ir->block_list) { block->index = block_count++; foreach_instr (instr, &block->instr_list) { ra_foreach_dst (dst, instr) { @@ -143,7 +143,7 @@ ir3_calc_liveness(struct ir3_shader_variant *v) live->live_in = ralloc_array(live, BITSET_WORD *, block_count); live->live_out = ralloc_array(live, BITSET_WORD *, block_count); unsigned i = 0; - foreach_block (block, &v->ir->block_list) { + foreach_block (block, &ir->block_list) { block->index = i++; live->live_in[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words); @@ -154,7 +154,7 @@ ir3_calc_liveness(struct ir3_shader_variant *v) bool progress = true; while (progress) { progress = false; - foreach_block_rev (block, &v->ir->block_list) { + foreach_block_rev (block, &ir->block_list) { progress |= compute_block_liveness(live, block, tmp_live, bitset_words); } diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_lower_parallelcopy.c b/mesa 3D driver/src/freedreno/ir3/ir3_lower_parallelcopy.c index 32b2d300c5..0350a97e3e 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_lower_parallelcopy.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_lower_parallelcopy.c @@ -282,7 +282,7 @@ static void split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry) { assert(!entry->done); - assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))); + assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))); assert(copy_entry_size(entry) == 2); struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++]; @@ -362,7 +362,7 @@ _handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr, if (((ctx->physreg_use_count[entry->dst] == 0 || ctx->physreg_use_count[entry->dst + 1] == 0)) && - !(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) { + !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) { split_32bit_copy(ctx, entry); progress = true; } @@ -451,6 +451,8 @@ _handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr, entry->src.reg + (blocking->src.reg - entry->dst); } } + + entry->done = true; } } diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_lower_spill.c b/mesa 3D driver/src/freedreno/ir3/ir3_lower_spill.c new file mode 100644 index 0000000000..265207105e --- /dev/null +++ b/mesa 3D driver/src/freedreno/ir3/ir3_lower_spill.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" + +/* The spilling pass leaves out a few details required to successfully operate + * ldp/stp: + * + * 1. ldp/stp can only load/store 4 components at a time, but spilling ignores + * that and just spills/restores entire values, including arrays and values + * created for texture setup which can be more than 4 components. + * 2. The spiller doesn't add barrier dependencies needed for post-RA + * scheduling. + * + * The first one, in particular, is much easier to handle after RA because + * arrays and normal values can be treated the same way. Therefore this pass + * runs after RA, and handles both issues. This keeps the complexity out of the + * spiller. + */ + +static void +split_spill(struct ir3_instruction *spill) +{ + unsigned orig_components = spill->srcs[2]->uim_val; + + /* We don't handle splitting dependencies. */ + assert(spill->deps_count == 0); + + if (orig_components <= 4) { + if (spill->srcs[1]->flags & IR3_REG_ARRAY) { + spill->srcs[1]->wrmask = MASK(orig_components); + spill->srcs[1]->num = spill->srcs[1]->array.base; + spill->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(spill); + ir3_instr_move_before(clone, spill); + + clone->srcs[1]->wrmask = MASK(components); + if (clone->srcs[1]->flags & IR3_REG_ARRAY) { + clone->srcs[1]->num = clone->srcs[1]->array.base + comp; + clone->srcs[1]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->cat6.dst_offset += + comp * ((spill->srcs[1]->flags & IR3_REG_HALF) ? 2 : 4); + } + + list_delinit(&spill->node); +} + +static void +split_reload(struct ir3_instruction *reload) +{ + unsigned orig_components = reload->srcs[2]->uim_val; + + assert(reload->deps_count == 0); + + if (orig_components <= 4) { + if (reload->dsts[0]->flags & IR3_REG_ARRAY) { + reload->dsts[0]->wrmask = MASK(orig_components); + reload->dsts[0]->num = reload->dsts[0]->array.base; + reload->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + return; + } + + for (unsigned comp = 0; comp < orig_components; comp += 4) { + unsigned components = MIN2(orig_components - comp, 4); + struct ir3_instruction *clone = ir3_instr_clone(reload); + ir3_instr_move_before(clone, reload); + + clone->dsts[0]->wrmask = MASK(components); + if (clone->dsts[0]->flags & IR3_REG_ARRAY) { + clone->dsts[0]->num = clone->dsts[0]->array.base + comp; + clone->dsts[0]->flags &= ~IR3_REG_ARRAY; + } + + clone->srcs[2]->uim_val = components; + clone->srcs[1]->uim_val += + comp * ((reload->dsts[0]->flags & IR3_REG_HALF) ? 
2 : 4); + } + + list_delinit(&reload->node); +} + +static void +add_spill_reload_deps(struct ir3_block *block) +{ + struct ir3_instruction *last_spill = NULL; + + foreach_instr (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(instr, last_spill); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } + + + last_spill = NULL; + + foreach_instr_rev (instr, &block->instr_list) { + if ((instr->opc == OPC_SPILL_MACRO || instr->opc == OPC_RELOAD_MACRO) && + last_spill) { + ir3_instr_add_dep(last_spill, instr); + } + + if (instr->opc == OPC_SPILL_MACRO) + last_spill = instr; + } +} + +bool +ir3_lower_spill(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + split_spill(instr); + else if (instr->opc == OPC_RELOAD_MACRO) + split_reload(instr); + } + + add_spill_reload_deps(block); + + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_SPILL_MACRO) + instr->opc = OPC_STP; + else if (instr->opc == OPC_RELOAD_MACRO) + instr->opc = OPC_LDP; + } + } + + return true; +} diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_lower_subgroups.c b/mesa 3D driver/src/freedreno/ir3/ir3_lower_subgroups.c index 84235cea91..b529535705 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_lower_subgroups.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_lower_subgroups.c @@ -218,7 +218,9 @@ lower_block(struct ir3 *ir, struct ir3_block **block) ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags); struct ir3_register *new_src = ir3_src_create(mov, 0, 0); *new_src = *instr->srcs[src]; - mov->cat1.dst_type = mov->cat1.src_type = TYPE_U32; + mov->cat1.dst_type = TYPE_U32; + mov->cat1.src_type = + (new_src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; break; } diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_merge_regs.c b/mesa 3D driver/src/freedreno/ir3/ir3_merge_regs.c index e5ba1bdb59..674bc648e0 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_merge_regs.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_merge_regs.c @@ -198,6 +198,7 @@ get_merge_set(struct ir3_register *def) struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); set->preferred_reg = ~0; set->interval_start = ~0; + set->spill_slot = ~0; set->size = reg_size(def); set->alignment = (def->flags & IR3_REG_HALF) ? 
1 : 2; set->regs_count = 1; @@ -339,6 +340,19 @@ try_merge_defs(struct ir3_liveness *live, struct ir3_register *a, merge_merge_sets(a_set, b_set, b_set_offset); } +void +ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset) +{ + struct ir3_merge_set *a_set = get_merge_set(a); + struct ir3_merge_set *b_set = get_merge_set(b); + + if (a_set == b_set) + return; + + int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; + merge_merge_sets(a_set, b_set, b_set_offset); +} + static void coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi) { @@ -429,7 +443,8 @@ create_parallel_copy(struct ir3_block *block) for (j = 0; j < phi_count; j++) { struct ir3_register *reg = __ssa_dst(pcopy); reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY); - reg->size = reg_elems(src[j]); + reg->size = src[j]->size; + reg->wrmask = src[j]->wrmask; } for (j = 0; j < phi_count; j++) { @@ -461,7 +476,7 @@ ir3_create_parallel_copies(struct ir3 *ir) } static void -index_merge_sets(struct ir3 *ir) +index_merge_sets(struct ir3_liveness *live, struct ir3 *ir) { unsigned offset = 0; foreach_block (block, &ir->block_list) { @@ -488,6 +503,8 @@ index_merge_sets(struct ir3 *ir) } } } + + live->interval_offset = offset; } #define RESET "\x1b[0m" @@ -497,7 +514,7 @@ index_merge_sets(struct ir3 *ir) static void dump_merge_sets(struct ir3 *ir) { - printf("merge sets:\n"); + d("merge sets:"); struct set *merge_sets = _mesa_pointer_set_create(NULL); foreach_block (block, &ir->block_list) { foreach_instr (instr, &block->instr_list) { @@ -508,12 +525,12 @@ dump_merge_sets(struct ir3 *ir) if (!merge_set || _mesa_set_search(merge_sets, merge_set)) continue; - printf("merge set, size %u, align %u:\n", merge_set->size, - merge_set->alignment); + d("merge set, size %u, align %u:", merge_set->size, + merge_set->alignment); for (unsigned j = 0; j < merge_set->regs_count; j++) { struct ir3_register *reg = merge_set->regs[j]; - printf("\t" SYN_SSA("ssa_%u") ":%u, offset %u\n", - reg->instr->serialno, reg->name, reg->merge_set_offset); + d("\t" SYN_SSA("ssa_%u") ":%u, offset %u", + reg->instr->serialno, reg->name, reg->merge_set_offset); } _mesa_set_add(merge_sets, merge_set); @@ -558,7 +575,7 @@ ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) } } - index_merge_sets(ir); + index_merge_sets(live, ir); if (ir3_shader_debug & IR3_DBG_RAMSGS) dump_merge_sets(ir); diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_nir.c b/mesa 3D driver/src/freedreno/ir3/ir3_nir.c index 34fd0ec443..19e41ac047 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_nir.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_nir.c @@ -232,7 +232,8 @@ ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s) * for other stages. 
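 *
 * [Editor's sketch for ir3_force_merge() added in the ir3_merge_regs.c
 * hunk above; the caller shown is hypothetical, for illustration only:
 *
 *    ir3_force_merge(orig_def, new_def, 0);
 *
 * unions the two defs' merge sets with new_def at offset 0 from orig_def,
 * so RA is forced to assign both the same register interval.]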
*/ if ((s->info.stage == MESA_SHADER_FRAGMENT) || - (s->info.stage == MESA_SHADER_COMPUTE)) { + (s->info.stage == MESA_SHADER_COMPUTE) || + (s->info.stage == MESA_SHADER_KERNEL)) { progress |= OPT(s, nir_opt_phi_precision); } progress |= OPT(s, nir_opt_algebraic); @@ -271,6 +272,7 @@ ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s) } progress |= OPT(s, nir_opt_if, false); progress |= OPT(s, nir_opt_loop_unroll); + progress |= OPT(s, nir_lower_64bit_phis); progress |= OPT(s, nir_opt_remove_phis); progress |= OPT(s, nir_opt_undef); } while (progress); @@ -303,20 +305,23 @@ ir3_nir_lower_ssbo_size_filter(const nir_instr *instr, const void *data) static nir_ssa_def * ir3_nir_lower_ssbo_size_instr(nir_builder *b, nir_instr *instr, void *data) { + uint8_t ssbo_size_to_bytes_shift = *(uint8_t *) data; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - return nir_ishl(b, &intr->dest.ssa, nir_imm_int(b, 2)); + return nir_ishl(b, &intr->dest.ssa, nir_imm_int(b, ssbo_size_to_bytes_shift)); } /** - * The resinfo opcode we have for getting the SSBO size on a6xx returns a number - * of dwords, while the NIR intrinsic coming in is a number of bytes. Switch - * things so the NIR intrinsic in our backend means dwords. + * The resinfo opcode we have for getting the SSBO size on a6xx returns a byte + * length divided by IBO_0_FMT, while the NIR intrinsic coming in is a number of + * bytes. Switch things so the NIR intrinsic in our backend means dwords. */ static bool -ir3_nir_lower_ssbo_size(nir_shader *s) +ir3_nir_lower_ssbo_size(nir_shader *s, bool storage_16bit) { + uint8_t ssbo_size_to_bytes_shift = storage_16bit ? 1 : 2; return nir_shader_lower_instructions(s, ir3_nir_lower_ssbo_size_filter, - ir3_nir_lower_ssbo_size_instr, NULL); + ir3_nir_lower_ssbo_size_instr, + &ssbo_size_to_bytes_shift); } void @@ -463,7 +468,6 @@ lower_subgroup_id_filter(const nir_instr *instr, const void *unused) static nir_ssa_def * lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused) { - (void)instr; (void)unused; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); @@ -499,11 +503,6 @@ ir3_nir_lower_subgroup_id_cs(nir_shader *shader) lower_subgroup_id, NULL); } -static const nir_lower_idiv_options idiv_options = { - .imprecise_32bit_lowering = true, - .allow_fp16 = true, -}; - /** * Late passes that need to be done after pscreen->finalize_nir() */ @@ -528,7 +527,8 @@ ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s) NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false); } - if (s->info.stage == MESA_SHADER_COMPUTE) { + if ((s->info.stage == MESA_SHADER_COMPUTE) || + (s->info.stage == MESA_SHADER_KERNEL)) { bool progress = false; NIR_PASS(progress, s, nir_lower_subgroups, &(nir_lower_subgroups_options){ @@ -556,14 +556,19 @@ ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s) */ OPT_V(s, ir3_nir_apply_trig_workarounds); - nir_lower_image_options lower_image_opts = { + const nir_lower_image_options lower_image_opts = { .lower_cube_size = true, }; NIR_PASS_V(s, nir_lower_image, &lower_image_opts); - NIR_PASS_V(s, nir_lower_idiv, &idiv_options); /* idiv generated by cube lowering */ + + const nir_lower_idiv_options lower_idiv_options = { + .imprecise_32bit_lowering = true, + .allow_fp16 = true, + }; + NIR_PASS_V(s, nir_lower_idiv, &lower_idiv_options); /* idiv generated by cube lowering */ if (compiler->gen >= 6) - OPT_V(s, ir3_nir_lower_ssbo_size); + OPT_V(s, ir3_nir_lower_ssbo_size, compiler->storage_16bit); ir3_optimize_loop(compiler, s); 
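   /* [Editor's worked example for ir3_nir_lower_ssbo_size() above; byte
    * counts invented. For a 64-byte SSBO, resinfo reports the length in
    * IBO_0_FMT units: 16 dwords with 32-bit storage, or 32 halfwords with
    * storage_16bit. The lowering shifts the result back to bytes, i.e.
    * 16 << 2 == 64 and 32 << 1 == 64 respectively.] */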
} @@ -689,11 +694,6 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s) 32 /* bytes */); OPT_V(s, ir3_nir_lower_load_constant, so); - if (!so->binning_pass) - OPT_V(s, ir3_nir_analyze_ubo_ranges, so); - - progress |= OPT(s, ir3_nir_lower_ubo_loads, so); - /* Lower large temporaries to scratch, which in Qualcomm terms is private * memory, to avoid excess register pressure. This should happen after * nir_opt_large_constants, because loading from a UBO is much, much less @@ -707,6 +707,17 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s) /* Lower scratch writemasks */ progress |= OPT(s, nir_lower_wrmasks, should_split_wrmask, s); + progress |= OPT(s, ir3_nir_lower_wide_load_store); + progress |= OPT(s, ir3_nir_lower_64b_global); + progress |= OPT(s, ir3_nir_lower_64b_intrinsics); + progress |= OPT(s, ir3_nir_lower_64b_undef); + progress |= OPT(s, nir_lower_int64); + + if (!so->binning_pass) + OPT_V(s, ir3_nir_analyze_ubo_ranges, so); + + progress |= OPT(s, ir3_nir_lower_ubo_loads, so); + OPT_V(s, nir_lower_amul, ir3_glsl_type_size); /* UBO offset lowering has to come after we've decided what will @@ -785,9 +796,12 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st case nir_intrinsic_image_atomic_xor: case nir_intrinsic_image_atomic_exchange: case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_image_load: case nir_intrinsic_image_store: case nir_intrinsic_image_size: - if (compiler->gen < 6) { + if (compiler->gen < 6 && + !(intr->intrinsic == nir_intrinsic_image_load && + !(nir_intrinsic_access(intr) & ACCESS_COHERENT))) { idx = nir_src_as_uint(intr->src[0]); if (layout->image_dims.mask & (1 << idx)) break; @@ -860,9 +874,6 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v, const_state->num_ubos = nir->info.num_ubos; - /* num_driver_params is scalar, align to vec4: */ - const_state->num_driver_params = align(const_state->num_driver_params, 4); - debug_assert((const_state->ubo_state.size % 16) == 0); unsigned constoff = const_state->ubo_state.size / 16; unsigned ptrsz = ir3_pointer_size(compiler); @@ -878,13 +889,32 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v, constoff += align(cnt, 4) / 4; } + if (v->type == MESA_SHADER_KERNEL) { + const_state->offsets.kernel_params = constoff; + constoff += align(v->shader->cs.req_input_mem, 4) / 4; + } + if (const_state->num_driver_params > 0) { + /* num_driver_params in dwords. we only need to align to vec4s for the + * common case of immediate constant uploads, but for indirect dispatch + * the constants may also be indirect and so we have to align the area in + * const space to that requirement. 
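+ *
+ * [Editor's worked example, values invented: with num_driver_params == 10
+ * dwords and compiler->const_upload_unit == 4 for a compute shader, the
+ * code below pads 10 to align(10, 4) == 12 dwords (3 vec4s), rounds
+ * constoff up to a multiple of 4 before recording offsets.driver_param,
+ * and then reserves align(12 / 4, 4) == 4 vec4s of const space.]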
+ */ + const_state->num_driver_params = align(const_state->num_driver_params, 4); + unsigned upload_unit = 1; + if (v->type == MESA_SHADER_COMPUTE || + (const_state->num_driver_params >= IR3_DP_VTXID_BASE)) { + upload_unit = compiler->const_upload_unit; + } + /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */ if (v->type == MESA_SHADER_VERTEX && compiler->gen >= 6) constoff = MAX2(constoff, 1); + constoff = align(constoff, upload_unit); const_state->offsets.driver_param = constoff; + + constoff += align(const_state->num_driver_params / 4, upload_unit); } - constoff += const_state->num_driver_params / 4; if ((v->type == MESA_SHADER_VERTEX) && (compiler->gen < 5) && v->shader->stream_output.num_outputs > 0) { diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_nir.h b/mesa 3D driver/src/freedreno/ir3/ir3_nir.h index d1049364f6..dccd8bca87 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_nir.h +++ b/mesa 3D driver/src/freedreno/ir3/ir3_nir.h @@ -42,6 +42,7 @@ bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader); bool ir3_nir_move_varying_inputs(nir_shader *shader); int ir3_nir_coord_offset(nir_ssa_def *ssa); bool ir3_nir_lower_tex_prefetch(nir_shader *shader); +bool ir3_nir_lower_wide_load_store(nir_shader *shader); void ir3_nir_lower_to_explicit_output(nir_shader *shader, struct ir3_shader_variant *v, @@ -54,6 +55,13 @@ void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v, unsigned topology); void ir3_nir_lower_gs(nir_shader *shader); +/* + * 64b related lowering: + */ +bool ir3_nir_lower_64b_intrinsics(nir_shader *shader); +bool ir3_nir_lower_64b_undef(nir_shader *shader); +bool ir3_nir_lower_64b_global(nir_shader *shader); + const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler); void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s); @@ -89,4 +97,38 @@ ir3_bindless_resource(nir_src src) return intrin; } +static inline bool +is_intrinsic_store(nir_intrinsic_op op) +{ + switch (op) { + case nir_intrinsic_store_output: + case nir_intrinsic_store_scratch: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_shared: + case nir_intrinsic_store_global: + case nir_intrinsic_store_global_ir3: + return true; + default: + return false; + } +} + +static inline bool +is_intrinsic_load(nir_intrinsic_op op) +{ + switch (op) { + case nir_intrinsic_load_input: + case nir_intrinsic_load_scratch: + case nir_intrinsic_load_uniform: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_ir3: + return true; + default: + return false; + } +} + #endif /* IR3_NIR_H_ */ diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_64b.c b/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_64b.c new file mode 100644 index 0000000000..e02a6dfc56 --- /dev/null +++ b/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_64b.c @@ -0,0 +1,284 @@ +/* + * Copyright © 2021 Google, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_nir.h" + +/* + * Lowering for 64b intrinsics generated with OpenCL or with + * VK_KHR_buffer_device_address. All our intrinsics from a hw + * standpoint are 32b, so we just need to combine in zero for + * the upper 32bits and let the other nir passes clean up the mess. + */ + +static bool +lower_64b_intrinsics_filter(const nir_instr *instr, const void *unused) +{ + (void)unused; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + if (intr->intrinsic == nir_intrinsic_load_deref || + intr->intrinsic == nir_intrinsic_store_deref) + return false; + + if (is_intrinsic_store(intr->intrinsic)) + return nir_src_bit_size(intr->src[0]) == 64; + + if (nir_intrinsic_dest_components(intr) == 0) + return false; + + return nir_dest_bit_size(intr->dest) == 64; +} + +static nir_ssa_def * +lower_64b_intrinsics(nir_builder *b, nir_instr *instr, void *unused) +{ + (void)unused; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + /* We could be *slightly* more clever and, for ex, turn a 64b vec4 + * load into two 32b vec4 loads, rather than 4 32b vec2 loads. + */ + + if (is_intrinsic_store(intr->intrinsic)) { + unsigned offset_src_idx; + switch (intr->intrinsic) { + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_global_ir3: + offset_src_idx = 2; + break; + default: + offset_src_idx = 1; + } + + unsigned num_comp = nir_intrinsic_src_components(intr, 0); + unsigned wrmask = nir_intrinsic_has_write_mask(intr) ? 
+ nir_intrinsic_write_mask(intr) : BITSET_MASK(num_comp); + nir_ssa_def *val = nir_ssa_for_src(b, intr->src[0], num_comp); + nir_ssa_def *off = nir_ssa_for_src(b, intr->src[offset_src_idx], 1); + + for (unsigned i = 0; i < num_comp; i++) { + if (!(wrmask & BITFIELD_BIT(i))) + continue; + + nir_ssa_def *c64 = nir_channel(b, val, i); + nir_ssa_def *c32 = nir_unpack_64_2x32(b, c64); + + nir_intrinsic_instr *store = + nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr)); + store->num_components = 2; + store->src[0] = nir_src_for_ssa(c32); + store->src[offset_src_idx] = nir_src_for_ssa(off); + + if (nir_intrinsic_has_write_mask(intr)) + nir_intrinsic_set_write_mask(store, 0x3); + nir_builder_instr_insert(b, &store->instr); + + off = nir_iadd(b, off, nir_imm_intN_t(b, 8, off->bit_size)); + } + + return NIR_LOWER_INSTR_PROGRESS_REPLACE; + } + + unsigned num_comp = nir_intrinsic_dest_components(intr); + + nir_ssa_def *def = &intr->dest.ssa; + def->bit_size = 32; + + /* load_kernel_input is handled specially, lowering to two 32b inputs: + */ + if (intr->intrinsic == nir_intrinsic_load_kernel_input) { + assert(num_comp == 1); + + nir_ssa_def *offset = nir_iadd(b, + nir_ssa_for_src(b, intr->src[0], 1), + nir_imm_int(b, 4)); + + nir_ssa_def *upper = nir_build_load_kernel_input( + b, 1, 32, offset); + + return nir_pack_64_2x32_split(b, def, upper); + } + + nir_ssa_def *components[num_comp]; + + if (is_intrinsic_load(intr->intrinsic)) { + unsigned offset_src_idx; + switch(intr->intrinsic) { + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_global_ir3: + offset_src_idx = 1; + break; + default: + offset_src_idx = 0; + } + + nir_ssa_def *off = nir_ssa_for_src(b, intr->src[offset_src_idx], 1); + + for (unsigned i = 0; i < num_comp; i++) { + nir_intrinsic_instr *load = + nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr)); + load->num_components = 2; + load->src[offset_src_idx] = nir_src_for_ssa(off); + + nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL); + nir_builder_instr_insert(b, &load->instr); + + components[i] = nir_pack_64_2x32(b, &load->dest.ssa); + + off = nir_iadd(b, off, nir_imm_intN_t(b, 8, off->bit_size)); + } + } else { + /* The remaining (non load/store) intrinsics just get zero- + * extended from 32b to 64b: + */ + for (unsigned i = 0; i < num_comp; i++) { + nir_ssa_def *c = nir_channel(b, def, i); + components[i] = nir_pack_64_2x32_split(b, c, nir_imm_zero(b, 1, 32)); + } + } + + return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components); +} + +bool +ir3_nir_lower_64b_intrinsics(nir_shader *shader) +{ + return nir_shader_lower_instructions( + shader, lower_64b_intrinsics_filter, + lower_64b_intrinsics, NULL); +} + +/* + * Lowering for 64b undef instructions, splitting into a two 32b undefs + */ + +static nir_ssa_def * +lower_64b_undef(nir_builder *b, nir_instr *instr, void *unused) +{ + (void)unused; + + nir_ssa_undef_instr *undef = nir_instr_as_ssa_undef(instr); + unsigned num_comp = undef->def.num_components; + nir_ssa_def *components[num_comp]; + + for (unsigned i = 0; i < num_comp; i++) { + nir_ssa_def *lowered = nir_ssa_undef(b, 2, 32); + + components[i] = nir_pack_64_2x32_split(b, + nir_channel(b, lowered, 0), + nir_channel(b, lowered, 1)); + } + + return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components); +} + +static bool +lower_64b_undef_filter(const nir_instr *instr, const void *unused) +{ + (void)unused; + + return instr->type == nir_instr_type_ssa_undef && + 
nir_instr_as_ssa_undef(instr)->def.bit_size == 64; +} + +bool +ir3_nir_lower_64b_undef(nir_shader *shader) +{ + return nir_shader_lower_instructions( + shader, lower_64b_undef_filter, + lower_64b_undef, NULL); +} + +/* + * Lowering for load_global/store_global with 64b addresses to ir3 + * variants, which instead take a uvec2_32 + */ + +static bool +lower_64b_global_filter(const nir_instr *instr, const void *unused) +{ + (void)unused; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + return (intr->intrinsic == nir_intrinsic_load_global) || + (intr->intrinsic == nir_intrinsic_load_global_constant) || + (intr->intrinsic == nir_intrinsic_store_global); +} + +static nir_ssa_def * +lower_64b_global(nir_builder *b, nir_instr *instr, void *unused) +{ + (void)unused; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + bool load = intr->intrinsic != nir_intrinsic_store_global; + + nir_ssa_def *addr64 = nir_ssa_for_src(b, intr->src[load ? 0 : 1], 1); + nir_ssa_def *addr = nir_unpack_64_2x32(b, addr64); + + /* + * Note that we can get vec8/vec16 with OpenCL.. we need to split + * those up into max 4 components per load/store. + */ + + if (load) { + unsigned num_comp = nir_intrinsic_dest_components(intr); + nir_ssa_def *components[num_comp]; + for (unsigned off = 0; off < num_comp;) { + unsigned c = MIN2(num_comp - off, 4); + nir_ssa_def *val = nir_build_load_global_ir3( + b, c, nir_dest_bit_size(intr->dest), + addr, nir_imm_int(b, off)); + for (unsigned i = 0; i < c; i++) { + components[off++] = nir_channel(b, val, i); + } + } + return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components); + } else { + unsigned num_comp = nir_intrinsic_src_components(intr, 0); + nir_ssa_def *value = nir_ssa_for_src(b, intr->src[0], num_comp); + for (unsigned off = 0; off < num_comp; off += 4) { + unsigned c = MIN2(num_comp - off, 4); + nir_ssa_def *v = nir_channels(b, value, BITFIELD_MASK(c) << off); + nir_build_store_global_ir3(b, v, addr, nir_imm_int(b, off)); + } + return NIR_LOWER_INSTR_PROGRESS_REPLACE; + } +} + +bool +ir3_nir_lower_64b_global(nir_shader *shader) +{ + return nir_shader_lower_instructions( + shader, lower_64b_global_filter, + lower_64b_global, NULL); +} diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tess.c b/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tess.c index 2329ac517c..7ce254513a 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tess.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tess.c @@ -162,7 +162,7 @@ replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intr, if (nir_intrinsic_infos[op].has_dest) nir_ssa_dest_init(&new_intr->instr, &new_intr->dest, intr->num_components, - 32, NULL); + intr->dest.ssa.bit_size, NULL); nir_builder_instr_insert(b, &new_intr->instr); @@ -465,7 +465,8 @@ tess_level_components(struct state *state, uint32_t *inner, uint32_t *outer) } static nir_ssa_def * -build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state) +build_tessfactor_base(nir_builder *b, gl_varying_slot slot, uint32_t comp, + struct state *state) { uint32_t inner_levels, outer_levels; tess_level_components(state, &inner_levels, &outer_levels); @@ -492,7 +493,7 @@ build_tessfactor_base(nir_builder *b, gl_varying_slot slot, struct state *state) unreachable("bad"); } - return nir_iadd(b, patch_offset, nir_imm_int(b, offset)); + return nir_iadd(b, patch_offset, nir_imm_int(b, offset + comp)); } static void @@ -559,7 +560,8 @@ 
lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) if (is_tess_levels(location)) { assert(intr->dest.ssa.num_components == 1); address = nir_load_tess_factor_base_ir3(b); - offset = build_tessfactor_base(b, location, state); + offset = build_tessfactor_base( + b, location, nir_intrinsic_component(intr), state); } else { address = nir_load_tess_param_base_ir3(b); offset = build_patch_offset(b, state, location, @@ -590,9 +592,6 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) assert(intr->src[0].ssa->num_components == 1); - nir_ssa_def *offset = - nir_iadd_imm(b, intr->src[1].ssa, nir_intrinsic_component(intr)); - nir_if *nif = NULL; if (location != VARYING_SLOT_PRIMITIVE_ID) { /* with tess levels are defined as float[4] and float[2], @@ -605,13 +604,18 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) else levels = inner_levels; + nir_ssa_def *offset = nir_iadd_imm( + b, intr->src[1].ssa, nir_intrinsic_component(intr)); nif = nir_push_if(b, nir_ult(b, offset, nir_imm_int(b, levels))); } - replace_intrinsic( - b, intr, nir_intrinsic_store_global_ir3, intr->src[0].ssa, - nir_load_tess_factor_base_ir3(b), - nir_iadd(b, offset, build_tessfactor_base(b, location, state))); + nir_ssa_def *offset = build_tessfactor_base( + b, location, nir_intrinsic_component(intr), state); + + replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, + intr->src[0].ssa, + nir_load_tess_factor_base_ir3(b), + nir_iadd(b, intr->src[1].ssa, offset)); if (location != VARYING_SLOT_PRIMITIVE_ID) { nir_pop_if(b, nif); @@ -622,8 +626,6 @@ lower_tess_ctrl_block(nir_block *block, nir_builder *b, struct state *state) b, state, location, nir_intrinsic_component(intr), intr->src[1].ssa); - debug_assert(nir_intrinsic_component(intr) == 0); - replace_intrinsic(b, intr, nir_intrinsic_store_global_ir3, intr->src[0].ssa, address, offset); } @@ -785,7 +787,8 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state) if (is_tess_levels(location)) { assert(intr->dest.ssa.num_components == 1); address = nir_load_tess_factor_base_ir3(b); - offset = build_tessfactor_base(b, location, state); + offset = build_tessfactor_base( + b, location, nir_intrinsic_component(intr), state); } else { address = nir_load_tess_param_base_ir3(b); offset = build_patch_offset(b, state, location, @@ -793,9 +796,6 @@ lower_tess_eval_block(nir_block *block, nir_builder *b, struct state *state) intr->src[0].ssa); } - offset = - nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_component(intr))); - replace_intrinsic(b, intr, nir_intrinsic_load_global_ir3, address, offset, NULL); break; diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c index 4ca9aaa263..e3f3173299 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c @@ -54,7 +54,7 @@ ir3_nir_lower_tg4_to_tex_instr(nir_builder *b, nir_instr *instr, void *data) tex->dest_type = tg4->dest_type; for (int j = 0; j < tg4->num_srcs; j++) { - nir_src_copy(&tex->src[j].src, &tg4->src[j].src, tex); + nir_src_copy(&tex->src[j].src, &tg4->src[j].src); tex->src[j].src_type = tg4->src[j].src_type; } if (i != 3) { diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_wide_load_store.c b/mesa 3D driver/src/freedreno/ir3/ir3_nir_lower_wide_load_store.c new file mode 100644 index 0000000000..617d9b141a --- /dev/null +++ b/mesa 3D 
driver/src/freedreno/ir3/ir3_nir_lower_wide_load_store.c @@ -0,0 +1,118 @@ +/* + * Copyright © 2021 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_nir.h" + + +/* + * Lowering for wide (larger than vec4) load/store + */ + +static bool +lower_wide_load_store_filter(const nir_instr *instr, const void *unused) +{ + (void)unused; + + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + if (is_intrinsic_store(intr->intrinsic)) + return nir_intrinsic_src_components(intr, 0) > 4; + + if (is_intrinsic_load(intr->intrinsic)) + return nir_intrinsic_dest_components(intr) > 4; + + return false; +} + +static nir_ssa_def * +lower_wide_load_store(nir_builder *b, nir_instr *instr, void *unused) +{ + (void)unused; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + + if (is_intrinsic_store(intr->intrinsic)) { + unsigned num_comp = nir_intrinsic_src_components(intr, 0); + unsigned wrmask = nir_intrinsic_write_mask(intr); + nir_ssa_def *val = nir_ssa_for_src(b, intr->src[0], num_comp); + nir_ssa_def *addr = nir_ssa_for_src(b, intr->src[1], 1); + + for (unsigned off = 0; off < num_comp; off += 4) { + unsigned c = MIN2(num_comp - off, 4); + nir_ssa_def *v = nir_channels(b, val, BITFIELD_MASK(c) << off); + + nir_intrinsic_instr *store = + nir_intrinsic_instr_create(b->shader, intr->intrinsic); + store->num_components = c; + store->src[0] = nir_src_for_ssa(v); + store->src[1] = nir_src_for_ssa(addr); + nir_intrinsic_set_align(store, nir_intrinsic_align(intr), 0); + nir_intrinsic_set_write_mask(store, (wrmask >> off) & 0xf); + nir_builder_instr_insert(b, &store->instr); + + addr = nir_iadd(b, + nir_imm_intN_t(b, (c * val->bit_size) / 8, addr->bit_size), + addr); + } + + return NIR_LOWER_INSTR_PROGRESS_REPLACE; + } else { + unsigned num_comp = nir_intrinsic_dest_components(intr); + unsigned bit_size = nir_dest_bit_size(intr->dest); + nir_ssa_def *addr = nir_ssa_for_src(b, intr->src[0], 1); + nir_ssa_def *components[num_comp]; + + for (unsigned off = 0; off < num_comp;) { + unsigned c = MIN2(num_comp - off, 4); + + nir_intrinsic_instr *load = + nir_intrinsic_instr_create(b->shader, intr->intrinsic); + load->num_components = c; + load->src[0] = nir_src_for_ssa(addr); + nir_intrinsic_set_align(load, nir_intrinsic_align(intr), 0); + nir_ssa_dest_init(&load->instr, &load->dest, c, bit_size, NULL); + nir_builder_instr_insert(b, 
&load->instr); + + addr = nir_iadd(b, + nir_imm_intN_t(b, (c * bit_size) / 8, addr->bit_size), + addr); + + for (unsigned i = 0; i < c; i++) { + components[off++] = nir_channel(b, &load->dest.ssa, i); + } + } + + return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components); + } +} + +bool +ir3_nir_lower_wide_load_store(nir_shader *shader) +{ + return nir_shader_lower_instructions( + shader, lower_wide_load_store_filter, + lower_wide_load_store, NULL); +} diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_nir_trig.py b/mesa 3D driver/src/freedreno/ir3/ir3_nir_trig.py index 6a88420e1e..657dbb25ea 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_nir_trig.py +++ b/mesa 3D driver/src/freedreno/ir3/ir3_nir_trig.py @@ -24,8 +24,8 @@ import argparse import sys trig_workarounds = [ - (('fsin', 'x@32'), ('fsin', ('fsub', ('fmul', 6.2831853, ('ffract', ('fadd', ('fmul', 0.15915494, 'x'), 0.5))), 3.14159265))), - (('fcos', 'x@32'), ('fcos', ('fsub', ('fmul', 6.2831853, ('ffract', ('fadd', ('fmul', 0.15915494, 'x'), 0.5))), 3.14159265))), + (('fsin', 'x@32'), ('fsin', ('!ffma', 6.2831853, ('ffract', ('!ffma', 0.15915494, 'x', 0.5)), -3.14159265))), + (('fcos', 'x@32'), ('fcos', ('!ffma', 6.2831853, ('ffract', ('!ffma', 0.15915494, 'x', 0.5)), -3.14159265))), ] diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_parser.y b/mesa 3D driver/src/freedreno/ir3/ir3_parser.y index 9f16bed739..63f1cccd94 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_parser.y +++ b/mesa 3D driver/src/freedreno/ir3/ir3_parser.y @@ -333,6 +333,7 @@ static void print_token(FILE *file, int type, YYSTYPE value) %token T_A_IN %token T_A_OUT %token T_A_TEX +%token T_A_PVTMEM /* todo, re-add @sampler/@uniform/@varying if needed someday */ /* src register flags */ @@ -655,6 +656,7 @@ header: localsize_header | in_header | out_header | tex_header +| pvtmem_header const_val: T_FLOAT { $$ = fui($1); } | T_INT { $$ = $1; } @@ -696,20 +698,30 @@ invocationid_header: T_A_INVOCATIONID '(' T_REGISTER ')' { wgid_header: T_A_WGID '(' T_REGISTER ')' { assert(($3 & 0x1) == 0); /* half-reg not allowed */ unsigned reg = $3 >> 1; + assert(variant->shader->compiler->gen >= 5); assert(reg >= regid(48, 0)); /* must be a high reg */ add_sysval(reg, 0x7, SYSTEM_VALUE_WORKGROUP_ID); } +| T_A_WGID '(' T_CONSTANT ')' { + assert(($3 & 0x1) == 0); /* half-reg not allowed */ + unsigned reg = $3 >> 1; + assert(variant->shader->compiler->gen < 5); + info->wgid = reg; +} numwg_header: T_A_NUMWG '(' T_CONSTANT ')' { assert(($3 & 0x1) == 0); /* half-reg not allowed */ unsigned reg = $3 >> 1; info->numwg = reg; /* reserve space in immediates for the actual value to be plugged in later: */ - add_const($3, 0, 0, 0, 0); + if (variant->shader->compiler->gen >= 5) + add_const($3, 0, 0, 0, 0); } branchstack_header: T_A_BRANCHSTACK const_val { variant->branchstack = $2; } +pvtmem_header: T_A_PVTMEM const_val { variant->pvtmem_size = $2; } + /* Stubs for now */ in_header: T_A_IN '(' T_REGISTER ')' T_IDENTIFIER '(' T_IDENTIFIER '=' integer ')' { } @@ -972,7 +984,7 @@ cat5_flags: | cat5_flag cat5_flags cat5_samp: T_SAMP { instr->cat5.samp = $1; } -cat5_tex: T_TEX { if (instr->flags & IR3_INSTR_B) instr->cat5.samp |= ($1 << 4); else instr->cat5.tex = $1; } +cat5_tex: T_TEX { instr->cat5.tex = $1; } cat5_type: '(' type ')' { instr->cat5.type = $2; } cat5_a1: src_reg { instr->flags |= IR3_INSTR_A1EN; } @@ -1030,15 +1042,13 @@ cat6_load: T_OP_LDG { new_instr(OPC_LDG); } cat6_type dst_reg ',' 'g new_src(0, IR3_REG_IMMED)->iim_val = $8; } ',' immediate -// TODO some of the cat6 
instructions have different syntax for a6xx.. -//| T_OP_LDIB { new_instr(OPC_LDIB); } cat6_type dst_reg cat6_offset ',' reg ',' cat6_immed - cat6_store: T_OP_STG { new_instr(OPC_STG); dummy_dst(); } cat6_type 'g' '[' src cat6_imm_offset ']' ',' src ',' immediate | T_OP_STG_A { new_instr(OPC_STG_A); dummy_dst(); } cat6_type 'g' '[' src cat6_stg_ldg_a6xx_offset ']' ',' src ',' immediate | T_OP_STP { new_instr(OPC_STP); dummy_dst(); } cat6_type 'p' '[' src cat6_dst_offset ']' ',' src ',' immediate | T_OP_STL { new_instr(OPC_STL); dummy_dst(); } cat6_type 'l' '[' src cat6_dst_offset ']' ',' src ',' immediate | T_OP_STLW { new_instr(OPC_STLW); dummy_dst(); } cat6_type 'l' '[' src cat6_dst_offset ']' ',' src ',' immediate +cat6_loadib: T_OP_LDIB { new_instr(OPC_LDIB); } cat6_typed cat6_dim cat6_type '.' cat6_immed dst_reg ',' 'g' '[' immediate ']' ',' src ',' src cat6_storeib: T_OP_STIB { new_instr(OPC_STIB); dummy_dst(); } cat6_typed cat6_dim cat6_type '.' cat6_immed'g' '[' immediate ']' ',' src ',' src ',' src cat6_prefetch: T_OP_PREFETCH { new_instr(OPC_PREFETCH); new_dst(0,0); /* dummy dst */ } 'g' '[' src cat6_offset ']' ',' cat6_immed @@ -1124,6 +1134,7 @@ cat6_todo: T_OP_G2L { new_instr(OPC_G2L); } | T_OP_RESFMT { new_instr(OPC_RESFMT); } cat6_instr: cat6_load +| cat6_loadib | cat6_store | cat6_storeib | cat6_prefetch diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_postsched.c b/mesa 3D driver/src/freedreno/ir3/ir3_postsched.c index b3a6a9d6a8..507302a009 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_postsched.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_postsched.c @@ -39,15 +39,17 @@ #define d(fmt, ...) \ do { \ if (SCHED_DEBUG) { \ - printf("PSCHED: " fmt "\n", ##__VA_ARGS__); \ + mesa_logi("PSCHED: " fmt, ##__VA_ARGS__); \ } \ } while (0) #define di(instr, fmt, ...) \ do { \ if (SCHED_DEBUG) { \ - printf("PSCHED: " fmt ": ", ##__VA_ARGS__); \ - ir3_print_instr(instr); \ + struct log_stream *stream = mesa_log_streami(); \ + mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__); \ + ir3_print_instr_stream(stream, instr); \ + mesa_log_stream_destroy(stream); \ } \ } while (0) diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_print.c b/mesa 3D driver/src/freedreno/ir3/ir3_print.c index 6f267c50c2..dd57bdc685 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_print.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_print.c @@ -75,7 +75,6 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr, #ifdef DEBUG mesa_log_stream_printf(stream, "%04u:", instr->serialno); #endif - mesa_log_stream_printf(stream, "%04u:", instr->name); mesa_log_stream_printf(stream, "%04u:", instr->ip); if (instr->flags & IR3_INSTR_UNUSED) { mesa_log_stream_printf(stream, "XXX: "); @@ -250,7 +249,7 @@ print_reg_name(struct log_stream *stream, struct ir3_instruction *instr, * although it's more convenient for RA if it's a pointer. 
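 *
 * [Editor's aside for ir3_nir_lower_wide_load_store() earlier in this
 * patch, shown as informal NIR with invented ssa numbers:
 *
 *    before: vec8 32 ssa_2 = intrinsic load_global (ssa_1) (align_mul=4)
 *    after:  vec4 32 ssa_3 = intrinsic load_global (ssa_1)
 *            vec4 32 ssa_4 = intrinsic load_global (ssa_1 + 16)
 *            vec8 32 ssa_5 = vec ssa_3.x .. ssa_3.w, ssa_4.x .. ssa_4.w
 *
 * The same intrinsic is simply re-emitted in vec4 pieces, with the byte
 * address advanced by (components * bit_size) / 8 after each piece.]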
*/ if (reg->tied) - printf("(tied)"); + mesa_log_stream_printf(stream, "(tied)"); if (reg->flags & IR3_REG_SHARED) mesa_log_stream_printf(stream, "s"); @@ -341,14 +340,8 @@ print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl) } if (is_tex(instr) && !(instr->flags & IR3_INSTR_S2EN)) { - if (!!(instr->flags & IR3_INSTR_B)) { - if (!!(instr->flags & IR3_INSTR_A1EN)) { - mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp); - } else { - mesa_log_stream_printf(stream, ", s#%d, t#%d", - instr->cat5.samp & 0xf, - instr->cat5.samp >> 4); - } + if (!!(instr->flags & IR3_INSTR_B) && !!(instr->flags & IR3_INSTR_A1EN)) { + mesa_log_stream_printf(stream, ", s#%d", instr->cat5.samp); } else { mesa_log_stream_printf(stream, ", s#%d, t#%d", instr->cat5.samp, instr->cat5.tex); @@ -367,23 +360,20 @@ print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl) /* the predicate register src is implied: */ if (instr->opc == OPC_B) { static const struct { - const char *suffix; int nsrc; bool idx; } brinfo[7] = { /* clang-format off */ - [BRANCH_PLAIN] = {"r", 1, false}, - [BRANCH_OR] = {"rao", 2, false}, - [BRANCH_AND] = {"raa", 2, false}, - [BRANCH_CONST] = {"rac", 0, true}, - [BRANCH_ANY] = {"any", 1, false}, - [BRANCH_ALL] = {"all", 1, false}, - [BRANCH_X] = {"rax", 0, false}, + [BRANCH_PLAIN] = {1, false}, + [BRANCH_OR] = {2, false}, + [BRANCH_AND] = {2, false}, + [BRANCH_CONST] = {0, true}, + [BRANCH_ANY] = {1, false}, + [BRANCH_ALL] = {1, false}, + [BRANCH_X] = {0, false}, /* clang-format on */ }; - mesa_log_stream_printf(stream, "%s", - brinfo[instr->cat0.brtype].suffix); if (brinfo[instr->cat0.brtype].idx) { mesa_log_stream_printf(stream, ".%u", instr->cat0.idx); } @@ -422,6 +412,12 @@ print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl) mesa_log_stream_printf(stream, "\n"); } +void +ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr) +{ + print_instr(stream, instr, 0); +} + void ir3_print_instr(struct ir3_instruction *instr) { @@ -450,6 +446,18 @@ print_block(struct ir3_block *block, int lvl) mesa_log_stream_printf(stream, "\n"); } + if (block->physical_predecessors_count > 0) { + tab(stream, lvl + 1); + mesa_log_stream_printf(stream, "physical pred: "); + for (unsigned i = 0; i < block->physical_predecessors_count; i++) { + struct ir3_block *pred = block->physical_predecessors[i]; + if (i != 0) + mesa_log_stream_printf(stream, ", "); + mesa_log_stream_printf(stream, "block%u", block_id(pred)); + } + mesa_log_stream_printf(stream, "\n"); + } + foreach_instr (instr, &block->instr_list) { print_instr(stream, instr, lvl + 1); } @@ -470,13 +478,13 @@ print_block(struct ir3_block *block, int lvl) case IR3_BRANCH_COND: break; case IR3_BRANCH_ANY: - printf("any "); + mesa_log_stream_printf(stream, "any "); break; case IR3_BRANCH_ALL: - printf("all "); + mesa_log_stream_printf(stream, "all "); break; case IR3_BRANCH_GETONE: - printf("getone "); + mesa_log_stream_printf(stream, "getone "); break; } if (block->condition) @@ -490,6 +498,16 @@ print_block(struct ir3_block *block, int lvl) mesa_log_stream_printf(stream, "/* succs: block%u; */\n", block_id(block->successors[0])); } + if (block->physical_successors[0]) { + tab(stream, lvl + 1); + mesa_log_stream_printf(stream, "/* physical succs: block%u", + block_id(block->physical_successors[0])); + if (block->physical_successors[1]) { + mesa_log_stream_printf(stream, ", block%u", + block_id(block->physical_successors[1])); + } + mesa_log_stream_printf(stream, " 
*/\n"); + } tab(stream, lvl); mesa_log_stream_printf(stream, "}\n"); } diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_ra.c b/mesa 3D driver/src/freedreno/ir3/ir3_ra.c index 6463b62ed2..656f5c5511 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_ra.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_ra.c @@ -51,7 +51,7 @@ static int ir3_reg_interval_cmp(const struct rb_node *node, const void *data) { - physreg_t reg = *(const physreg_t *)data; + unsigned reg = *(const unsigned *)data; const struct ir3_reg_interval *interval = ir3_rb_node_to_interval_const(node); if (interval->reg->interval_start > reg) @@ -175,6 +175,17 @@ void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { + rb_tree_init(&interval->children); + interval->parent = NULL; + interval_insert(ctx, &ctx->intervals, interval); +} + +/* Call after ir3_reg_interval_remove_temp() to reinsert the interval */ +static void +ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval) +{ + interval->parent = NULL; interval_insert(ctx, &ctx->intervals, interval); } @@ -207,37 +218,63 @@ ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, interval->inserted = false; } +static void +_mark_free(struct ir3_reg_interval *interval) +{ + interval->inserted = false; + rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) { + _mark_free(child); + } +} + +/* Remove an interval and all its children from the tree. */ void ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { assert(!interval->parent); + ctx->interval_delete(ctx, interval); + rb_tree_remove(&ctx->intervals, &interval->node); + _mark_free(interval); +} + +/* Used when popping an interval to be shuffled around. Don't disturb children + * so that it can be later reinserted. + */ +static void +ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval) +{ + assert(!interval->parent); + ctx->interval_delete(ctx, interval); rb_tree_remove(&ctx->intervals, &interval->node); } static void -interval_dump(struct ir3_reg_interval *interval, unsigned indent) +interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval, + unsigned indent) { for (unsigned i = 0; i < indent; i++) - printf("\t"); - printf("reg %u start %u\n", interval->reg->name, - interval->reg->interval_start); + mesa_log_stream_printf(stream, "\t"); + mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name, + interval->reg->interval_start); rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) { - interval_dump(child, indent + 1); + interval_dump(stream, child, indent + 1); } for (unsigned i = 0; i < indent; i++) - printf("\t"); - printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end); + mesa_log_stream_printf(stream, "\t"); + mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name, + interval->reg->interval_end); } void -ir3_reg_interval_dump(struct ir3_reg_interval *interval) +ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval) { - interval_dump(interval, 0); + interval_dump(stream, interval, 0); } /* These are the core datastructures used by the register allocator. First @@ -301,12 +338,6 @@ struct ra_block_state { /* True if the block has been visited and "renames" is complete. */ bool visited; - - /* True if the block is unreachable via the logical CFG. This happens for - * blocks after an if where both sides end in a break/continue. 
We ignore - * it for everything but shared registers. - */ - bool logical_unreachable; }; struct ra_parallel_copy { @@ -328,8 +359,6 @@ struct ra_ctx { /* Shared regs. */ struct ra_file shared; - struct ir3 *ir; - struct ir3_liveness *live; struct ir3_block *block; @@ -509,8 +538,6 @@ ra_file_init(struct ra_file *file) BITSET_SET(file->available_to_evict, i); } - file->start = 0; - rb_tree_init(&file->reg_ctx.intervals); rb_tree_init(&file->physreg_intervals); @@ -548,6 +575,18 @@ ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval) interval->is_killed = true; } +static void +ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval) +{ + assert(!interval->interval.parent); + + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_CLEAR(file->available, i); + } + + interval->is_killed = false; +} + static physreg_t ra_interval_get_physreg(const struct ra_interval *interval) { @@ -577,45 +616,47 @@ ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) } static void -ra_interval_dump(struct ra_interval *interval) +ra_interval_dump(struct log_stream *stream, struct ra_interval *interval) { - printf("physreg %u ", interval->physreg_start); + mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start); - ir3_reg_interval_dump(&interval->interval); + ir3_reg_interval_dump(stream, &interval->interval); } static void -ra_file_dump(struct ra_file *file) +ra_file_dump(struct log_stream *stream, struct ra_file *file) { rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals, physreg_node) { - ra_interval_dump(interval); + ra_interval_dump(stream, interval); } unsigned start, end; - printf("available:\n"); + mesa_log_stream_printf(stream, "available:\n"); BITSET_FOREACH_RANGE (start, end, file->available, file->size) { - printf("%u-%u ", start, end); + mesa_log_stream_printf(stream, "%u-%u ", start, end); } - printf("\n"); + mesa_log_stream_printf(stream, "\n"); - printf("available to evict:\n"); + mesa_log_stream_printf(stream, "available to evict:\n"); BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) { - printf("%u-%u ", start, end); + mesa_log_stream_printf(stream, "%u-%u ", start, end); } - printf("\n"); - printf("start: %u\n", file->start); + mesa_log_stream_printf(stream, "\n"); + mesa_log_stream_printf(stream, "start: %u\n", file->start); } static void ra_ctx_dump(struct ra_ctx *ctx) { - printf("full:\n"); - ra_file_dump(&ctx->full); - printf("half:\n"); - ra_file_dump(&ctx->half); - printf("shared:\n"); - ra_file_dump(&ctx->shared); + struct log_stream *stream = mesa_log_streami(); + mesa_log_stream_printf(stream, "full:\n"); + ra_file_dump(stream, &ctx->full); + mesa_log_stream_printf(stream, "half:\n"); + ra_file_dump(stream, &ctx->half); + mesa_log_stream_printf(stream, "shared:"); + ra_file_dump(stream, &ctx->shared); + mesa_log_stream_destroy(stream); } static unsigned @@ -660,7 +701,7 @@ ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file, }); } - ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval); + ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval); return (struct ra_removed_interval){ .interval = interval, @@ -677,7 +718,7 @@ ra_push_interval(struct ra_ctx *ctx, struct ra_file *file, interval->physreg_start = dst; interval->physreg_end = dst + removed->size; - ir3_reg_interval_insert(&file->reg_ctx, &interval->interval); + ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval); } /* Pick up the interval and 
place it at "dst". */ @@ -716,8 +757,13 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file, memcpy(available_to_evict, file->available_to_evict, sizeof(available_to_evict)); - for (unsigned i = 0; i < reg_size(reg); i++) + BITSET_DECLARE(available, RA_MAX_FILE_SIZE); + memcpy(available, file->available, sizeof(available)); + + for (unsigned i = 0; i < reg_size(reg); i++) { BITSET_CLEAR(available_to_evict, physreg + i); + BITSET_CLEAR(available, physreg + i); + } unsigned eviction_count = 0; /* Iterate over each range conflicting with physreg */ @@ -760,6 +806,64 @@ try_evict_regs(struct ra_ctx *ctx, struct ra_file *file, } } + if (evicted) + continue; + + /* If we couldn't evict this range, we may be able to swap it with a + * killed range to achieve the same effect. + */ + foreach_interval (killed, file) { + if (!killed->is_killed) + continue; + + if (killed->physreg_end - killed->physreg_start != + conflicting->physreg_end - conflicting->physreg_start) + continue; + + /* We can't swap the killed range if it partially/fully overlaps the + * space we're trying to allocate or (in speculative mode) if it's + * already been swapped and will overlap when we actually evict. + */ + bool killed_available = true; + for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) { + if (!BITSET_TEST(available, i)) { + killed_available = false; + break; + } + } + + if (!killed_available) + continue; + + /* Check for alignment if one is a full reg */ + if ((!(killed->interval.reg->flags & IR3_REG_HALF) || + !(conflicting->interval.reg->flags & IR3_REG_HALF)) && + (killed->physreg_start % 2 != 0 || + conflicting->physreg_start % 2 != 0)) + continue; + + for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) { + BITSET_CLEAR(available, i); + } + /* Because this will generate swaps instead of moves, multiply the + * cost by 2. + */ + eviction_count += (killed->physreg_end - killed->physreg_start) * 2; + if (!speculative) { + physreg_t killed_start = killed->physreg_start, + conflicting_start = conflicting->physreg_start; + struct ra_removed_interval killed_removed = + ra_pop_interval(ctx, file, killed); + struct ra_removed_interval conflicting_removed = + ra_pop_interval(ctx, file, conflicting); + ra_push_interval(ctx, file, &killed_removed, conflicting_start); + ra_push_interval(ctx, file, &conflicting_removed, killed_start); + } + + evicted = true; + break; + } + if (!evicted) return false; } @@ -833,27 +937,23 @@ compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size, intervals_count = intervals_sz = 0; intervals = NULL; - unsigned removed_full_size = 0; - unsigned removed_half_size = 0; + unsigned removed_size = 0, removed_half_size = 0; unsigned file_size = align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size; physreg_t start_reg = 0; foreach_interval_rev_safe (interval, file) { - /* Check if we can sort the intervals *after* this one and have - * enough space leftover to accomodate "size" units. + /* Check if we can sort the intervals *after* this one and have enough + * space leftover to accommodate "size" units. Also check that we have + * enough space leftover for half-registers, if we're inserting a + * half-register (otherwise we only shift any half-registers down so they + * should be safe.
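+ *
+ * [Editor's worked example, numbers invented: file->size == 16, inserting
+ * size == 4 with align != 1. Walking intervals from the top, suppose the
+ * current interval ends at physreg 10 and the intervals already popped
+ * sum to removed_size == 2; then 10 + 4 + 2 <= 16 holds, the scan stops,
+ * and start_reg becomes 10.]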
*/ - if (align == 1) { - if (interval->physreg_end + removed_half_size <= file_size - size) { - start_reg = interval->physreg_end; - break; - } - } else { - if (interval->physreg_end + removed_half_size <= - file_size - removed_full_size - size) { - start_reg = interval->physreg_end; - break; - } + if (interval->physreg_end + size + removed_size <= file->size && + (align != 1 || + interval->physreg_end + size + removed_half_size <= file_size)) { + start_reg = interval->physreg_end; + break; } /* We assume that all frozen intervals are at the start and that we @@ -865,12 +965,11 @@ compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size, * overlap the register we're trying to add. */ if (!interval->is_killed && !is_source) { - if (interval->interval.reg->flags & IR3_REG_HALF) - removed_half_size += - interval->physreg_end - interval->physreg_start; - else - removed_full_size += - interval->physreg_end - interval->physreg_start; + removed_size += interval->physreg_end - interval->physreg_start; + if (interval->interval.reg->flags & IR3_REG_HALF) { + removed_half_size += interval->physreg_end - + interval->physreg_start; + } } /* Now that we've done the accounting, pop this off */ @@ -950,6 +1049,12 @@ static physreg_t find_best_gap(struct ra_file *file, unsigned file_size, unsigned size, unsigned align, bool is_source) { + /* This can happen if we create a very large merge set. Just bail out in that + * case. + */ + if (size > file_size) + return (physreg_t) ~0; + BITSET_WORD *available = is_source ? file->available_to_evict : file->available; @@ -1311,15 +1416,11 @@ handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) */ physreg_t dst_fixed = (physreg_t)~0u; - for (unsigned i = 0; i < instr->srcs_count; i++) { - if (!ra_reg_is_src(instr->srcs[i])) - continue; - - if (instr->srcs[i]->flags & IR3_REG_FIRST_KILL) { - mark_src_killed(ctx, instr->srcs[i]); + ra_foreach_src (src, instr) { + if (src->flags & IR3_REG_FIRST_KILL) { + mark_src_killed(ctx, src); } - struct ir3_register *src = instr->srcs[i]; struct ra_interval *interval = &ctx->intervals[src->def->name]; if (src->def->merge_set != dst_set || interval->is_killed) @@ -1347,11 +1448,7 @@ handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) allocate_dst(ctx, instr->dsts[0]); /* Remove the temporary is_killed we added */ - for (unsigned i = 0; i < instr->srcs_count; i++) { - if (!ra_reg_is_src(instr->srcs[i])) - continue; - - struct ir3_register *src = instr->srcs[i]; + ra_foreach_src (src, instr) { struct ra_interval *interval = &ctx->intervals[src->def->name]; while (interval->interval.parent != NULL) { interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); @@ -1359,8 +1456,9 @@ handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) /* Filter out cases where it actually should be killed */ if (interval != &ctx->intervals[src->def->name] || - !(src->flags & IR3_REG_KILL)) - interval->is_killed = false; + !(src->flags & IR3_REG_KILL)) { + ra_file_unmark_killed(ra_get_file(ctx, src), interval); + } } ra_foreach_src_rev (src, instr) { @@ -1538,8 +1636,7 @@ handle_live_in(struct ra_ctx *ctx, struct ir3_register *def) struct ir3_block *pred = ctx->block->predecessors[i]; struct ra_block_state *pred_state = &ctx->blocks[pred->index]; - if (!pred_state->visited || - (pred_state->logical_unreachable && !(def->flags & IR3_REG_SHARED))) + if (!pred_state->visited) continue; physreg = read_register(ctx, pred, def); @@ -1839,21 +1936,6 @@ handle_block(struct ra_ctx *ctx, struct 
ir3_block *block) ra_file_init(&ctx->half); ra_file_init(&ctx->shared); - bool unreachable = false; - if (block != ir3_start_block(ctx->ir)) { - unreachable = true; - for (unsigned i = 0; i < block->predecessors_count; i++) { - struct ra_block_state *pred_state = - &ctx->blocks[block->predecessors[i]->index]; - if (!pred_state->logical_unreachable) { - unreachable = false; - break; - } - } - } - - ctx->blocks[block->index].logical_unreachable = unreachable; - /* Handle live-ins, phis, and input meta-instructions. These all appear * live at the beginning of the block, and interfere with each other * therefore need to be allocated "in parallel". This means that we @@ -1878,8 +1960,6 @@ handle_block(struct ra_ctx *ctx, struct ir3_block *block) BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], ctx->live->definitions_count) { struct ir3_register *reg = ctx->live->definitions[name]; - if (unreachable && !(reg->flags & IR3_REG_SHARED)) - continue; handle_live_in(ctx, reg); } @@ -1903,7 +1983,7 @@ handle_block(struct ra_ctx *ctx, struct ir3_block *block) insert_live_in_moves(ctx); if (RA_DEBUG) { - printf("after live-in block %u:\n", block->index); + d("after live-in block %u:\n", block->index); ra_ctx_dump(ctx); } @@ -1911,10 +1991,7 @@ handle_block(struct ra_ctx *ctx, struct ir3_block *block) * block. */ foreach_instr (instr, &block->instr_list) { - if (RA_DEBUG) { - printf("processing: "); - ir3_print_instr(instr); - } + di(instr, "processing"); if (instr->opc == OPC_META_PHI) assign_phi(ctx, instr); @@ -1974,6 +2051,152 @@ calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) return (target - 1) * 2 * 4; } +static void +add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg, + bool merged_regs) +{ + unsigned size = reg_size(reg); + if (reg->flags & IR3_REG_HALF) + pressure->half += size; + if (!(reg->flags & IR3_REG_HALF) || merged_regs) + pressure->full += size; +} + +static void +dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ +} + +static void +dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent, + struct ir3_reg_interval *child) +{ +} + +/* Calculate the minimum possible limit on register pressure so that spilling + * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall. + */ + +static void +calc_min_limit_pressure(struct ir3_shader_variant *v, + struct ir3_liveness *live, + struct ir3_pressure *limit) +{ + struct ir3_block *start = ir3_start_block(v->ir); + struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx); + struct ir3_reg_interval *intervals = + rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count); + + ctx->interval_add = dummy_interval_add; + ctx->interval_delete = dummy_interval_delete; + ctx->interval_readd = dummy_interval_readd; + + limit->full = limit->half = 0; + + struct ir3_pressure cur_pressure = {0}; + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + add_pressure(&cur_pressure, input->dsts[0], v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + foreach_instr (input, &start->instr_list) { + if (input->opc != OPC_META_INPUT && + input->opc != OPC_META_TEX_PREFETCH) + break; + + /* pre-colored inputs may have holes, which increases the pressure. 
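+ * E.g. (hypothetical numbers): an input pinned at physreg 8 with
+ * reg_size 2 forces the limit up to at least 10 even if nothing lives
+ * in 0..7, since the hole below a pinned input can't be reclaimed here.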
*/ + struct ir3_register *dst = input->dsts[0]; + if (dst->num != INVALID_REG) { + unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst); + if (dst->flags & IR3_REG_HALF) + limit->half = MAX2(limit->half, physreg); + if (!(dst->flags & IR3_REG_HALF) || v->mergedregs) + limit->full = MAX2(limit->full, physreg); + } + } + + foreach_block (block, &v->ir->block_list) { + rb_tree_init(&ctx->intervals); + + unsigned name; + BITSET_FOREACH_SET (name, live->live_in[block->index], + live->definitions_count) { + struct ir3_register *reg = live->definitions[name]; + ir3_reg_interval_init(&intervals[reg->name], reg); + ir3_reg_interval_insert(ctx, &intervals[reg->name]); + } + + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + } + /* phis and parallel copies can be deleted via spilling */ + + if (instr->opc == OPC_META_PHI) { + ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]); + continue; + } + + if (instr->opc == OPC_META_PARALLEL_COPY) + continue; + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + if (dst->tied && !(dst->tied->flags & IR3_REG_KILL)) + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + ra_foreach_src_rev (src, instr) { + /* We currently don't support spilling the parent of a source when + * making space for sources, so we have to keep track of the + * intervals and figure out the root of the tree to figure out how + * much space we need. + * + * TODO: We should probably support this in the spiller. + */ + struct ir3_reg_interval *interval = &intervals[src->def->name]; + while (interval->parent) + interval = interval->parent; + add_pressure(&cur_pressure, interval->reg, v->mergedregs); + + if (src->flags & IR3_REG_FIRST_KILL) + ir3_reg_interval_remove(ctx, &intervals[src->def->name]); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + + cur_pressure = (struct ir3_pressure) {0}; + + ra_foreach_dst (dst, instr) { + ir3_reg_interval_init(&intervals[dst->name], dst); + ir3_reg_interval_insert(ctx, &intervals[dst->name]); + add_pressure(&cur_pressure, dst, v->mergedregs); + } + + limit->full = MAX2(limit->full, cur_pressure.full); + limit->half = MAX2(limit->half, cur_pressure.half); + } + } + + /* Account for the base register, which needs to be available everywhere. */ + limit->full += 2; + + ralloc_free(ctx); +} + int ir3_ra(struct ir3_shader_variant *v) { @@ -1981,7 +2204,13 @@ ir3_ra(struct ir3_shader_variant *v) ir3_create_parallel_copies(v->ir); - struct ir3_liveness *live = ir3_calc_liveness(v); + struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); + + ctx->merged_regs = v->mergedregs; + ctx->compiler = v->shader->compiler; + ctx->stage = v->type; + + struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir); ir3_debug_print(v->ir, "AFTER: create_parallel_copies"); @@ -1994,23 +2223,37 @@ ir3_ra(struct ir3_shader_variant *v) d("\thalf: %u", max_pressure.half); d("\tshared: %u", max_pressure.shared); - if (v->mergedregs) { - max_pressure.full += max_pressure.half; - max_pressure.half = 0; + /* TODO: calculate half/full limit correctly for CS with barrier */ + struct ir3_pressure limit_pressure; + limit_pressure.full = RA_FULL_SIZE; + limit_pressure.half = RA_HALF_SIZE; + limit_pressure.shared = RA_SHARED_SIZE; + + /* If requested, lower the limit so that spilling happens more often. 
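+ * (E.g. running a workload with IR3_SHADER_DEBUG=spillall drops the
+ * limit from the full register file to the minimum computed above, so
+ * nearly every spill/reload path gets exercised in testing.)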
*/ + if (ir3_shader_debug & IR3_DBG_SPILLALL) + calc_min_limit_pressure(v, live, &limit_pressure); + + if (max_pressure.shared > limit_pressure.shared) { + /* TODO shared reg -> normal reg spilling */ + d("shared max pressure exceeded!"); + goto fail; } - if (max_pressure.full > RA_FULL_SIZE || max_pressure.half > RA_HALF_SIZE || - max_pressure.shared > RA_SHARED_SIZE) { - d("max pressure exceeded!"); - return 1; + bool spilled = false; + if (max_pressure.full > limit_pressure.full || + max_pressure.half > limit_pressure.half) { + if (!v->shader->compiler->has_pvtmem) { + d("max pressure exceeded!"); + goto fail; + } + d("max pressure exceeded, spilling!"); + IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure); + ir3_calc_pressure(v, live, &max_pressure); + assert(max_pressure.full <= limit_pressure.full && + max_pressure.half <= limit_pressure.half); + spilled = true; } - struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); - - ctx->ir = v->ir; - ctx->merged_regs = v->mergedregs; - ctx->compiler = v->shader->compiler; - ctx->stage = v->type; ctx->live = live; ctx->intervals = rzalloc_array(ctx, struct ra_interval, live->definitions_count); @@ -2024,6 +2267,8 @@ ir3_ra(struct ir3_shader_variant *v) ctx->shared.size = RA_SHARED_SIZE; + ctx->full.start = ctx->half.start = ctx->shared.start = 0; + foreach_block (block, &v->ir->block_list) handle_block(ctx, block); @@ -2039,19 +2284,20 @@ ir3_ra(struct ir3_shader_variant *v) for (unsigned i = 0; i < instr->dsts_count; i++) { instr->dsts[i]->flags &= ~IR3_REG_SSA; - /* Parallel copies of array registers copy the whole register, - * and we need some way to let the parallel copy code know - * that this was an array whose size is determined by - * reg->size. So keep the array flag on those. + /* Parallel copies of array registers copy the whole register, and + * we need some way to let the parallel copy code know that this was + * an array whose size is determined by reg->size. So keep the array + * flag on those. spill/reload also need to work on the entire + * array. */ - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO) instr->dsts[i]->flags &= ~IR3_REG_ARRAY; } for (unsigned i = 0; i < instr->srcs_count; i++) { instr->srcs[i]->flags &= ~IR3_REG_SSA; - if (!is_meta(instr)) + if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO) instr->srcs[i]->flags &= ~IR3_REG_ARRAY; } } @@ -2059,11 +2305,18 @@ ir3_ra(struct ir3_shader_variant *v) ir3_debug_print(v->ir, "AFTER: register allocation"); + if (spilled) { + IR3_PASS(v->ir, ir3_lower_spill); + } + ir3_lower_copies(v); ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); ralloc_free(ctx); - ralloc_free(live); + return 0; +fail: + ralloc_free(ctx); + return -1; } diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_ra.h b/mesa 3D driver/src/freedreno/ir3/ir3_ra.h index 98533a38ae..259341eaac 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_ra.h +++ b/mesa 3D driver/src/freedreno/ir3/ir3_ra.h @@ -36,15 +36,17 @@ #define d(fmt, ...) \ do { \ if (RA_DEBUG) { \ - printf("RA: " fmt "\n", ##__VA_ARGS__); \ + mesa_logi("RA: " fmt, ##__VA_ARGS__); \ } \ } while (0) #define di(instr, fmt, ...) 
\ do { \ if (RA_DEBUG) { \ - printf("RA: " fmt ": ", ##__VA_ARGS__); \ - ir3_print_instr(instr); \ + struct log_stream *stream = mesa_log_streami(); \ + mesa_log_stream_printf(stream, "RA: " fmt ": ", ##__VA_ARGS__); \ + ir3_print_instr_stream(stream, instr); \ + mesa_log_stream_destroy(stream); \ } \ } while (0) @@ -106,14 +108,14 @@ ra_reg_is_dst(const struct ir3_register *reg) /* Iterators for sources and destinations which: * - Don't include fake sources (irrelevant for RA) * - Don't include non-SSA sources (immediates and constants, also irrelevant) - * - Consider array destinations as both a source and a destination */ +#define ra_foreach_src_n(__srcreg, __n, __instr) \ + foreach_src_n(__srcreg, __n, __instr) \ + if (ra_reg_is_src(__srcreg)) + #define ra_foreach_src(__srcreg, __instr) \ - for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ - for (unsigned __cnt = (__instr)->srcs_count, __i = 0; __i < __cnt; \ - __i++) \ - if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i]))) + ra_foreach_src_n(__srcreg, __i, __instr) #define ra_foreach_src_rev(__srcreg, __instr) \ for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ @@ -121,11 +123,12 @@ ra_reg_is_dst(const struct ir3_register *reg) __i--) \ if (ra_reg_is_src((__srcreg = (__instr)->srcs[__i]))) +#define ra_foreach_dst_n(__dstreg, __n, __instr) \ + foreach_dst_n(__dstreg, __n, __instr) \ + if (ra_reg_is_dst(__dstreg)) + #define ra_foreach_dst(__dstreg, __instr) \ - for (struct ir3_register *__dstreg = (void *)~0; __dstreg; __dstreg = NULL) \ - for (unsigned __cnt = (__instr)->dsts_count, __i = 0; __i < __cnt; \ - __i++) \ - if (ra_reg_is_dst((__dstreg = (__instr)->dsts[__i]))) + ra_foreach_dst_n(__dstreg, __i, __instr) #define RA_HALF_SIZE (4 * 48) #define RA_FULL_SIZE (4 * 48 * 2) @@ -134,12 +137,13 @@ ra_reg_is_dst(const struct ir3_register *reg) struct ir3_liveness { unsigned block_count; + unsigned interval_offset; DECLARE_ARRAY(struct ir3_register *, definitions); DECLARE_ARRAY(BITSET_WORD *, live_out); DECLARE_ARRAY(BITSET_WORD *, live_in); }; -struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v); +struct ir3_liveness *ir3_calc_liveness(void *mem_ctx, struct ir3 *ir); bool ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def, struct ir3_instruction *instr); @@ -148,6 +152,9 @@ void ir3_create_parallel_copies(struct ir3 *ir); void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir); +void ir3_force_merge(struct ir3_register *a, struct ir3_register *b, + int b_offset); + struct ir3_pressure { unsigned full, half, shared; }; @@ -155,6 +162,12 @@ struct ir3_pressure { void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, struct ir3_pressure *max_pressure); +bool ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, + struct ir3_liveness **live, + const struct ir3_pressure *limit_pressure); + +bool ir3_lower_spill(struct ir3 *ir); + void ir3_ra_validate(struct ir3_shader_variant *v, unsigned full_size, unsigned half_size, unsigned block_count); @@ -256,7 +269,8 @@ ir3_reg_interval_init(struct ir3_reg_interval *interval, interval->inserted = false; } -void ir3_reg_interval_dump(struct ir3_reg_interval *interval); +void ir3_reg_interval_dump(struct log_stream *stream, + struct ir3_reg_interval *interval); void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval); diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_remove_unreachable.c b/mesa 3D
driver/src/freedreno/ir3/ir3_remove_unreachable.c new file mode 100644 index 0000000000..c598d9f2e5 --- /dev/null +++ b/mesa 3D driver/src/freedreno/ir3/ir3_remove_unreachable.c @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3.h" + +/* Sometimes we can get unreachable blocks from NIR. In particular this happens + * for blocks after an if where both sides end in a break/continue. These blocks + * are then reachable only via the physical CFG. This pass deletes these blocks + * and reroutes the physical edges around them. + */ + +static void +delete_block(struct ir3 *ir, struct ir3_block *block) +{ + struct ir3_instruction *end = NULL; + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_END) { + end = instr; + break; + } + } + + /* The end block can be legitimately unreachable if the shader only exits via + * discarding. ir3_legalize will then insert a branch to the end. Keep the + * block around but delete all the other instructions and make the end not + * take any sources, so that we don't have any dangling references to other + * unreachable blocks. + */ + if (end) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr != end) + list_delinit(&instr->node); + } + end->srcs_count = 0; + return; + } + + for (unsigned i = 0; i < 2; i++) { + struct ir3_block *succ = block->successors[i]; + if (!succ) + continue; + + unsigned pred_idx = ir3_block_get_pred_index(succ, block); + + /* If this isn't the last predecessor, we swap it with the last before + * removing it. + */ + bool swap_pred = pred_idx != succ->predecessors_count - 1; + + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + + if (swap_pred) + phi->srcs[pred_idx] = phi->srcs[phi->srcs_count - 1]; + phi->srcs_count--; + } + if (swap_pred) { + succ->predecessors[pred_idx] = + succ->predecessors[succ->predecessors_count - 1]; + } + succ->predecessors_count--; + } + + for (unsigned i = 0; i < 2; i++) { + struct ir3_block *succ = block->physical_successors[i]; + if (!succ) + continue; + + ir3_block_remove_physical_predecessor(succ, block); + } + + if (block->physical_predecessors_count != 0) { + /* There should be only one physical predecessor, for the fallthrough + * edge.
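+ * That is, for a CFG fragment like (sketch):
+ *
+ *    pred --(fallthrough)--> block --> next
+ *
+ * the code below splices the physical edge to pred --> next so that no
+ * physical edge keeps pointing at the deleted block.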
+ */ + assert(block->physical_predecessors_count == 1); + struct ir3_block *pred = block->physical_predecessors[0]; + assert(block->node.next != &ir->block_list); + struct ir3_block *next = LIST_ENTRY(struct ir3_block, block->node.next, node); + if (pred->physical_successors[1] == block) + pred->physical_successors[1] = next; + else + pred->physical_successors[0] = next; + ir3_block_add_physical_predecessor(next, pred); + } +} + +bool +ir3_remove_unreachable(struct ir3 *ir) +{ + bool progress = false; + foreach_block_safe (block, &ir->block_list) { + if (block != ir3_start_block(ir) && block->predecessors_count == 0) { + delete_block(ir, block); + list_del(&block->node); + progress = true; + } + } + + return progress; +} diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_sched.c b/mesa 3D driver/src/freedreno/ir3/ir3_sched.c index 12ef695482..c689374711 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_sched.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_sched.c @@ -38,15 +38,17 @@ #define d(fmt, ...) \ do { \ if (SCHED_DEBUG) { \ - printf("SCHED: " fmt "\n", ##__VA_ARGS__); \ + mesa_logi("SCHED: " fmt, ##__VA_ARGS__); \ } \ } while (0) #define di(instr, fmt, ...) \ do { \ if (SCHED_DEBUG) { \ - printf("SCHED: " fmt ": ", ##__VA_ARGS__); \ - ir3_print_instr(instr); \ + struct log_stream *stream = mesa_log_streami(); \ + mesa_log_stream_printf(stream, "SCHED: " fmt ": ", ##__VA_ARGS__); \ + ir3_print_instr_stream(stream, instr); \ + mesa_log_stream_destroy(stream); \ } \ } while (0) diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_shader.c b/mesa 3D driver/src/freedreno/ir3/ir3_shader.c index c6a7986fce..f2b5afbb0d 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_shader.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_shader.c @@ -183,7 +183,8 @@ ir3_shader_assemble(struct ir3_shader_variant *v) * index. 
*/ v->pvtmem_per_wave = compiler->gen >= 6 && !info->multi_dword_ldp_stp && - v->type == MESA_SHADER_COMPUTE; + ((v->type == MESA_SHADER_COMPUTE) || + (v->type == MESA_SHADER_KERNEL)); fixup_regfootprint(v); diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_shader.h b/mesa 3D driver/src/freedreno/ir3/ir3_shader.h index 3337b6252e..4a4fd663e3 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_shader.h +++ b/mesa 3D driver/src/freedreno/ir3/ir3_shader.h @@ -45,6 +45,7 @@ enum ir3_driver_param { IR3_DP_NUM_WORK_GROUPS_X = 0, IR3_DP_NUM_WORK_GROUPS_Y = 1, IR3_DP_NUM_WORK_GROUPS_Z = 2, + IR3_DP_WORK_DIM = 3, IR3_DP_BASE_GROUP_X = 4, IR3_DP_BASE_GROUP_Y = 5, IR3_DP_BASE_GROUP_Z = 6, @@ -145,12 +146,14 @@ struct ir3_ubo_analysis_state { * user consts * UBO addresses * SSBO sizes + * image dimensions * if (vertex shader) { - * driver params (IR3_DP_*) + * driver params (IR3_DP_VS_COUNT) * if (stream_output.num_outputs > 0) * stream-out addresses * } else if (compute_shader) { - * driver params (IR3_DP_*) + * kernel params + * driver params (IR3_DP_CS_COUNT) * } * immediates * @@ -170,6 +173,7 @@ struct ir3_const_state { /* user const start at zero */ unsigned ubo; unsigned image_dims; + unsigned kernel_params; unsigned driver_param; unsigned tfbo; unsigned primitive_param; @@ -695,6 +699,7 @@ ir3_shader_stage(struct ir3_shader_variant *v) case MESA_SHADER_FRAGMENT: return "FRAG"; case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: return "CL"; default: unreachable("invalid type"); @@ -738,6 +743,15 @@ struct ir3_shader { struct nir_shader *nir; struct ir3_stream_output_info stream_output; + /* per shader stage specific info: */ + union { + /* for compute shaders: */ + struct { + unsigned req_input_mem; /* in dwords */ + unsigned req_local_mem; + } cs; + }; + struct ir3_shader_variant *variants; mtx_t variants_lock; @@ -770,7 +784,8 @@ ir3_max_const(const struct ir3_shader_variant *v) { const struct ir3_compiler *compiler = v->shader->compiler; - if (v->shader->type == MESA_SHADER_COMPUTE) { + if ((v->shader->type == MESA_SHADER_COMPUTE) || + (v->shader->type == MESA_SHADER_KERNEL)) { return compiler->max_const_compute; } else if (v->key.safe_constlen) { return compiler->max_const_safe; diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_spill.c b/mesa 3D driver/src/freedreno/ir3/ir3_spill.c index 2b7ff356ff..38627f4db0 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_spill.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_spill.c @@ -26,62 +26,318 @@ #include "ir3_shader.h" /* - * This pass does one thing so far: + * This pass does two things: * * 1. Calculates the maximum register pressure. To do this, we need to use the - * exact same technique that RA uses for combining meta_split instructions - * with their sources, so that our calculation agrees with RA. - * - * It will also optionally spill registers once that's implemented. + * exact same technique that RA uses for combining meta_split instructions + * with their sources, so that our calculation agrees with RA. + * 2. Spills when the register pressure exceeds a limit calculated by RA. + * The implementation is based on "Register Spilling and Live-Range Splitting + * for SSA-Form Programs" by Braun and Hack, although again care has to be + * taken to handle combining split/collect instructions.
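+ *
+ * Per instruction the flow is roughly (illustrative pseudo-code, not the
+ * actual helper names):
+ *
+ *    mark srcs/dsts cant_spill
+ *    while pressure > limit:
+ *       spill the spillable live interval with the farthest next use
+ *    reload any src that is not currently live
+ *    clear cant_spill
+ *
+ * Picking the farthest next use is Belady's MIN heuristic, which the
+ * paper adapts to SSA form.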
*/ +struct reg_or_immed { + unsigned flags; + union { + struct ir3_register *def; + uint32_t uimm; + unsigned const_num; + }; +}; + struct ra_spill_interval { struct ir3_reg_interval interval; + + struct rb_node node; + struct rb_node half_node; + + /* The current SSA value/const/immed this source is mapped to. */ + struct reg_or_immed dst; + + /* When computing use distances we use the distance relative to the start + * of the block. So, for example, a value that's defined in cycle 5 of the + * block and used 6 cycles later will always have a next_use_distance of 11 + * until we reach that use. + */ + unsigned next_use_distance; + + /* Whether this value was reloaded and therefore doesn't need to be + * spilled again. Corresponds to the S set in the paper. + */ + bool already_spilled; + + /* We need to add sources early for accounting purposes, but we have to + * insert the reload code for them last. Keep track of whether this interval + * needs to be reloaded later. + */ + bool needs_reload; + + /* Keep track of whether this interval currently can't be spilled because: + * - It or one of its children is a source and we're making space for + * sources. + * - It is a destination and we're making space for destinations. + */ + bool cant_spill; +}; + +struct ra_spill_block_state { + unsigned *next_use_end; + unsigned *next_use_start; + + unsigned cycles; + + /* Map from SSA def to reg_or_immed it is mapped to at the end of the block. + * This map only contains values which we didn't spill, so it also serves as + * a record of the new live-out set for this block. + */ + struct hash_table *remap; + + /* For blocks whose successors are visited first (i.e. loop backedges), which + * values should be live at the end. + */ + BITSET_WORD *live_out; + + bool visited; }; struct ra_spill_ctx { struct ir3_reg_ctx reg_ctx; - struct ra_spill_interval *intervals; + struct ra_spill_interval **intervals; + unsigned intervals_count; + + /* rb tree of live intervals that we can spill, ordered by next-use distance. + * full_live_intervals contains the full+half intervals in the merged_regs + * case. We use this list to determine what to spill. + */ + struct rb_tree full_live_intervals; + struct rb_tree half_live_intervals; struct ir3_pressure cur_pressure, max_pressure; + struct ir3_pressure limit_pressure; + + /* When spilling, we need to reserve a register to serve as the zero'd + * "base". For simplicity we reserve a register at the beginning so that it's + * always available. + */ + struct ir3_register *base_reg; + + /* Current pvtmem offset in bytes. */ + unsigned spill_slot; + struct ir3_liveness *live; const struct ir3_compiler *compiler; + + struct ra_spill_block_state *blocks; + + bool spilling; + + bool merged_regs; }; +static void +add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + struct ir3_block *start = ir3_start_block(ir); + + /* We need to stick it after any meta instructions which need to be first. */ + struct ir3_instruction *after = NULL; + foreach_instr (instr, &start->instr_list) { + if (instr->opc != OPC_META_INPUT && + instr->opc != OPC_META_TEX_PREFETCH) { + after = instr; + break; + } + } + + struct ir3_instruction *mov = create_immed(start, 0); + + if (after) + ir3_instr_move_before(mov, after); + + ctx->base_reg = mov->dsts[0]; + + /* We don't create an interval, etc. for the base reg, so just lower the + * register pressure limit to account for it. We assume it's always + * available for simplicity.
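+ * (The base is simply the mov of immediate 0 created above, placed after
+ * the OPC_META_INPUT/OPC_META_TEX_PREFETCH group; every SPILL_MACRO and
+ * RELOAD_MACRO takes it as its first source when addressing pvtmem.)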
+ */ + ctx->limit_pressure.full -= reg_size(ctx->base_reg); +} + + +/* Compute the number of cycles per instruction used for next-use-distance + * analysis. This is just approximate, obviously. + */ +static unsigned +instr_cycles(struct ir3_instruction *instr) +{ + if (instr->opc == OPC_META_PARALLEL_COPY) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->dsts_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) { + cycles += reg_elems(instr->srcs[i]); + } + } + + return cycles; + } + + if (instr->opc == OPC_META_COLLECT) { + unsigned cycles = 0; + for (unsigned i = 0; i < instr->srcs_count; i++) { + if (!instr->srcs[i]->def || + instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) { + cycles++; + } + } + + return cycles; + } + + if (is_meta(instr)) + return 0; + + return 1 + instr->repeat; +} + +static bool +compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block, + unsigned *tmp_next_use) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + memcpy(tmp_next_use, state->next_use_end, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + unsigned cycle = state->cycles; + foreach_instr_rev (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + dst->next_use = tmp_next_use[dst->name]; + } + + ra_foreach_src (src, instr) { + src->next_use = tmp_next_use[src->def->name]; + } + + cycle -= instr_cycles(instr); + + if (instr->opc == OPC_META_PARALLEL_COPY) { + ra_foreach_src_n (src, i, instr) { + if (src->def->merge_set == instr->dsts[i]->merge_set && + src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) { + tmp_next_use[src->def->name] = + tmp_next_use[instr->dsts[i]->name]; + } else { + tmp_next_use[src->def->name] = cycle; + } + } + } else if (instr->opc != OPC_META_PHI) { + ra_foreach_src (src, instr) { + tmp_next_use[src->def->name] = cycle; + } + } + + ra_foreach_dst (dst, instr) { + tmp_next_use[dst->name] = UINT_MAX; + } + } + + memcpy(state->next_use_start, tmp_next_use, + ctx->live->definitions_count * sizeof(*tmp_next_use)); + + bool progress = false; + for (unsigned i = 0; i < block->predecessors_count; i++) { + const struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index]; + + /* Add a large-enough distance in front of edges exiting the loop so that + * variables that are live-through the loop but not used inside it are + * prioritized for spilling, as per the paper. This just needs to be + * larger than the longest path through the loop. + */ + bool loop_exit = pred->loop_depth < block->loop_depth; + unsigned block_distance = pred_state->cycles + (loop_exit ? 
100000 : 0); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + if (state->next_use_start[j] < UINT_MAX && + state->next_use_start[j] + block_distance < + pred_state->next_use_end[j]) { + pred_state->next_use_end[j] = state->next_use_start[j] + + block_distance; + progress = true; + } + } + + foreach_instr (phi, &block->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if (!phi->srcs[i]->def) + continue; + unsigned src = phi->srcs[i]->def->name; + if (phi->dsts[0]->next_use < UINT_MAX && + phi->dsts[0]->next_use + block_distance < + pred_state->next_use_end[src]) { + pred_state->next_use_end[src] = phi->dsts[0]->next_use + + block_distance; + progress = true; + } + } + } + + return progress; +} + +static void +compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir) +{ + for (unsigned i = 0; i < ctx->live->block_count; i++) { + ctx->blocks[i].next_use_start = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + ctx->blocks[i].next_use_end = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + for (unsigned j = 0; j < ctx->live->definitions_count; j++) { + ctx->blocks[i].next_use_start[j] = UINT_MAX; + ctx->blocks[i].next_use_end[j] = UINT_MAX; + } + } + + foreach_block (block, &ir->block_list) { + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + state->cycles = 0; + foreach_instr (instr, &block->instr_list) { + state->cycles += instr_cycles(instr); + foreach_dst (dst, instr) { + dst->spill_slot = ~0; + } + } + } + + unsigned *tmp_next_use = + ralloc_array(ctx, unsigned, ctx->live->definitions_count); + + bool progress = true; + while (progress) { + progress = false; + foreach_block_rev (block, &ir->block_list) { + progress |= compute_block_next_distance(ctx, block, tmp_next_use); + } + } +} + static void ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg) { ir3_reg_interval_init(&interval->interval, reg); -} - -static void -ra_pressure_add(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared += size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half += size; - else - pressure->full += size; -} - -static void -ra_pressure_sub(struct ir3_pressure *pressure, - struct ra_spill_interval *interval) -{ - unsigned size = reg_size(interval->interval.reg); - if (interval->interval.reg->flags & IR3_REG_SHARED) - pressure->shared -= size; - else if (interval->interval.reg->flags & IR3_REG_HALF) - pressure->half -= size; - else - pressure->full -= size; + interval->dst.flags = reg->flags; + interval->dst.def = reg; + interval->already_spilled = false; + interval->needs_reload = false; + interval->cant_spill = false; } static struct ra_spill_interval * @@ -90,19 +346,66 @@ ir3_reg_interval_to_interval(struct ir3_reg_interval *interval) return rb_node_data(struct ra_spill_interval, interval, interval); } +static struct ra_spill_interval * +ra_spill_interval_root(struct ra_spill_interval *interval) +{ + struct ir3_reg_interval *ir3_interval = &interval->interval; + while (ir3_interval->parent) + ir3_interval = ir3_interval->parent; + return ir3_reg_interval_to_interval(ir3_interval); +} + static struct ra_spill_ctx * ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) { return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx); } +static int +ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct 
ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, node); + return a->next_use_distance - b->next_use_distance; +} + +static int +ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_spill_interval *a = + rb_node_data(const struct ra_spill_interval, _a, half_node); + const struct ra_spill_interval *b = + rb_node_data(const struct ra_spill_interval, _b, half_node); + return a->next_use_distance - b->next_use_distance; +} + static void interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) { struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_add(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared += size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full += size; + if (ctx->spilling) { + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } + } } static void @@ -111,7 +414,23 @@ interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); - ra_pressure_sub(&ctx->cur_pressure, interval); + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) { + ctx->cur_pressure.shared -= size; + } else { + if (interval->interval.reg->flags & IR3_REG_HALF) { + ctx->cur_pressure.half -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + } + } + if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) { + ctx->cur_pressure.full -= size; + if (ctx->spilling) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + } + } + } } static void @@ -122,8 +441,22 @@ interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent, } static void -spill_ctx_init(struct ra_spill_ctx *ctx) +spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v, + struct ir3_liveness *live) { + ctx->live = live; + ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *, + ctx->live->definitions_count); + struct ra_spill_interval *intervals = + rzalloc_array(ctx, struct ra_spill_interval, + ctx->live->definitions_count); + for (unsigned i = 0; i < ctx->live->definitions_count; i++) + ctx->intervals[i] = &intervals[i]; + + ctx->intervals_count = ctx->live->definitions_count; + ctx->compiler = v->shader->compiler; + ctx->merged_regs = v->mergedregs; + rb_tree_init(&ctx->reg_ctx.intervals); ctx->reg_ctx.interval_add = interval_add; ctx->reg_ctx.interval_delete = interval_delete; @@ -147,18 +480,21 @@ ra_spill_ctx_remove(struct ra_spill_ctx *ctx, static void init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; ra_spill_interval_init(interval, dst); + if (ctx->spilling) + interval->next_use_distance = 
dst->next_use; } static void insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (interval->interval.inserted) return; ra_spill_ctx_insert(ctx, interval); + interval->cant_spill = true; /* For precolored inputs, make sure we leave enough registers to allow for * holes in the inputs. It can happen that the binning shader has a lower @@ -179,14 +515,26 @@ insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) } } +static void +insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + ra_spill_interval_root(interval)->cant_spill = true; + + if (interval->interval.inserted) + return; + + ra_spill_ctx_insert(ctx, interval); + interval->needs_reload = true; + interval->already_spilled = true; +} + static void remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted || interval->interval.parent || !rb_tree_is_empty(&interval->interval.children)) @@ -199,10 +547,7 @@ static void remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - if (!(src->flags & IR3_REG_FIRST_KILL)) - return; - - struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; if (!interval->interval.inserted) return; @@ -210,10 +555,17 @@ remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, ra_spill_ctx_remove(ctx, interval); } +static void +finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = ctx->intervals[dst->name]; + interval->cant_spill = false; +} + static void remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) { - struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + struct ra_spill_interval *interval = ctx->intervals[dst->name]; if (!interval->interval.inserted) return; @@ -221,6 +573,361 @@ remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) ra_spill_ctx_remove(ctx, interval); } +static void +update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + assert(interval->interval.inserted); + + interval->next_use_distance = src->next_use; + + /* If this node is inserted in one of the trees, then it needs to be resorted + * as its key has changed. 
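+ * (There is no in-place key update for the rb tree, so "resorted" below
+ * just means remove + re-insert with the new next_use_distance.)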
+ */ + if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) { + if (src->flags & IR3_REG_HALF) { + rb_tree_remove(&ctx->half_live_intervals, &interval->half_node); + rb_tree_insert(&ctx->half_live_intervals, &interval->half_node, + ra_spill_interval_half_cmp); + } + if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) { + rb_tree_remove(&ctx->full_live_intervals, &interval->node); + rb_tree_insert(&ctx->full_live_intervals, &interval->node, + ra_spill_interval_cmp); + } + } +} + +static unsigned +get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg) +{ + if (reg->merge_set) { + if (reg->merge_set->spill_slot == ~0) { + reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot, + reg->merge_set->alignment); + ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2; + } + return reg->merge_set->spill_slot + reg->merge_set_offset * 2; + } else { + if (reg->spill_slot == ~0) { + reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg)); + ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2; + } + return reg->spill_slot; + } +} + +static void +set_src_val(struct ir3_register *src, const struct reg_or_immed *val) +{ + if (val->flags & IR3_REG_IMMED) { + src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF); + src->uim_val = val->uimm; + src->def = NULL; + } else if (val->flags & IR3_REG_CONST) { + src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF); + src->num = val->const_num; + src->def = NULL; + } else { + src->def = val->def; + } +} + +static struct ir3_register * +materialize_pcopy_src(const struct reg_or_immed *src, + struct ir3_instruction *instr, + struct ir3_block *block) +{ + struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1); + struct ir3_register *dst = __ssa_dst(mov); + dst->flags |= src->flags & IR3_REG_HALF; + struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags); + set_src_val(mov_src, src); + mov->cat1.src_type = mov->cat1.dst_type = + (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + if (instr) + ir3_instr_move_before(mov, instr); + return dst; +} + +static void +spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val, + unsigned spill_slot, struct ir3_instruction *instr, struct ir3_block *block) +{ + struct ir3_register *reg; + + /* If spilling an immed/const pcopy src, we need to actually materialize it + * first with a mov. + */ + if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) { + reg = materialize_pcopy_src(val, instr, block); + } else { + reg = val->def; + } + + d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name, + spill_slot); + + unsigned elems = reg_elems(reg); + struct ir3_instruction *spill = + ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3); + ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg; + unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED | + IR3_REG_CONST | IR3_REG_SSA | + IR3_REG_ARRAY); + struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags); + ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems; + spill->cat6.dst_offset = spill_slot; + spill->cat6.type = (reg->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + + src->def = reg; + if (reg->flags & IR3_REG_ARRAY) { + src->size = reg->size; + src->array.id = reg->array.id; + src->array.offset = 0; + } else { + src->wrmask = reg->wrmask; + } + + if (instr) + ir3_instr_move_before(spill, instr); +} + +static void +spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval, + struct ir3_instruction *instr, struct ir3_block *block) +{ + spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg), + instr, block); +} + +/* This is similar to "limit" in the paper. */ +static void +limit(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + if (ctx->cur_pressure.half > ctx->limit_pressure.half) { + d("cur half pressure %u exceeds %u", ctx->cur_pressure.half, + ctx->limit_pressure.half); + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->half_live_intervals, half_node) { + d("trying ssa_%u:%u", interval->interval.reg->instr->serialno, + interval->interval.reg->name); + if (!interval->cant_spill) { + if (!interval->already_spilled) + spill_interval(ctx, interval, instr, instr->block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.half <= ctx->limit_pressure.half) + break; + } + } + + assert(ctx->cur_pressure.half <= ctx->limit_pressure.half); + } + + if (ctx->cur_pressure.full > ctx->limit_pressure.full) { + d("cur full pressure %u exceeds %u", ctx->cur_pressure.full, + ctx->limit_pressure.full); + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->full_live_intervals, node) { + d("trying ssa_%u:%u", interval->interval.reg->instr->serialno, + interval->interval.reg->name); + if (!interval->cant_spill) { + if (!interval->already_spilled) + spill_interval(ctx, interval, instr, instr->block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.full <= ctx->limit_pressure.full) + break; + } else { + d("can't spill"); + } + } + + assert(ctx->cur_pressure.full <= ctx->limit_pressure.full); + } +} + +/* There's a corner case where we reload a value which has overlapping live + * values already reloaded, either because it's the child of some other interval + * that was already reloaded or some of its children have already been + * reloaded. Because RA only expects overlapping source/dest intervals for meta + * instructions (split/collect), and we don't want to add register pressure by + * creating an entirely separate value, we need to add splits and collects to + * deal with this case. These splits/collects have to also have correct merge + * set information, so that it doesn't result in any actual code or register + * pressure in practice. 
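+ *
+ * E.g. (hypothetical case): if a vec4 is spilled, one vec2 child of it
+ * is reloaded on its own, and then the parent is needed again, we reload
+ * the vec4 and rebuild the child with splits that stay in the same merge
+ * set, so RA can coalesce them to nothing.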
+ */ + +static void +add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def, + unsigned offset) +{ + def->merge_set = set; + def->merge_set_offset = offset; + def->interval_start = set->interval_start + offset; + def->interval_end = set->interval_start + offset + reg_size(def); +} + +static struct ir3_register * +split(struct ir3_register *def, unsigned offset, + struct ir3_instruction *after, struct ir3_block *block) +{ + if (reg_elems(def) == 1) { + assert(offset == 0); + return def; + } + + assert(!(def->flags & IR3_REG_ARRAY)); + assert(def->merge_set); + /* Create in "block" rather than after->block: "after" may be NULL when + * splitting for live-ins at the end of a predecessor. + */ + struct ir3_instruction *split = + ir3_instr_create(block, OPC_META_SPLIT, 1, 1); + struct ir3_register *dst = __ssa_dst(split); + dst->flags |= def->flags & IR3_REG_HALF; + struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags); + src->wrmask = def->wrmask; + src->def = def; + add_to_merge_set(def->merge_set, dst, + def->merge_set_offset + offset * reg_elem_size(def)); + if (after) + ir3_instr_move_before(split, after); + return dst; +} + +static struct ir3_register * +extract(struct ir3_register *parent_def, unsigned offset, unsigned elems, + struct ir3_instruction *after, struct ir3_block *block) +{ + if (offset == 0 && elems == reg_elems(parent_def)) + return parent_def; + + struct ir3_instruction *collect = + ir3_instr_create(block, OPC_META_COLLECT, 1, elems); + struct ir3_register *dst = __ssa_dst(collect); + dst->flags |= parent_def->flags & IR3_REG_HALF; + dst->wrmask = MASK(elems); + add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset); + + for (unsigned i = 0; i < elems; i++) { + ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = + split(parent_def, offset + i, after, block); + } + + if (after) + ir3_instr_move_before(collect, after); + return dst; +} + +static struct ir3_register * +reload(struct ra_spill_ctx *ctx, struct ir3_register *reg, + struct ir3_instruction *after, struct ir3_block *block) +{ + unsigned spill_slot = get_spill_slot(ctx, reg); + + d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name, + spill_slot); + + unsigned elems = reg_elems(reg); + struct ir3_instruction *reload = + ir3_instr_create(block, OPC_RELOAD_MACRO, 1, 3); + struct ir3_register *dst = __ssa_dst(reload); + dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY); + ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg; + struct ir3_register *offset_reg = + ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED); + offset_reg->uim_val = spill_slot; + ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems; + reload->cat6.type = (reg->flags & IR3_REG_HALF) ?
TYPE_U16 : TYPE_U32; + + if (reg->flags & IR3_REG_ARRAY) { + dst->array.offset = 0; + dst->array.id = reg->array.id; + dst->size = reg->size; + } else { + dst->wrmask = MASK(elems); + } + + dst->merge_set = reg->merge_set; + dst->merge_set_offset = reg->merge_set_offset; + dst->interval_start = reg->interval_start; + dst->interval_end = reg->interval_end; + + if (after) + ir3_instr_move_before(reload, after); + + return dst; +} + +static void +rewrite_src_interval(struct ra_spill_ctx *ctx, + struct ra_spill_interval *interval, + struct ir3_register *def, + struct ir3_instruction *instr, + struct ir3_block *block) +{ + interval->dst.flags = def->flags; + interval->dst.def = def; + interval->needs_reload = false; + + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + struct ir3_register *child_reg = child->interval.reg; + struct ir3_register *child_def = + extract(def, (child_reg->interval_start - + interval->interval.reg->interval_start) / reg_elem_size(def), + reg_elems(child_reg), instr, block); + rewrite_src_interval(ctx, child, child_def, instr, block); + } +} + +static void +reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_instruction *instr, struct ir3_block *block) +{ + unsigned elems = reg_elems(def); + struct ra_spill_interval *interval = ctx->intervals[def->name]; + + struct ir3_reg_interval *ir3_parent = interval->interval.parent; + + if (ir3_parent) { + struct ra_spill_interval *parent = + ir3_reg_interval_to_interval(ir3_parent); + if (!parent->needs_reload) { + interval->dst.flags = def->flags; + interval->dst.def = extract( + parent->dst.def, (def->interval_start - parent->dst.def->interval_start) / + reg_elem_size(def), elems, instr, block); + return; + } + } + + struct ir3_register *dst = reload(ctx, def, instr, block); + + rewrite_src_interval(ctx, interval, dst, instr, block); +} + +static void +reload_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + if (interval->needs_reload) { + reload_def(ctx, src->def, instr, instr->block); + } + + ra_spill_interval_root(interval)->cant_spill = false; +} + +static void +rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, + struct ir3_register *src) +{ + struct ra_spill_interval *interval = ctx->intervals[src->def->name]; + + set_src_val(src, &interval->dst); +} + static void update_max_pressure(struct ra_spill_ctx *ctx) { @@ -240,15 +947,15 @@ update_max_pressure(struct ra_spill_ctx *ctx) static void handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { - if (RA_DEBUG) { - printf("processing: "); - ir3_print_instr(instr); - } - ra_foreach_dst (dst, instr) { init_dst(ctx, dst); } + if (ctx->spilling) { + ra_foreach_src (src, instr) + insert_src(ctx, src); + } + /* Handle tied destinations. 
If a destination is tied to a source and that * source is live-through, then we need to allocate a new register for the * destination which is live-through itself and cannot overlap the @@ -261,7 +968,17 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) insert_dst(ctx, dst); } - update_max_pressure(ctx); + if (ctx->spilling) + limit(ctx, instr); + else + update_max_pressure(ctx); + + if (ctx->spilling) { + ra_foreach_src (src, instr) { + reload_src(ctx, instr, src); + update_src_next_use(ctx, src); + } + } ra_foreach_src (src, instr) { if (src->flags & IR3_REG_FIRST_KILL) @@ -272,13 +989,29 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) insert_dst(ctx, dst); } - update_max_pressure(ctx); + if (ctx->spilling) + limit(ctx, instr); + else + update_max_pressure(ctx); - for (unsigned i = 0; i < instr->srcs_count; i++) { - if (ra_reg_is_src(instr->srcs[i]) && - (instr->srcs[i]->flags & IR3_REG_FIRST_KILL)) - remove_src(ctx, instr, instr->srcs[i]); + /* We have to remove sources before rewriting them so that we can look up + * the interval to remove before the source itself is changed. + */ + ra_foreach_src (src, instr) { + if (src->flags & IR3_REG_FIRST_KILL) + remove_src(ctx, instr, src); } + + if (ctx->spilling) { + ra_foreach_src (src, instr) { + rewrite_src(ctx, instr, src); + } + } + + ra_foreach_dst (dst, instr) { + finish_dst(ctx, dst); + } + for (unsigned i = 0; i < instr->dsts_count; i++) { if (ra_reg_is_dst(instr->dsts[i]) && (instr->dsts[i]->flags & IR3_REG_UNUSED)) @@ -286,28 +1019,672 @@ handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) } } +static struct ra_spill_interval * +create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def) +{ + unsigned name = ctx->intervals_count++; + unsigned offset = ctx->live->interval_offset; + + /* This is kinda hacky, but we need to create a fake SSA def here that is + * only used as part of the pcopy accounting. See below. + */ + struct ir3_register *reg = rzalloc(ctx, struct ir3_register); + *reg = *def; + reg->name = name; + reg->interval_start = offset; + reg->interval_end = offset + reg_size(def); + reg->merge_set = NULL; + + ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *, + ctx->intervals_count); + struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval); + ra_spill_interval_init(interval, reg); + ctx->intervals[name] = interval; + ctx->live->interval_offset += reg_size(def); + return interval; +} + +/* In the sequence of copies generated (see below), would this source be killed? + */ +static bool +is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n) +{ + struct ir3_register *src = pcopy->srcs[src_n]; + if (!(src->flags & IR3_REG_KILL)) + return false; + for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) { + if (pcopy->srcs[j]->def == src->def) + return false; + } + return true; +} + +/* Parallel copies are different from normal instructions. The sources together + * may be larger than the entire register file, so we cannot just reload every + * source like normal, and indeed that probably wouldn't be a great idea. + * Instead we essentially need to lower the parallel copy to "copies," just like + * in the normal CSSA construction, although we implement the copies by + * reloading and then possibly spilling values. We essentially just shuffle + * around the sources until each source either (a) is live or (b) has the same + * spill slot as its corresponding destination.
We do this by decomposing the + * copy into a series of copies, so: + * + * a, b, c = d, e, f + * + * becomes: + * + * d' = d + * e' = e + * f' = f + * a = d' + * b = e' + * c = f' + * + * the temporary SSA values d', e', and f' never actually show up in the result. + * They are only used for our internal accounting. They may, however, have their + * own spill slot created for them. Similarly, we don't actually emit any copy + * instructions, although we emit the spills/reloads that *would've* been + * required if those copies were there. + * + * TODO: in order to reduce the number of temporaries and therefore spill slots, + * we could instead do a more complicated analysis that considers the location + * transfer graph. + * + * In addition, we actually remove the parallel copy and rewrite all its uses + * (in the phi nodes) rather than rewrite its sources at the end. Recreating it + * later turns out to be easier than keeping it up-to-date throughout this pass, + * since we may have to remove entries for phi sources that are spilled and add + * entries for live-outs that are spilled and reloaded, which can happen here + * and then possibly be undone or done again when processing live-ins of the + * successor block. + */ + +static void +handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy) +{ + foreach_dst (dst, pcopy) { + struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; + ra_spill_interval_init(dst_interval, dst); + } + + foreach_src_n (src, i, pcopy) { + d("processing src %u", i); + struct ir3_register *dst = pcopy->dsts[i]; + + /* Skip the intermediate copy for cases where the source is merged with + * the destination. Crucially this means that we also don't reload/spill + * it if it's been spilled, because it shares the same spill slot. 
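+ * (In the a, b, c = d, e, f picture above: if d and a already share a
+ * merge set and offset, no temporary d' is created, and a spilled d
+ * simply stays spilled, since a resolves to the very same slot.)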
+ */ + if (src->def && src->def->merge_set && + src->def->merge_set == dst->merge_set && + src->def->merge_set_offset == dst->merge_set_offset) { + struct ra_spill_interval *src_interval = ctx->intervals[src->def->name]; + struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; + if (src_interval->interval.inserted) { + update_src_next_use(ctx, src); + if (is_last_pcopy_src(pcopy, i)) + ra_spill_ctx_remove(ctx, src_interval); + dst_interval->cant_spill = true; + ra_spill_ctx_insert(ctx, dst_interval); + limit(ctx, pcopy); + dst_interval->cant_spill = false; + dst_interval->dst = src_interval->dst; + } + } else if (src->def) { + struct ra_spill_interval *temp_interval = + create_temp_interval(ctx, dst); + struct ir3_register *temp = temp_interval->interval.reg; + temp_interval->next_use_distance = src->next_use; + + insert_src(ctx, src); + limit(ctx, pcopy); + reload_src(ctx, pcopy, src); + update_src_next_use(ctx, src); + if (is_last_pcopy_src(pcopy, i)) + remove_src(ctx, pcopy, src); + struct ra_spill_interval *src_interval = + ctx->intervals[src->def->name]; + temp_interval->dst = src_interval->dst; + + temp_interval->cant_spill = true; + ra_spill_ctx_insert(ctx, temp_interval); + limit(ctx, pcopy); + temp_interval->cant_spill = false; + + src->flags = temp->flags; + src->def = temp; + } + } + + d("done with pcopy srcs"); + + foreach_src_n (src, i, pcopy) { + struct ir3_register *dst = pcopy->dsts[i]; + + if (src->def && src->def->merge_set && + src->def->merge_set == dst->merge_set && + src->def->merge_set_offset == dst->merge_set_offset) + continue; + + struct ra_spill_interval *dst_interval = ctx->intervals[dst->name]; + + if (!src->def) { + dst_interval->cant_spill = true; + ra_spill_ctx_insert(ctx, dst_interval); + limit(ctx, pcopy); + dst_interval->cant_spill = false; + + assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED)); + if (src->flags & IR3_REG_CONST) { + dst_interval->dst.flags = src->flags; + dst_interval->dst.const_num = src->num; + } else { + dst_interval->dst.flags = src->flags; + dst_interval->dst.uimm = src->uim_val; + } + } else { + struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name]; + + insert_src(ctx, src); + limit(ctx, pcopy); + reload_src(ctx, pcopy, src); + remove_src(ctx, pcopy, src); + + dst_interval->dst = temp_interval->dst; + ra_spill_ctx_insert(ctx, dst_interval); + } + } + + pcopy->flags |= IR3_INSTR_UNUSED; +} + static void handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { init_dst(ctx, instr->dsts[0]); insert_dst(ctx, instr->dsts[0]); + finish_dst(ctx, instr->dsts[0]); } static void remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) { - ra_foreach_src (src, instr) - remove_src(ctx, instr, src); + if (instr->opc == OPC_META_TEX_PREFETCH) { + ra_foreach_src (src, instr) + remove_src(ctx, instr, src); + } if (instr->dsts[0]->flags & IR3_REG_UNUSED) remove_dst(ctx, instr->dsts[0]); } static void -handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def) +handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block, + struct ir3_register *def) { - struct ra_spill_interval *interval = &ctx->intervals[def->name]; + struct ra_spill_interval *interval = ctx->intervals[def->name]; ra_spill_interval_init(interval, def); - insert_dst(ctx, def); + if (ctx->spilling) { + interval->next_use_distance = + ctx->blocks[block->index].next_use_start[def->name]; + } + + ra_spill_ctx_insert(ctx, interval); +} + +static bool +is_live_in_phi(struct ir3_register *def, struct ir3_block 
*block) +{ + return def->instr->opc == OPC_META_PHI && def->instr->block == block; +} + +static bool +is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (is_live_in_phi(def, block)) { + def = def->instr->srcs[pred_idx]->def; + if (!def) + return false; + } + + return _mesa_hash_table_search(state->remap, def); +} + +static bool +is_live_in_undef(struct ir3_register *def, + struct ir3_block *block, unsigned pred_idx) +{ + if (!is_live_in_phi(def, block)) + return false; + + return !def->instr->srcs[pred_idx]->def; +} + +static struct reg_or_immed * +read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (is_live_in_phi(def, block)) { + def = def->instr->srcs[pred_idx]->def; + if (!def) + return NULL; + } + + struct hash_entry *entry = _mesa_hash_table_search(state->remap, def); + if (entry) + return entry->data; + else + return NULL; +} + +static bool +is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + if (!is_live_in_pred(ctx, def, block, i)) + return false; + } + + return true; +} + +static void +spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (!state->visited) + continue; + + struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i); + if (pred_def) { + spill(ctx, pred_def, get_spill_slot(ctx, def), NULL, pred); + } + } +} + +static void +spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + bool all_preds_visited = true; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (!state->visited) { + all_preds_visited = false; + break; + } + } + + /* Note: in the paper they explicitly spill live-through values first, but we + * should be doing that automatically by virtue of picking the largest + * distance due to the extra distance added to edges out of loops. + * + * TODO: Keep track of pressure in each block and preemptively spill + * live-through values as described in the paper to avoid spilling them + * inside the loop. 
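+    *
+    * A sketch of why that works (hypothetical value, not from this patch):
+    * a value defined before a loop whose next use is only after the loop
+    * gets the extra loop-exit distance added to its next-use estimate, so
+    * it sorts behind the values used inside the loop and is picked for
+    * spilling first.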
+ */ + + if (ctx->cur_pressure.half > ctx->limit_pressure.half) { + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->half_live_intervals, half_node) { + if (all_preds_visited && + is_live_in_all_preds(ctx, interval->interval.reg, block)) + continue; + spill_live_in(ctx, interval->interval.reg, block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.half <= ctx->limit_pressure.half) + break; + } + } + + if (ctx->cur_pressure.full > ctx->limit_pressure.full) { + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->full_live_intervals, node) { + if (all_preds_visited && + is_live_in_all_preds(ctx, interval->interval.reg, block)) + continue; + spill_live_in(ctx, interval->interval.reg, block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); + if (ctx->cur_pressure.full <= ctx->limit_pressure.full) + break; + } + } +} + +static void +live_in_rewrite(struct ra_spill_ctx *ctx, + struct ra_spill_interval *interval, + struct reg_or_immed *new_val, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + struct ir3_register *def = interval->interval.reg; + if (is_live_in_phi(def, block)) { + def = def->instr->srcs[pred_idx]->def; + } + + if (def) + _mesa_hash_table_insert(state->remap, def, new_val); + + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + assert(new_val->flags & IR3_REG_SSA); + struct ir3_register *child_def = + extract(new_val->def, + (child->interval.reg->interval_start - def->interval_start) / + reg_elem_size(def), reg_elems(child->interval.reg), + NULL, pred); + struct reg_or_immed *child_val = ralloc(ctx, struct reg_or_immed); + child_val->def = child_def; + child_val->flags = child_def->flags; + live_in_rewrite(ctx, child, child_val, block, pred_idx); + } +} + +static void +reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + struct ra_spill_interval *interval = ctx->intervals[def->name]; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (!state->visited) + continue; + + if (is_live_in_undef(def, block, i)) + continue; + + struct reg_or_immed *new_val = read_live_in(ctx, def, block, i); + + if (!new_val) { + new_val = ralloc(ctx, struct reg_or_immed); + new_val->def = reload(ctx, def, NULL, pred); + new_val->flags = new_val->def->flags; + } + live_in_rewrite(ctx, interval, new_val, block, i); + } +} + +static void +reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals, + interval.node) { + reload_live_in(ctx, interval->interval.reg, block); + } +} + +static void +add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + struct ra_spill_interval *interval = ctx->intervals[def->name]; + if (!interval->interval.inserted) + return; + + bool needs_phi = false; + struct ir3_register *cur_def = NULL; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (!state->visited) { + needs_phi = true; + break; + } + + struct hash_entry *entry = + _mesa_hash_table_search(state->remap, def); + 
assert(entry);
+      struct reg_or_immed *pred_val = entry->data;
+      if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
+          !pred_val->def ||
+          (cur_def && cur_def != pred_val->def)) {
+         needs_phi = true;
+         break;
+      }
+      cur_def = pred_val->def;
+   }
+
+   if (!needs_phi) {
+      interval->dst.def = cur_def;
+      interval->dst.flags = cur_def->flags;
+      return;
+   }
+
+   struct ir3_instruction *phi =
+      ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
+   struct ir3_register *dst = __ssa_dst(phi);
+   dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
+   dst->size = def->size;
+   dst->wrmask = def->wrmask;
+
+   dst->interval_start = def->interval_start;
+   dst->interval_end = def->interval_end;
+   dst->merge_set = def->merge_set;
+   dst->merge_set_offset = def->merge_set_offset;
+
+   for (unsigned i = 0; i < block->predecessors_count; i++) {
+      struct ir3_block *pred = block->predecessors[i];
+      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
+      struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
+      src->size = def->size;
+      src->wrmask = def->wrmask;
+
+      if (state->visited) {
+         struct hash_entry *entry =
+            _mesa_hash_table_search(state->remap, def);
+         assert(entry);
+         struct reg_or_immed *new_val = entry->data;
+         set_src_val(src, new_val);
+      } else {
+         src->def = def;
+      }
+   }
+
+   interval->dst.def = dst;
+   interval->dst.flags = dst->flags;
+
+   ir3_instr_move_before_block(phi, block);
+}
+
+/* When spilling a block with a single predecessor, the pred may have other
+ * successors so we can't choose what's live in and we can't spill/restore
+ * anything. Just make the inserted intervals exactly match the predecessor. If
+ * it wasn't live in the predecessor then it must've already been spilled. Also,
+ * there are no phi nodes and no live-ins.
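+ *
+ * A sketch of such a CFG (hypothetical, not from this patch): in an if
+ * without an else, the then-block B has the single predecessor A, but A
+ * also branches straight to the merge block C:
+ *
+ *    A: x = ...; br p0.x, B, C
+ *    B: ...   (single predecessor A: inherit A's end-of-block state)
+ *    C: ...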
+ */ +static void +spill_single_pred_live_in(struct ra_spill_ctx *ctx, + struct ir3_block *block) +{ + unsigned name; + BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + struct ra_spill_interval *interval = ctx->intervals[reg->name]; + struct reg_or_immed *val = read_live_in(ctx, reg, block, 0); + if (val) + interval->dst = *val; + else + ra_spill_ctx_remove(ctx, interval); + } +} + +static void +rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi, + struct ir3_block *block) +{ + if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) { + phi->flags |= IR3_INSTR_UNUSED; + return; + } + + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + if (!state->visited) + continue; + + struct ir3_register *src = phi->srcs[i]; + if (!src->def) + continue; + + struct hash_entry *entry = + _mesa_hash_table_search(state->remap, src->def); + assert(entry); + struct reg_or_immed *new_val = entry->data; + set_src_val(src, new_val); + } +} + +static void +spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval, + struct ir3_block *block) +{ + struct ir3_register *def = interval->interval.reg; + + spill(ctx, &interval->dst, get_spill_slot(ctx, def), NULL, block); + ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval); +} + +static void +spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + rb_tree_foreach_safe (struct ra_spill_interval, interval, + &ctx->reg_ctx.intervals, interval.node) { + if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) { + spill_live_out(ctx, interval, block); + } + } +} + +static void +reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def, + struct ir3_block *block) +{ + struct ra_spill_interval *interval = ctx->intervals[def->name]; + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); + + reload_def(ctx, def, NULL, block); +} + +static void +reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + unsigned name; + BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + struct ra_spill_interval *interval = ctx->intervals[name]; + if (!interval->interval.inserted) + reload_live_out(ctx, reg, block); + } +} + +static void +update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + assert(!block->successors[1]); + struct ir3_block *succ = block->successors[0]; + unsigned pred_idx = ir3_block_get_pred_index(succ, block); + + foreach_instr (instr, &succ->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + + struct ir3_register *def = instr->srcs[pred_idx]->def; + if (!def) + continue; + + struct ra_spill_interval *interval = ctx->intervals[def->name]; + if (!interval->interval.inserted) + continue; + set_src_val(instr->srcs[pred_idx], &interval->dst); + } +} + +static void +record_pred_live_out(struct ra_spill_ctx *ctx, + struct ra_spill_interval *interval, + struct ir3_block *block, unsigned pred_idx) +{ + struct ir3_block *pred = block->predecessors[pred_idx]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + + struct ir3_register *def = interval->interval.reg; + if (is_live_in_phi(def, 
block)) { + def = def->instr->srcs[pred_idx]->def; + } + BITSET_SET(state->live_out, def->name); + + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + record_pred_live_out(ctx, child, block, pred_idx); + } +} + +static void +record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + struct ra_spill_block_state *state = &ctx->blocks[pred->index]; + if (state->visited) + continue; + + state->live_out = rzalloc_array(ctx, BITSET_WORD, + BITSET_WORDS(ctx->live->definitions_count)); + + + rb_tree_foreach (struct ra_spill_interval, interval, + &ctx->reg_ctx.intervals, interval.node) { + record_pred_live_out(ctx, interval, block, i); + } + } +} + +static void +record_live_out(struct ra_spill_ctx *ctx, + struct ra_spill_block_state *state, + struct ra_spill_interval *interval) +{ + if (!(interval->dst.flags & IR3_REG_SSA) || + interval->dst.def) { + struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed); + *val = interval->dst; + _mesa_hash_table_insert(state->remap, interval->interval.reg, val); + } + rb_tree_foreach (struct ra_spill_interval, child, + &interval->interval.children, interval.node) { + record_live_out(ctx, state, child); + } +} + +static void +record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + struct ra_spill_block_state *state = &ctx->blocks[block->index]; + state->remap = _mesa_pointer_hash_table_create(ctx); + + rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals, + interval.node) { + record_live_out(ctx, state, interval); + } } static void @@ -315,12 +1692,14 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block) { memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure)); rb_tree_init(&ctx->reg_ctx.intervals); + rb_tree_init(&ctx->full_live_intervals); + rb_tree_init(&ctx->half_live_intervals); unsigned name; BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], ctx->live->definitions_count) { struct ir3_register *reg = ctx->live->definitions[name]; - handle_live_in(ctx, reg); + handle_live_in(ctx, block, reg); } foreach_instr (instr, &block->instr_list) { @@ -330,36 +1709,298 @@ handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block) handle_input_phi(ctx, instr); } - update_max_pressure(ctx); + if (ctx->spilling) { + if (block->predecessors_count == 1) { + spill_single_pred_live_in(ctx, block); + } else { + spill_live_ins(ctx, block); + reload_live_ins(ctx, block); + record_pred_live_outs(ctx, block); + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + rewrite_phi(ctx, instr, block); + } + BITSET_FOREACH_SET (name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + add_live_in_phi(ctx, reg, block); + } + } + } else { + update_max_pressure(ctx); + } foreach_instr (instr, &block->instr_list) { + di(instr, "processing"); + if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH) remove_input_phi(ctx, instr); + else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY) + handle_pcopy(ctx, instr); + else if (ctx->spilling && instr->opc == OPC_MOV && + instr->dsts[0] == ctx->base_reg) + /* skip */; else handle_instr(ctx, instr); } + + if (ctx->spilling && block->successors[0]) { + struct ra_spill_block_state *state = + &ctx->blocks[block->successors[0]->index]; + 
if (state->visited) {
+         assert(!block->successors[1]);
+
+         spill_live_outs(ctx, block);
+         reload_live_outs(ctx, block);
+         update_live_out_phis(ctx, block);
+      }
+   }
+
+   if (ctx->spilling) {
+      record_live_outs(ctx, block);
+      ctx->blocks[block->index].visited = true;
+   }
+}
+
+static bool
+simplify_phi_node(struct ir3_instruction *phi)
+{
+   struct ir3_register *def = NULL;
+   foreach_src (src, phi) {
+      /* Ignore phi sources which point to the phi itself. */
+      if (src->def == phi->dsts[0])
+         continue;
+      /* If it's undef or it doesn't match the previous sources, bail */
+      if (!src->def || (def && def != src->def))
+         return false;
+      def = src->def;
+   }
+
+   phi->data = def;
+   phi->flags |= IR3_INSTR_UNUSED;
+   return true;
+}
+
+static void
+simplify_phi_srcs(struct ir3_instruction *instr)
+{
+   foreach_src (src, instr) {
+      if (src->def && src->def->instr->opc == OPC_META_PHI) {
+         struct ir3_instruction *phi = src->def->instr;
+         if (phi->data)
+            src->def = phi->data;
+      }
+   }
+}
+
+/* We insert phi nodes for all live-ins of loops in case we need to split the
+ * live range. This pass cleans that up for the case where the live range didn't
+ * actually need to be split.
+ */
+static void
+simplify_phi_nodes(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         if (instr->opc != OPC_META_PHI)
+            break;
+         instr->data = NULL;
+      }
+   }
+
+   bool progress;
+   do {
+      progress = false;
+      foreach_block (block, &ir->block_list) {
+         foreach_instr (instr, &block->instr_list) {
+            if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
+               continue;
+
+            simplify_phi_srcs(instr);
+         }
+
+         for (unsigned i = 0; i < 2; i++) {
+            struct ir3_block *succ = block->successors[i];
+            if (!succ)
+               continue;
+            foreach_instr (instr, &succ->instr_list) {
+               if (instr->opc != OPC_META_PHI)
+                  break;
+               if (instr->flags & IR3_INSTR_UNUSED)
+                  continue;
+
+               simplify_phi_srcs(instr);
+               progress |= simplify_phi_node(instr);
+            }
+         }
+      }
+   } while (progress);
+}
+
+static void
+unmark_dead(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr (instr, &block->instr_list) {
+         instr->flags &= ~IR3_INSTR_UNUSED;
+      }
+   }
+}
+
+/* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
+ * which ones are dead along the way, so there's nothing to compute here.
+ */
+static void
+cleanup_dead(struct ir3 *ir)
+{
+   foreach_block (block, &ir->block_list) {
+      foreach_instr_safe (instr, &block->instr_list) {
+         if (instr->flags & IR3_INSTR_UNUSED)
+            list_delinit(&instr->node);
+      }
+   }
+}
+
+/* Deal with merge sets after spilling. Spilling generally leaves the merge sets
+ * in a mess, and even if we properly cleaned up after ourselves, we would want
+ * to recompute the merge sets afterward anyway. That's because
+ * spilling/reloading can "break up" phi webs and split/collect webs so that
+ * allocating them to the same register no longer gives any benefit. For
+ * example, imagine we have this:
+ *
+ *    if (...) {
+ *       foo = ...
+ *    } else {
+ *       bar = ...
+ *    }
+ *    baz = phi(foo, bar)
+ *
+ * and we spill "baz":
+ *
+ *    if (...) {
+ *       foo = ...
+ *       spill(foo)
+ *    } else {
+ *       bar = ...
+ *       spill(bar)
+ *    }
+ *    baz = reload()
+ *
+ * now foo, bar, and baz don't have to be allocated to the same register. How
+ * exactly the merge sets change can be complicated, so it's easier just to
+ * recompute them.
+ *
+ * However, there's a wrinkle in this: those same merge sets determine the
+ * register pressure, due to multiple values inhabiting the same register!
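+ * (For instance, the merged dst and srcs of a collect share one physical
+ * register, so they should only be counted once when estimating pressure.)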
And + * we assume that this sharing happens when spilling. Therefore we need a + * three-step procedure: + * + * 1. Drop the original merge sets. + * 2. Calculate which values *must* be merged, being careful to only use the + * interval information which isn't trashed by spilling, and forcibly merge + * them. + * 3. Let ir3_merge_regs() finish the job, including recalculating the + * intervals. + */ + +static void +fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst (dst, instr) { + dst->merge_set = NULL; + dst->merge_set_offset = 0; + } + } + } + + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_SPLIT && + instr->opc != OPC_META_COLLECT) + continue; + + struct ir3_register *dst = instr->dsts[0]; + ra_foreach_src (src, instr) { + if (!(src->flags & IR3_REG_KILL) && + src->def->interval_start < dst->interval_end && + dst->interval_start < src->def->interval_end) { + ir3_force_merge(dst, src->def, + src->def->interval_start - dst->interval_start); + } + } + } + } + + ir3_merge_regs(live, ir); } void ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, struct ir3_pressure *max_pressure) { - struct ra_spill_ctx ctx = {}; - ctx.live = live; - ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals)); - ctx.compiler = v->shader->compiler; - spill_ctx_init(&ctx); + struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx); + spill_ctx_init(ctx, v, live); foreach_block (block, &v->ir->block_list) { - handle_block(&ctx, block); + handle_block(ctx, block); } - assert(ctx.cur_pressure.full == 0); - assert(ctx.cur_pressure.half == 0); - assert(ctx.cur_pressure.shared == 0); + assert(ctx->cur_pressure.full == 0); + assert(ctx->cur_pressure.half == 0); + assert(ctx->cur_pressure.shared == 0); - free(ctx.intervals); - - *max_pressure = ctx.max_pressure; + *max_pressure = ctx->max_pressure; + ralloc_free(ctx); +} + +bool +ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v, + struct ir3_liveness **live, + const struct ir3_pressure *limit_pressure) +{ + void *mem_ctx = ralloc_parent(*live); + struct ra_spill_ctx *ctx = rzalloc(mem_ctx, struct ra_spill_ctx); + spill_ctx_init(ctx, v, *live); + + ctx->spilling = true; + + ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state, + ctx->live->block_count); + rb_tree_init(&ctx->full_live_intervals); + rb_tree_init(&ctx->half_live_intervals); + + ctx->limit_pressure = *limit_pressure; + ctx->spill_slot = v->pvtmem_size; + + add_base_reg(ctx, ir); + compute_next_distance(ctx, ir); + + unmark_dead(ir); + + foreach_block (block, &ir->block_list) { + handle_block(ctx, block); + } + + simplify_phi_nodes(ir); + + cleanup_dead(ir); + + ir3_create_parallel_copies(ir); + + /* After this point, we're done mutating the IR. Liveness has been trashed, + * so recalculate it. We'll need it for recalculating the merge sets. 
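+    * (Order matters: the fresh liveness computed here is what
+    * fixup_merge_sets() passes on to ir3_merge_regs().)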
+ */ + ralloc_free(ctx->live); + *live = ir3_calc_liveness(mem_ctx, ir); + + fixup_merge_sets(*live, ir); + + v->pvtmem_size = ctx->spill_slot; + ralloc_free(ctx); + + return true; } diff --git a/mesa 3D driver/src/freedreno/ir3/ir3_validate.c b/mesa 3D driver/src/freedreno/ir3/ir3_validate.c index 4c07b02159..f56e3c0ad9 100644 --- a/mesa 3D driver/src/freedreno/ir3/ir3_validate.c +++ b/mesa 3D driver/src/freedreno/ir3/ir3_validate.c @@ -30,6 +30,9 @@ struct ir3_validate_ctx { struct ir3 *ir; + /* Current block being validated: */ + struct ir3_block *current_block; + /* Current instruction being validated: */ struct ir3_instruction *current_instr; @@ -43,8 +46,12 @@ static void validate_error(struct ir3_validate_ctx *ctx, const char *condstr) { fprintf(stderr, "validation fail: %s\n", condstr); - fprintf(stderr, " -> for instruction: "); - ir3_print_instr(ctx->current_instr); + if (ctx->current_instr) { + fprintf(stderr, " -> for instruction: "); + ir3_print_instr(ctx->current_instr); + } else { + fprintf(stderr, " -> for block%u\n", block_id(ctx->current_block)); + } abort(); } @@ -65,6 +72,9 @@ static void validate_src(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *reg) { + if (reg->flags & IR3_REG_IMMED) + validate_assert(ctx, ir3_valid_immediate(instr, reg->iim_val)); + if (!(reg->flags & IR3_REG_SSA) || !reg->def) return; @@ -148,7 +158,7 @@ validate_dst(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr, #define validate_reg_size(ctx, reg, type) \ validate_assert( \ - ctx, type_size(type) == (((reg)->flags & IR3_REG_HALF) ? 16 : 32)) + ctx, (type_size(type) <= 16) == !!((reg)->flags & IR3_REG_HALF)) static void validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) @@ -181,10 +191,18 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) else validate_assert(ctx, reg->flags & IR3_REG_HALF); } - } else if (opc_cat(instr->opc) == 6) { + } else if (opc_cat(instr->opc) == 1 || opc_cat(instr->opc) == 6) { /* handled below */ } else if (opc_cat(instr->opc) == 0) { /* end/chmask/etc are allowed to have different size sources */ + } else if (instr->opc == OPC_META_PARALLEL_COPY) { + /* pcopy sources have to match with their destination but can have + * different sizes from each other. 
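+       * For example (hypothetical), a single pcopy can copy one half and
+       * one full register at once: { hr0.x, r1.y } = pcopy { hr2.x, r3.w };
+       * only each src/dst pair has to agree, which is checked per-index
+       * below.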
+ */ + } else if (instr->opc == OPC_ANY_MACRO || instr->opc == OPC_ALL_MACRO || + instr->opc == OPC_READ_FIRST_MACRO || + instr->opc == OPC_READ_COND_MACRO) { + /* nothing yet */ } else if (n > 0) { validate_assert(ctx, (last_reg->flags & IR3_REG_HALF) == (reg->flags & IR3_REG_HALF)); @@ -299,6 +317,7 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) case OPC_STL: case OPC_STP: case OPC_STLW: + case OPC_SPILL_MACRO: validate_assert(ctx, !(instr->srcs[0]->flags & IR3_REG_HALF)); validate_reg_size(ctx, instr->srcs[1], instr->cat6.type); validate_assert(ctx, !(instr->srcs[2]->flags & IR3_REG_HALF)); @@ -322,6 +341,22 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr) break; } } + + if (instr->opc == OPC_META_PARALLEL_COPY) { + foreach_src_n (src, n, instr) { + validate_assert(ctx, reg_class_flags(src) == + reg_class_flags(instr->dsts[n])); + } + } +} + +static bool +is_physical_successor(struct ir3_block *block, struct ir3_block *succ) +{ + for (unsigned i = 0; i < ARRAY_SIZE(block->physical_successors); i++) + if (block->physical_successors[i] == succ) + return true; + return false; } void @@ -342,6 +377,9 @@ ir3_validate(struct ir3 *ir) ctx->defs = _mesa_pointer_set_create(ctx); foreach_block (block, &ir->block_list) { + ctx->current_block = block; + ctx->current_instr = NULL; + /* We require that the first block does not have any predecessors, * which allows us to assume that phi nodes and meta:input's do not * appear in the same basic block. @@ -363,9 +401,18 @@ ir3_validate(struct ir3 *ir) } for (unsigned i = 0; i < 2; i++) { - if (block->successors[i]) + if (block->successors[i]) { validate_phi_src(ctx, block->successors[i], block); + + ctx->current_instr = NULL; + + /* Each logical successor should also be a physical successor: */ + validate_assert(ctx, is_physical_successor(block, block->successors[i])); + } } + + validate_assert(ctx, block->successors[0] || !block->successors[1]); + validate_assert(ctx, block->physical_successors[0] || !block->physical_successors[1]); } ralloc_free(ctx); diff --git a/mesa 3D driver/src/freedreno/ir3/meson.build b/mesa 3D driver/src/freedreno/ir3/meson.build index 1bf9847ad9..55f5ad03c7 100644 --- a/mesa 3D driver/src/freedreno/ir3/meson.build +++ b/mesa 3D driver/src/freedreno/ir3/meson.build @@ -88,29 +88,32 @@ libfreedreno_ir3_files = files( 'ir3_legalize.c', 'ir3_liveness.c', 'ir3_lower_parallelcopy.c', + 'ir3_lower_spill.c', 'ir3_lower_subgroups.c', 'ir3_merge_regs.c', 'ir3_nir.c', 'ir3_nir.h', 'ir3_nir_analyze_ubo_ranges.c', + 'ir3_nir_lower_64b.c', 'ir3_nir_lower_load_barycentric_at_sample.c', 'ir3_nir_lower_load_barycentric_at_offset.c', 'ir3_nir_lower_io_offsets.c', 'ir3_nir_lower_tess.c', 'ir3_nir_lower_tex_prefetch.c', 'ir3_nir_lower_tg4_to_tex.c', + 'ir3_nir_lower_wide_load_store.c', 'ir3_nir_move_varying_inputs.c', 'ir3_postsched.c', 'ir3_print.c', 'ir3_ra.c', 'ir3_ra.h', 'ir3_ra_validate.c', + 'ir3_remove_unreachable.c', 'ir3_sched.c', 'ir3_shader.c', 'ir3_shader.h', 'ir3_spill.c', 'ir3_validate.c', - 'regmask.h', ) libfreedreno_ir3 = static_library( diff --git a/mesa 3D driver/src/freedreno/ir3/tests/disasm.c b/mesa 3D driver/src/freedreno/ir3/tests/disasm.c index aa376eabeb..320cd61809 100644 --- a/mesa 3D driver/src/freedreno/ir3/tests/disasm.c +++ b/mesa 3D driver/src/freedreno/ir3/tests/disasm.c @@ -58,142 +58,161 @@ static const struct test { bool parse_fail; } tests[] = { /* clang-format off */ - /* cat0 */ - INSTR_6XX(00000000_00000000, "nop"), - 
INSTR_6XX(00000200_00000000, "(rpt2)nop"), - INSTR_6XX(03000000_00000000, "end"), - INSTR_6XX(00800000_00000004, "br p0.x, #4"), - INSTR_6XX(00900000_00000003, "br !p0.x, #3"), - INSTR_6XX(03820000_00000015, "shps #21"), /* emit */ - INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */ - INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */ - INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"), - INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"), - INSTR_6XX(07820000_00000000, "prede"), - INSTR_6XX(00800063_0000001e, "brac.3 #30"), - INSTR_6XX(06820000_00000000, "predt p0.x"), - INSTR_6XX(07020000_00000000, "predf p0.x"), - INSTR_6XX(07820000_00000000, "prede"), + /* cat0 */ + INSTR_6XX(00000000_00000000, "nop"), + INSTR_6XX(00000200_00000000, "(rpt2)nop"), + INSTR_6XX(03000000_00000000, "end"), + INSTR_6XX(00800000_00000004, "br p0.x, #4"), + INSTR_6XX(00800000_fffffffc, "br p0.x, #-4"), + INSTR_6XX(00900000_00000003, "br !p0.x, #3"), + INSTR_6XX(03820000_00000015, "shps #21"), /* emit */ + INSTR_6XX(04021000_00000000, "(ss)shpe"), /* cut */ + INSTR_6XX(02820000_00000014, "getone #20"), /* kill p0.x */ + INSTR_6XX(00906020_00000007, "brao !p0.x, !p0.y, #7"), + INSTR_6XX(00804040_00000003, "braa p0.x, p0.y, #3"), + INSTR_6XX(07820000_00000000, "prede"), + INSTR_6XX(00800063_0000001e, "brac.3 #30"), + INSTR_6XX(06820000_00000000, "predt p0.x"), + INSTR_6XX(07020000_00000000, "predf p0.x"), + INSTR_6XX(07820000_00000000, "prede"), - /* cat1 */ - INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"), - INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"), - INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"), - INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c"), - INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"), - INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"), - INSTR_6XX(20174004_00000008, "mov.s32s32 r, r2.x"), - INSTR_6XX(20130000_00000005, "mov.s16s16 hr, hr1.y"), - INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr"), - /* dEQP-VK.subgroups.ballot.compute.compute */ - INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */ + /* cat1 */ + INSTR_6XX(20244000_00000020, "mov.f32f32 r0.x, c8.x"), + INSTR_6XX(20200000_00000020, "mov.f16f16 hr0.x, hc8.x"), + INSTR_6XX(20150000_00000000, "cov.s32s16 hr0.x, r0.x"), + INSTR_6XX(20156004_00000c11, "(ul)mov.s32s32 r1.x, c"), + INSTR_6XX(201100f4_00000000, "mova a0.x, hr0.x"), + INSTR_6XX(20244905_00000410, "(rpt1)mov.f32f32 r1.y, (r)c260.x"), + INSTR_6XX(20174004_00000008, "mov.s32s32 r, r2.x"), + INSTR_6XX(20130000_00000005, "mov.s16s16 hr, hr1.y"), + INSTR_6XX(20110004_00000800, "mov.s16s16 hr1.x, hr"), + /* dEQP-VK.subgroups.ballot.compute.compute */ + INSTR_6XX(260cc3c0_00000000, "movmsk.w128 r48.x"), /* movmsk.w128 sr48.x */ - INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"), - INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"), - INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"), - INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"), + INSTR_6XX(240cc004_00030201, "swz.u32u32 r1.x, r0.w, r0.y, r0.z"), + INSTR_6XX(2400c105_04030201, "gat.f16u32 r1.y, hr0.y, hr0.z, hr0.w, hr1.x"), + INSTR_6XX(240c0205_04030201, "sct.u32f16 hr1.y, hr0.z, hr0.w, hr1.x, r0.y"), + INSTR_6XX(2400c205_04030201, "sct.f16u32 r1.y, r0.z, r0.w, r1.x, hr0.y"), - INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"), - INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"), - INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, 
h(0.500000)"), - INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"), + INSTR_6XX(20510005_0000ffff, "mov.s16s16 hr1.y, -1"), + INSTR_6XX(20400005_00003900, "mov.f16f16 hr1.y, h(0.625000)"), + INSTR_6XX(20400006_00003800, "mov.f16f16 hr1.z, h(0.500000)"), + INSTR_6XX(204880f5_00000000, "mova1 a1.x, 0"), - /* cat2 */ - INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c"), - INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"), - INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"), - INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"), - INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"), - INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"), - INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"), - INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"), - INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"), - INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"), - INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"), - INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr"), - INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"), - INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"), + /* cat2 */ + INSTR_6XX(40104002_0c210001, "add.f hr0.z, r0.y, c"), + INSTR_6XX(40b80804_10408004, "(nop3) cmps.f.lt r1.x, (abs)r1.x, c16.x"), + INSTR_6XX(47308a02_00002000, "(rpt2)bary.f (ei)r0.z, (r)0, r0.x"), + INSTR_6XX(43480801_00008001, "(nop3) absneg.s hr0.y, (abs)hr0.y"), + INSTR_6XX(50600004_2c010004, "(sy)mul.f hr1.x, hr1.x, h(0.5)"), + INSTR_6XX(42280807_27ff0000, "(nop3) add.s hr1.w, hr0.x, h(-1)"), + INSTR_6XX(40a500f8_2c000004, "cmps.f.ne p0.x, hr1.x, h(0.0)"), + INSTR_6XX(438000f8_20010009, "and.b p0.x, hr2.y, h(1)"), + INSTR_6XX(438000f9_00020001, "and.b p0.y, hr0.y, hr0.z"), + INSTR_6XX(40080902_50200006, "(rpt1)add.f hr0.z, (r)hr1.z, (neg)(r)hc8.x"), + INSTR_6XX(42380c01_00040001, "(sat)(nop3) add.s r0.y, r0.y, r1.x"), + INSTR_6XX(42480000_48801086, "(nop2) sub.u hr0.x, hc33.z, (neg)hr"), + INSTR_6XX(46b00001_00001020, "clz.b r0.y, c8.x"), + INSTR_6XX(46700009_00000009, "bfrev.b r2.y, r2.y"), - /* cat3 */ - INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"), - INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"), - INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"), - INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"), - INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"), - INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"), - INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"), - INSTR_6XX(65900820_100cb008, "(nop3) shlg.b16 hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */ - INSTR_6XX(65ae085c_0002a001, "(nop3) shlg.b16 hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */ - INSTR_6XX(65900820_0c0aac05, "(nop3) shlg.b16 hr8.x, hc, hr8.x, hc"), /* not seen in blob */ + /* cat3 */ + INSTR_6XX(66000000_10421041, "sel.f16 hr0.x, hc16.y, hr0.x, hc16.z"), + INSTR_6XX(64848109_109a9099, "(rpt1)sel.b32 r2.y, c38.y, (r)r2.y, c38.z"), + INSTR_6XX(64810904_30521036, "(rpt1)sel.b32 r1.x, (r)c13.z, r0.z, (r)c20.z"), + INSTR_6XX(64818902_20041032, "(rpt1)sel.b32 r0.z, (r)c12.z, r0.w, (r)r1.x"), + INSTR_6XX(63820005_10315030, "mad.f32 r1.y, (neg)c12.x, r1.x, c12.y"), + INSTR_6XX(62050009_00091000, "mad.u24 r2.y, c0.x, r2.z, r2.y"), + INSTR_6XX(61828008_00081033, "madsh.m16 r2.x, c12.w, r1.y, r2.x"), + 
INSTR_6XX(65900820_100cb008, "(nop3) shlg.b16 hr8.x, 8, hr8.x, 12"), /* (nop3) shlg.b16 hr8.x, (r)8, (r)hr8.x, 12; */ + INSTR_6XX(65ae085c_0002a001, "(nop3) shlg.b16 hr23.x, hr0.y, hr23.x, hr0.z"), /* not seen in blob */ + INSTR_6XX(65900820_0c0aac05, "(nop3) shlg.b16 hr8.x, hc, hr8.x, hc"), /* not seen in blob */ - /* cat4 */ - INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"), + /* cat4 */ + INSTR_6XX(8010000a_00000003, "rcp r2.z, r0.w"), - /* cat5 */ - /* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */ - INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */ - /* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */ - INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */ - /* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */ - INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ - INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ + /* cat5 */ + /* dEQP-VK.glsl.derivate.dfdx.uniform_if.float_mediump */ + INSTR_6XX(a3801102_00000001, "dsx (f32)(x)r0.z, r0.x"), /* dsx (f32)(xOOO)r0.z, r0.x */ + /* dEQP-VK.glsl.derivate.dfdy.uniform_if.float_mediump */ + INSTR_6XX(a3c01102_00000001, "dsy (f32)(x)r0.z, r0.x"), /* dsy (f32)(xOOO)r0.z, r0.x */ + /* dEQP-VK.glsl.derivate.dfdxfine.uniform_loop.float_highp */ + INSTR_6XX(a6001105_00000001, "dsxpp.1 (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ + INSTR_6XX(a6201105_00000001, "dsxpp.1.p (x)r1.y, r0.x"), /* dsxpp.1 (xOOO)r1.y, r0.x */ - INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"), - INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"), /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */ - INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"), /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */ - INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"), - INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"), - INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"), - INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"), - INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"), - /* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */ - INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"), - INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"), + INSTR_6XX(a2802f00_00000001, "getsize (u16)(xyzw)hr0.x, r0.x, t#0"), + INSTR_6XX(a0c89f04_c4600005, "sam.base1 (f32)(xyzw)r1.x, r0.z, s#3, t#2"), /* sam.s2en.mode6.base1 (f32)(xyzw)r1.x, r0.z, 35 */ + INSTR_6XX(a1c85f00_c0200005, "getlod.base0 (s32)(xyzw)r0.x, r0.z, s#1, t#0"), /* getlod.s2en.mode6.base0 (s32)(xyzw)r0.x, r0.z, 1 */ + INSTR_6XX(a1000f00_00000004, "samb (f16)(xyzw)hr0.x, hr0.z, hr0.x, s#0, t#0"), + INSTR_6XX(a1000f00_00000003, "samb (f16)(xyzw)hr0.x, r0.y, r0.x, s#0, t#0"), + INSTR_6XX(a0c00f00_04400002, "sam (f16)(xyzw)hr0.x, hr0.y, s#2, t#2"), + INSTR_6XX(a6c02f00_00000000, "rgetinfo (u16)(xyzw)hr0.x"), + INSTR_6XX(a3482f08_c0000000, "getinfo.base0 (u16)(xyzw)hr2.x, t#0"), + /* dEQP-GLES31.functional.texture.texture_buffer.render.as_fragment_texture.buffer_size_65536 */ + INSTR_5XX(a2c03102_00000000, "getbuf (u32)(x)r0.z, t#0"), + INSTR_6XX(a0c81f00_e0200005, "sam.base0 (f32)(xyzw)r0.x, r0.z, s#1, a1.x"), + INSTR_6XX(a0c81108_e2000001, "sam.base0 (f32)(x)r2.x, r0.x, s#16, a1.x"), + INSTR_6XX(a048d107_cc080a07, "isaml.base3 
(s32)(x)r1.w, r0.w, r1.y, s#0, t#6"), - /* cat6 */ + /* cat6 */ - INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), - INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), - INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 r0.z, r0.x, 2"), + INSTR_5XX(c6e60000_00010600, "ldgb.untyped.4d.u32.1 r0.x, g[0], r1.x, r0.x"), /* ldgb.a.untyped.1dtype.u32.1 r0.x, g[r1.x], r0.x, 0 */ + INSTR_5XX(d7660204_02000a01, "(sy)stib.typed.2d.u32.1 g[1], r0.x, r0.z, r1.x"), /* (sy)stib.a.u32.2d.1 g[r1.x], r0.x, r0.z, 1. r1.x is offset in ibo, r0.x is value*/ + /* dEQP-VK.image.load_store.1d_array.r8g8b8a8_unorm */ + INSTR_5XX(c1a20006_0600ba01, "ldib.typed.2d.f32.4 r1.z, g[0], r0.z, r1.z"), /* ldib.a.f32.2d.4 r1.z, g[r0.z], r1.z, 0. r0.z is offset in ibo as src. r1.z */ + /* dEQP-VK.image.load_store.3d.r32g32b32a32_sint */ + INSTR_5XX(c1aa0003_0500fc01, "ldib.typed.3d.s32.4 r0.w, g[0], r0.w, r1.y"), /* ldib.a.s32.3d.4 r0.w, g[r0.w], r1.y, 0. r0.w is offset in ibo as src, and dst */ + /* dEQP-VK.binding_model.shader_access.primary_cmd_buf.storage_image.vertex.descriptor_array.3d */ + INSTR_5XX(c1a20204_0401fc01, "ldib.typed.3d.f32.4 r1.x, g[1], r1.w, r1.x"), /* ldib.a.f32.3d.4 r1.x, g[r1.w], r1.x, 1 */ + /* dEQP-VK.binding_model.shader_access.secondary_cmd_buf.with_push.storage_texel_buffer.vertex_fragment.single_descriptor.offset_zero */ + INSTR_5XX(c1a20005_0501be01, "ldib.typed.4d.f32.4 r1.y, g[0], r1.z, r1.y"), /* ldib.a.f32.1dtype.4 r1.y, g[r1.z], r1.y, 0 */ + /* dEQP-VK.texture.filtering.cube.formats.r8g8b8a8_snorm_nearest */ + INSTR_5XX(c1a60200_0000ba01, "ldib.typed.2d.u32.4 r0.x, g[1], r0.z, r0.x"), /* ldib.a.u32.2d.4 r0.x, g[r0.z], r0.x, 1 */ - // TODO is this a real instruction? Or float -6.0 ? - // INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true), - /* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */ - INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */ - INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */ - INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"), - INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"), - INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"), + // TODO is this a real instruction? Or float -6.0 ? 
+   // INSTR_6XX(c0c00000_00000000, "stg.f16 g[hr0.x], hr0.x, hr0.x", .parse_fail=true),
+   /* dEQP-GLES31.functional.tessellation.invariance.outer_edge_symmetry.isolines_equal_spacing_ccw */
+   INSTR_6XX(c0d20906_02800004, "stg.a.f32 g[r1.x+(r1.z)<<2], r0.z, 2"), /* stg.a.f32 g[r1.x+(r1.z<<2)], r0.z, 2 */
+   INSTR_6XX(c0da052e_01800042, "stg.a.s32 g[r0.z+(r11.z)<<2], r8.y, 1"), /* stg.a.s32 g[r0.z+(r11.z<<2)], r8.y, 1 */
+   INSTR_6XX(c0dc052e_01800042, "stg.a.u8 g[r0.z+(r11.z)<<2], hr8.y, 1"),
+   INSTR_6XX(c0ca0505_03800042, "stg.s32 g[r0.z+5], r8.y, 3"),
+   INSTR_6XX(c0ca0500_03800042, "stg.s32 g[r0.z], r8.y, 3"),
+   INSTR_6XX(c0ca0531_03800242, "stg.s32 g[r0.z+305], r8.y, 3"),
+   INSTR_5XX(c0ce0100_02800000, "stg.s8 g[r0.x], hr0.x, 2"),
+   INSTR_5XX(c0c00100_02800000, "stg.f16 g[r0.x], hr0.x, 2"),
-   /* Customely crafted */
-   INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"),
-   INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"),
+   /* Custom crafted */
+   INSTR_6XX(c0d61104_01800228, "stg.a.u32 g[r2.x+(r1.x+1)<<2], r5.x, 1"),
+   INSTR_6XX(c0d61104_01802628, "stg.a.u32 g[r2.x+r1.x<<4+3<<2], r5.x, 1"),
-   INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
-   INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
-   INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
-   INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
-   INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"),
+   INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
+   INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
+   INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
+   INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
+   INSTR_6XX(c0060003_0180c269, "ldg.u32 r0.w, g[r0.w+308], 1"),
+   INSTR_6XX(c0040003_0180c269, "ldg.u16 hr0.w, g[r0.w+308], 1"),
-   /* Found in TCS/TES shaders of GTA V */
-   INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */
+   /* Found in TCS/TES shaders of GTA V */
+   INSTR_6XX(c0020007_03c1420f, "ldg.a.f32 r1.w, g[r1.y+(r1.w+1)<<2], 3"), /* ldg.a.f32 r1.w, g[r1.y+((r1.w+1)<<2)], 3 */
-   /* Customely crafted */
-   INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"),
+   /* Custom crafted */
+   INSTR_6XX(c0020007_03c1740f, "ldg.a.f32 r1.w, g[r1.y+r1.w<<5+2<<2], 3"),
-   INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
-   INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
-   INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
-   INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
+   INSTR_6XX(c0020011_04c08023, "ldg.a.f32 r4.y, g[r0.z+(r4.y)<<2], 4"), /* ldg.a.f32 r4.y, g[r0.z+(r4.y<<2)], 4 */
+   INSTR_6XX(c0060006_01c18017, "ldg.a.u32 r1.z, g[r1.z+(r2.w)<<2], 1"), /* ldg.a.u32 r1.z, g[r1.z+(r2.w<<2)], 1 */
+   INSTR_6XX(c0000006_01c18017, "ldg.a.f16 hr1.z, g[r1.z+(r2.w)<<2], 1"),
+   INSTR_6XX(c0060006_0181800f, "ldg.u32 r1.z, g[r1.z+7], 1"),
+   INSTR_6XX(c0060006_01818001, "ldg.u32 r1.z, g[r1.z], 1"),
-   /* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */
-   INSTR_6XX(c7020020_01800000, "stc c[32], r0.x, 1", .parse_fail=true),
-   /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */
-
INSTR_6XX(c7060020_03800000, "stc c[32], r0.x, 3", .parse_fail=true), + /* dEQP-GLES3.functional.ubo.random.basic_arrays.0 */ + INSTR_6XX(c7020020_01800000, "stc c[32], r0.x, 1", .parse_fail=true), + /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */ + INSTR_6XX(c7060020_03800000, "stc c[32], r0.x, 3", .parse_fail=true), - /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */ - INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */ + /* dEQP-VK.image.image_size.cube_array.readonly_writeonly_1x1x12 */ + INSTR_6XX(c0260200_03676100, "stib.b.untyped.1d.u32.3.imm.base0 r0.x, r0.w, 1"), /* stib.untyped.u32.1d.3.mode4.base0 r0.x, r0.w, 1 */ + + INSTR_6XX(c0240402_00674100, "stib.b.untyped.1d.u16.1.imm.base0 r0.z, r0.x, 2"), #if 0 /* TODO blob sometimes/frequently sets b0, although there does not seem * to be an obvious pattern and our encoding never sets it. AFAICT it @@ -226,6 +245,8 @@ static const struct test { INSTR_6XX(c0260000_0063c000, "resinfo.b.untyped.1d.u32.1.imm r0.x, 0"), /* resinfo.u32.1d.mode0.base0 r0.x, 0 */ /* dEQP-VK.image.image_size.2d.readonly_12x34.txt */ INSTR_6XX(c0260000_0063c300, "resinfo.b.untyped.2d.u32.1.imm.base0 r0.x, 0"), /* resinfo.u32.2d.mode4.base0 r0.x, 0 */ + /* Custom test */ + INSTR_6XX(c0260000_0063c382, "resinfo.b.untyped.2d.u32.1.nonuniform.base1 r0.x, r0.x"), /* resinfo.u32.2d.mode6.base1 r0.x, r0.x */ /* dEQP-GLES31.functional.image_load_store.2d.image_size.readonly_writeonly_32x32.txt */ INSTR_5XX(c3e60000_00000200, "resinfo.u32.2d r0.x, g[0]"), /* resinfo.u32.2d r0.x, 0 */ @@ -278,22 +299,16 @@ static const struct test { /* LDC. Our disasm differs greatly from qcom here, and we've got some * important info they lack(?!), but same goes the other way. */ -#if 0 - /* TODO our encoding differs in b23 for these four.. 
unsure if that is dontcare bit */ /* dEQP-GLES31.functional.shaders.opaque_type_indexing.ubo.uniform_fragment */ - INSTR_6XX(c0260000_00c78040, "ldc.offset0.1.uniform r0.x, r0.x, r0.x"), /* ldc.1.mode1.base0 r0.x, 0, r0.x */ - INSTR_6XX(c0260201_00c78040, "ldc.offset0.1.uniform r0.y, r0.x, r0.y"), /* ldc.1.mode1.base0 r0.y, 0, r0.y */ + INSTR_6XX(c0260000_00c78040, "ldc.offset0.1.uniform r0.x, 0, r0.x"), /* ldc.1.mode1.base0 r0.x, 0, r0.x */ + INSTR_6XX(c0260201_00c78040, "ldc.offset0.1.uniform r0.y, 0, r0.y"), /* ldc.1.mode1.base0 r0.y, 0, r0.y */ /* dEQP-GLES31.functional.shaders.opaque_type_indexing.ubo.dynamically_uniform_fragment */ - INSTR_6XX(c0260000_00c78080, "ldc.offset0.1.nonuniform r0.x, r0.x, r0.x"), /* ldc.1.mode2.base0 r0.x, 0, r0.x */ - INSTR_6XX(c0260201_00c78080, "ldc.offset0.1.nonuniform r0.y, r0.x, r0.y"), /* ldc.1.mode2.base0 r0.y, 0, r0.y */ -#else - /* dEQP-GLES31.functional.shaders.opaque_type_indexing.ubo.uniform_fragment */ - INSTR_6XX(c0260000_00478040, "ldc.offset0.1.uniform r0.x, r0.x, r0.x"), /* ldc.1.mode1.base0 r0.x, 0, r0.x */ - INSTR_6XX(c0260201_00478040, "ldc.offset0.1.uniform r0.y, r0.x, r0.y"), /* ldc.1.mode1.base0 r0.y, 0, r0.y */ - /* dEQP-GLES31.functional.shaders.opaque_type_indexing.ubo.dynamically_uniform_fragment */ - INSTR_6XX(c0260000_00478080, "ldc.offset0.1.nonuniform r0.x, r0.x, r0.x"), /* ldc.1.mode2.base0 r0.x, 0, r0.x */ - INSTR_6XX(c0260201_00478080, "ldc.offset0.1.nonuniform r0.y, r0.x, r0.y"), /* ldc.1.mode2.base0 r0.y, 0, r0.y */ -#endif + INSTR_6XX(c0260000_00c78080, "ldc.offset0.1.nonuniform r0.x, 0, r0.x"), /* ldc.1.mode2.base0 r0.x, 0, r0.x */ + INSTR_6XX(c0260201_00c78080, "ldc.offset0.1.nonuniform r0.y, 0, r0.y"), /* ldc.1.mode2.base0 r0.y, 0, r0.y */ + + /* custom */ + INSTR_6XX(c0260201_ffc78080, "ldc.offset0.1.nonuniform r0.y, 255, r0.y"), /* ldc.1.mode2.base0 r0.y, 255, r0.y */ + /* custom shaders, loading .x, .y, .z, .w from an array of vec4 in block 0 */ INSTR_6XX(c0260000_00478000, "ldc.offset0.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */ INSTR_6XX(c0260000_00478200, "ldc.offset1.1.imm r0.x, r0.x, 0"), /* ldc.1.mode0.base0 r0.x, r0.x, 0 */ @@ -409,7 +424,6 @@ main(int argc, char **argv) printf(" Got: \"%s\"\n", disasm_output); retval = 1; decode_fails++; - continue; } /* diff --git a/mesa 3D driver/src/freedreno/isa/encode.c b/mesa 3D driver/src/freedreno/isa/encode.c index fb4e851f66..5b89353ec2 100644 --- a/mesa 3D driver/src/freedreno/isa/encode.c +++ b/mesa 3D driver/src/freedreno/isa/encode.c @@ -145,6 +145,20 @@ extract_ABSNEG(struct ir3_register *reg) } } +static inline int32_t +extract_reg_iim(struct ir3_register *reg) +{ + assert(reg->flags & IR3_REG_IMMED); + return reg->iim_val; +} + +static inline uint32_t +extract_reg_uim(struct ir3_register *reg) +{ + assert(reg->flags & IR3_REG_IMMED); + return reg->uim_val; +} + /** * This is a bit messy, to deal with the fact that the optional "s2en" * src is the first src, shifting everything else up by one. 
@@ -179,7 +193,11 @@ extract_cat5_DESC_MODE(struct ir3_instruction *instr) if (instr->flags & IR3_INSTR_S2EN) { if (instr->flags & IR3_INSTR_B) { if (instr->flags & IR3_INSTR_A1EN) { - return CAT5_BINDLESS_A1_UNIFORM; + if (instr->flags & IR3_INSTR_NONUNIF) { + return CAT5_BINDLESS_A1_NONUNIFORM; + } else { + return CAT5_BINDLESS_A1_UNIFORM; + } } else if (instr->flags & IR3_INSTR_NONUNIF) { return CAT5_BINDLESS_NONUNIFORM; } else { @@ -298,7 +316,7 @@ __cat3_src_case(struct encode_state *s, struct ir3_register *reg) void * isa_assemble(struct ir3_shader_variant *v) { - uint64_t *ptr, *instrs; + BITSET_WORD *ptr, *instrs; const struct ir3_info *info = &v->info; struct ir3 *shader = v->ir; @@ -311,7 +329,9 @@ isa_assemble(struct ir3_shader_variant *v) .instr = instr, }; - *(instrs++) = encode__instruction(&s, NULL, instr); + const bitmask_t encoded = encode__instruction(&s, NULL, instr); + store_instruction(instrs, encoded); + instrs += BITMASK_WORDS; } } diff --git a/mesa 3D driver/src/freedreno/isa/ir3-cat1.xml b/mesa 3D driver/src/freedreno/isa/ir3-cat1.xml index 384de20162..34a893b1cf 100644 --- a/mesa 3D driver/src/freedreno/isa/ir3-cat1.xml +++ b/mesa 3D driver/src/freedreno/isa/ir3-cat1.xml @@ -207,7 +207,7 @@ SOFTWARE. - src->uim_val + extract_reg_uim(src) diff --git a/mesa 3D driver/src/freedreno/isa/ir3-cat3.xml b/mesa 3D driver/src/freedreno/isa/ir3-cat3.xml index 5e638e1543..18491e2a1a 100644 --- a/mesa 3D driver/src/freedreno/isa/ir3-cat3.xml +++ b/mesa 3D driver/src/freedreno/isa/ir3-cat3.xml @@ -69,7 +69,7 @@ SOFTWARE. src->num >> 2 src->num & 0x3 - src->uim_val + extract_reg_uim(src) diff --git a/mesa 3D driver/src/freedreno/isa/ir3-cat5.xml b/mesa 3D driver/src/freedreno/isa/ir3-cat5.xml index a75c428d69..a129b75fb0 100644 --- a/mesa 3D driver/src/freedreno/isa/ir3-cat5.xml +++ b/mesa 3D driver/src/freedreno/isa/ir3-cat5.xml @@ -85,15 +85,7 @@ SOFTWARE. {SY}{JP}{NAME}{3D}{A}{O}{P}{S} {TYPE}({WRMASK}){DST_HALF}{DST}{SRC1}{SRC2}{SAMP}{TEX} - - - ({TYPE} == 0) /* f16 */ || - ({TYPE} == 2) /* u16 */ || - ({TYPE} == 4) /* s16 */ || - ({TYPE} == 6) /* u8 */ || - ({TYPE} == 7) /* s8 */ - - + @@ -481,12 +473,7 @@ SOFTWARE. 0000 - - src->cat5.samp >> 4 + src->cat5.tex diff --git a/mesa 3D driver/src/freedreno/isa/ir3-cat6.xml b/mesa 3D driver/src/freedreno/isa/ir3-cat6.xml index e4bccd0fc1..7c1b2516f1 100644 --- a/mesa 3D driver/src/freedreno/isa/ir3-cat6.xml +++ b/mesa 3D driver/src/freedreno/isa/ir3-cat6.xml @@ -32,6 +32,8 @@ SOFTWARE. 110 + + src->cat6.type @@ -59,15 +61,15 @@ SOFTWARE. - {SY}{JP}{NAME}.{TYPE} {DST}, g[{SRC1}{OFF}], {SIZE} + {SY}{JP}{NAME}.{TYPE} {TYPE_HALF}{DST}, g[{SRC1}{OFF}], {SIZE} 0 - src->srcs[1]->iim_val - src->srcs[2]->uim_val + extract_reg_iim(src->srcs[1]) + extract_reg_uim(src->srcs[2]) @@ -79,12 +81,12 @@ SOFTWARE. - {SY}{JP}{NAME}.{TYPE} {DST}, g[{SRC1}+({SRC2}{OFF})<<{SRC2_BYTE_SHIFT}], {SIZE} + {SY}{JP}{NAME}.{TYPE} {TYPE_HALF}{DST}, g[{SRC1}+({SRC2}{OFF})<<{SRC2_BYTE_SHIFT}], {SIZE} - {SY}{JP}{NAME}.{TYPE} {DST}, g[{SRC1}+{SRC2}<<{SRC2_BYTE_SHIFT}{OFF}<<2], {SIZE} + {SY}{JP}{NAME}.{TYPE} {TYPE_HALF}{DST}, g[{SRC1}+{SRC2}<<{SRC2_BYTE_SHIFT}{OFF}<<2], {SIZE} {SRC2_ADD_DWORD_SHIFT} > 0 @@ -101,9 +103,9 @@ SOFTWARE. src->srcs[1] - src->srcs[2]->uim_val - src->srcs[3]->uim_val - src->srcs[4]->uim_val + extract_reg_uim(src->srcs[2]) + extract_reg_uim(src->srcs[3]) + extract_reg_uim(src->srcs[4]) @@ -129,22 +131,22 @@ SOFTWARE. 
- {SY}{JP}{NAME}.{TYPE} g[{SRC1}{OFF}], {SRC3}, {SIZE} + {SY}{JP}{NAME}.{TYPE} g[{SRC1}{OFF}], {TYPE_HALF}{SRC3}, {SIZE} ({OFF_HI} << 8) | {OFF_LO} - + 0 - src->srcs[1]->iim_val - src->srcs[1]->iim_val >> 8 + extract_reg_iim(src->srcs[1]) & 0xff + extract_reg_iim(src->srcs[1]) >> 8 src->srcs[2] - src->srcs[3]->uim_val + extract_reg_uim(src->srcs[3]) @@ -156,12 +158,12 @@ SOFTWARE. - {SY}{JP}{NAME}.{TYPE} g[{SRC1}+({SRC2}{OFF})<<{DST_BYTE_SHIFT}], {SRC3}, {SIZE} + {SY}{JP}{NAME}.{TYPE} g[{SRC1}+({SRC2}{OFF})<<{DST_BYTE_SHIFT}], {TYPE_HALF}{SRC3}, {SIZE} - {SY}{JP}{NAME}.{TYPE} g[{SRC1}+{SRC2}<<{DST_BYTE_SHIFT}{OFF}<<2], {SRC3}, {SIZE} + {SY}{JP}{NAME}.{TYPE} g[{SRC1}+{SRC2}<<{DST_BYTE_SHIFT}{OFF}<<2], {TYPE_HALF}{SRC3}, {SIZE} {SRC2_ADD_DWORD_SHIFT} > 0 @@ -178,10 +180,10 @@ SOFTWARE. src->srcs[1] - src->srcs[2]->uim_val - src->srcs[3]->uim_val + extract_reg_uim(src->srcs[2]) + extract_reg_uim(src->srcs[3]) src->srcs[4] - src->srcs[5]->uim_val + extract_reg_uim(src->srcs[5]) @@ -196,9 +198,9 @@ SOFTWARE. xxxxxxxxx xx - src->srcs[1]->uim_val + extract_reg_uim(src->srcs[1]) src->srcs[0] - src->srcs[2]->uim_val + extract_reg_uim(src->srcs[2]) @@ -249,8 +251,8 @@ SOFTWARE. xx 11111 - src->srcs[1]->uim_val - src->srcs[0]->uim_val + extract_reg_uim(src->srcs[1]) + extract_reg_uim(src->srcs[0]) @@ -260,7 +262,7 @@ SOFTWARE. - + xxxxxxxxx 1 @@ -278,7 +280,7 @@ SOFTWARE. src->cat6.dst_offset & 0xff src->srcs[1] src->srcs[0]" - src->srcs[2]->uim_val + extract_reg_uim(src->srcs[2]) @@ -343,7 +345,7 @@ SOFTWARE. xx 11100 - src->srcs[0]->uim_val + extract_reg_uim(src->srcs[0]) src->srcs[1] @@ -378,21 +380,35 @@ SOFTWARE. - - + + + + + + + + + + + + x + + + src->cat6.d - 1 + src + src->cat6.iim_val - 1 + src->srcs[0] + !!(src->srcs[0]->flags & IR3_REG_IMMED) + + + + {SY}{JP}{NAME}.{TYPED}.{D}d.{TYPE}.{TYPE_SIZE} {DST}, g[{SSBO}], {SRC1}, {SRC2} - - - - x xxxxxxxx - - - @@ -402,19 +418,8 @@ SOFTWARE. - 0 - - - - x - - 11011 + x - src->cat6.d - 1 - src - src->cat6.iim_val - 1 - src->srcs[0] - !!(src->srcs[0]->flags & IR3_REG_IMMED) src->srcs[1] !!(src->srcs[1]->flags & IR3_REG_IMMED) src->srcs[2] @@ -422,20 +427,24 @@ SOFTWARE. - + + 11011 + x + + + + 00110 + 1 + + + {SY}{JP}{NAME}.{TYPED}.{D}d.{TYPE}.{TYPE_SIZE} g[{SSBO}], {SRC1}, {SRC2}, {SRC3} - - - - 1 + 1 - - - xxxxxxxxx @@ -445,17 +454,7 @@ SOFTWARE. - - - - x - - src->cat6.d - 1 - src - src->cat6.iim_val - 1 - src->srcs[0] - !!(src->srcs[0]->flags & IR3_REG_IMMED) src->srcs[1] !!(src->srcs[1]->flags & IR3_REG_IMMED) src->srcs[2] @@ -465,11 +464,11 @@ SOFTWARE. - + 11100 - + 11101 @@ -721,8 +720,9 @@ SOFTWARE. xxxxxxxx - + + 1x @@ -894,7 +894,7 @@ SOFTWARE. {IMMED} - + r{GPR}.{SWIZ} @@ -904,7 +904,7 @@ SOFTWARE. src->num >> 2 src->num & 0x3 - src->iim_val + extract_reg_iim(src) diff --git a/mesa 3D driver/src/freedreno/isa/ir3-common.xml b/mesa 3D driver/src/freedreno/isa/ir3-common.xml index 0be7eeb404..b6717b557e 100644 --- a/mesa 3D driver/src/freedreno/isa/ir3-common.xml +++ b/mesa 3D driver/src/freedreno/isa/ir3-common.xml @@ -149,7 +149,7 @@ SOFTWARE. 100 - src->uim_val + extract_reg_iim(src) @@ -175,7 +175,7 @@ SOFTWARE. 101 - src->uim_val + extract_reg_uim(src) @@ -308,6 +308,14 @@ SOFTWARE. 
+ + ({TYPE} == 0) /* f16 */ || + ({TYPE} == 2) /* u16 */ || + ({TYPE} == 4) /* s16 */ || + ({TYPE} == 6) /* u8 */ || + ({TYPE} == 7) /* s8 */ + + diff --git a/mesa 3D driver/src/freedreno/isa/ir3-disasm.c b/mesa 3D driver/src/freedreno/isa/ir3-disasm.c index ee1c0c1660..b1a1a1fc16 100644 --- a/mesa 3D driver/src/freedreno/isa/ir3-disasm.c +++ b/mesa 3D driver/src/freedreno/isa/ir3-disasm.c @@ -35,9 +35,9 @@ static void -disasm_instr_cb(void *d, unsigned n, uint64_t instr) +disasm_instr_cb(void *d, unsigned n, void *instr) { - uint32_t *dwords = (uint32_t *)&instr; + uint32_t *dwords = (uint32_t *)instr; printf("%3d[%08x_%08x] ", n, dwords[1], dwords[0]); } diff --git a/mesa 3D driver/src/freedreno/isa/isa.h b/mesa 3D driver/src/freedreno/isa/isa.h index 84f12081b4..5056c240b6 100644 --- a/mesa 3D driver/src/freedreno/isa/isa.h +++ b/mesa 3D driver/src/freedreno/isa/isa.h @@ -76,7 +76,7 @@ struct isa_decode_options { /** * Callback prior to instruction decode */ - void (*instr_cb)(void *data, unsigned n, uint64_t instr); + void (*instr_cb)(void *data, unsigned n, void *instr); }; void isa_decode(void *bin, int sz, FILE *out, const struct isa_decode_options *options); diff --git a/mesa 3D driver/src/freedreno/isa/meson.build b/mesa 3D driver/src/freedreno/isa/meson.build index 4e08541ffc..4110ad0db7 100644 --- a/mesa 3D driver/src/freedreno/isa/meson.build +++ b/mesa 3D driver/src/freedreno/isa/meson.build @@ -28,30 +28,27 @@ isa_depend_files = [ 'ir3-cat5.xml', 'ir3-cat6.xml', 'ir3-cat7.xml', - 'isa.py', ] -ir3_isa_c = custom_target( - 'ir3-isa.c', - input: ['decode.py', 'ir3.xml'], - output: 'ir3-isa.c', +ir3_isa = custom_target( + 'ir3-isa', + input: ['ir3.xml'], + output: ['isaspec-isa.h', 'ir3-isa.c', 'ir3-isa.h'], command: [ - prog_python, '@INPUT0@', '@INPUT1@', '@OUTPUT@' + prog_isaspec_decode, '@INPUT@', '@OUTPUT@' ], depend_files: isa_depend_files, ) decode_files = [ - ir3_isa_c, + ir3_isa, 'isa.h', - 'decode.h', - 'decode.c', ] libir3decode = static_library( 'ir3decode', decode_files, - dependencies: idep_mesautil, + dependencies: [idep_mesautil, idep_isaspec_decode], include_directories: [ inc_include, inc_src, @@ -75,10 +72,10 @@ ir3disasm = executable( encode_h = custom_target( 'encode.h', - input: ['encode.py', 'ir3.xml'], + input: ['ir3.xml'], output: 'encode.h', command: [ - prog_python, '@INPUT0@', '@INPUT1@', '@OUTPUT@' + prog_isaspec_encode, '@INPUT@', '@OUTPUT@' ], depend_files: isa_depend_files, ) diff --git a/mesa 3D driver/src/freedreno/meson.build b/mesa 3D driver/src/freedreno/meson.build index 8d1f12b63c..f24221e735 100644 --- a/mesa 3D driver/src/freedreno/meson.build +++ b/mesa 3D driver/src/freedreno/meson.build @@ -25,13 +25,13 @@ rnn_src_path = meson.source_root() + '/src/freedreno/registers' rnn_install_path = get_option('datadir') + '/freedreno/registers' rnn_path = rnn_src_path + ':' + get_option('prefix') + '/' + rnn_install_path -dep_lua = dependency('lua53', required: false) -if not dep_lua.found() - dep_lua = dependency('lua52', required: false) -endif -if not dep_lua.found() - dep_lua = dependency('lua', required: false, version : '>=5.2') -endif +# TODO: use multi-argument dependency() in meson 0.60 +foreach lua : ['lua54', 'lua53', 'lua52', 'lua'] + dep_lua = dependency(lua, required: false, version: '>=5.2') + if dep_lua.found() + break + endif +endforeach dep_libarchive = dependency('libarchive', required: false) dep_libxml2 = dependency('libxml-2.0', required: false) diff --git a/mesa 3D driver/src/freedreno/registers/adreno/a4xx.xml b/mesa 3D 
diff --git a/mesa 3D driver/src/freedreno/registers/adreno/a4xx.xml b/mesa 3D driver/src/freedreno/registers/adreno/a4xx.xml
index 3f0e1ae0e4..f98ac60b70 100644
--- a/mesa 3D driver/src/freedreno/registers/adreno/a4xx.xml
+++ b/mesa 3D driver/src/freedreno/registers/adreno/a4xx.xml
@@ -1670,7 +1670,7 @@ perhaps they should be taken with a grain of salt
  [register XML hunk; markup lost in extraction]
@@ -2097,15 +2097,24 @@ perhaps they should be taken with a grain of salt
  [register XML hunk; markup lost in extraction]
diff --git a/mesa 3D driver/src/freedreno/registers/adreno/a5xx.xml b/mesa 3D driver/src/freedreno/registers/adreno/a5xx.xml
index 55e90f29b6..b85f195402 100644
--- a/mesa 3D driver/src/freedreno/registers/adreno/a5xx.xml
+++ b/mesa 3D driver/src/freedreno/registers/adreno/a5xx.xml
@@ -1825,14 +1825,9 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
  [register XML hunk; markup lost in extraction]
@@ -1851,7 +1846,7 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
  [register XML hunk; markup lost in extraction]
@@ -1978,14 +1973,9 @@ bit 7 for RECTLIST (clear) when z32s8 (used for clear of depth32? not set
  [register XML hunk; markup lost in extraction]
diff --git a/mesa 3D driver/src/freedreno/registers/adreno/a6xx.xml b/mesa 3D driver/src/freedreno/registers/adreno/a6xx.xml
index 92b93740c7..4a966dee8f 100644
--- a/mesa 3D driver/src/freedreno/registers/adreno/a6xx.xml
+++ b/mesa 3D driver/src/freedreno/registers/adreno/a6xx.xml
@@ -104,16 +104,29 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
  [register XML hunk; markup lost in extraction]
@@ -1552,18 +1565,11 @@ to upconvert to 32b float internally?
  [register XML hunk; markup lost in extraction]
@@ -1591,7 +1597,7 @@ to upconvert to 32b float internally?
  [register XML hunk; markup lost in extraction]
@@ -2266,6 +2265,15 @@ to upconvert to 32b float internally?
  [register XML hunk; markup lost in extraction]
@@ -2538,7 +2546,7 @@
  [register XML hunk; markup lost in extraction]
@@ -2566,6 +2574,11 @@
  [register XML hunk; markup lost in extraction]
@@ -3261,6 +3274,26 @@
  [register XML hunk; markup lost in extraction]
@@ -3422,7 +3455,7 @@
  [register XML hunk; markup lost in extraction]
@@ -3560,6 +3593,14 @@
  [register XML hunk; markup lost in extraction]
diff --git a/mesa 3D driver/src/freedreno/registers/adreno/adreno_common.xml b/mesa 3D driver/src/freedreno/registers/adreno/adreno_common.xml
index d70fbaf10c..d9fc4ab3f4 100644
--- a/mesa 3D driver/src/freedreno/registers/adreno/adreno_common.xml
+++ b/mesa 3D driver/src/freedreno/registers/adreno/adreno_common.xml
@@ -366,5 +366,14 @@ xsi:schemaLocation="http://nouveau.freedesktop.org/ rules-ng.xsd">
+ Line mode for a5xx+
+ Note that Bresenham lines are only supported with MSAA disabled.
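The new adreno_common.xml enum above documents the line-rasterization modes shared by a5xx and later, with the caveat that Bresenham lines only work when MSAA is disabled; the tu_clear_blit.c hunks later in this patch thread a line_mode value into tu6_emit_msaa() for the same reason. A sketch of selection logic honoring that constraint; the enum value names here are assumptions, since extraction stripped them from this hunk, and only RECTANGULAR is visible later in the patch:

    /* Hypothetical helper: fall back to rectangular lines whenever MSAA is
     * on, because Bresenham lines are documented as unsupported with MSAA. */
    enum a5xx_line_mode { BRESENHAM = 0, RECTANGULAR = 1 };

    static enum a5xx_line_mode
    choose_line_mode(bool want_bresenham, unsigned samples)
    {
       if (want_bresenham && samples <= 1)
          return BRESENHAM;
       return RECTANGULAR;
    }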
+ + + + + + diff --git a/mesa 3D driver/src/freedreno/rnn/meson.build b/mesa 3D driver/src/freedreno/rnn/meson.build index b9379e6e23..a01f741946 100644 --- a/mesa 3D driver/src/freedreno/rnn/meson.build +++ b/mesa 3D driver/src/freedreno/rnn/meson.build @@ -57,3 +57,13 @@ headergen2 = executable( build_by_default: with_tools.contains('freedreno'), install: false ) + +if with_tests + foreach xml : ['adreno.xml', 'msm.xml'] + test('headergen2-' + xml, + headergen2, + args: [xml], + suite: 'freedreno', + ) + endforeach +endif diff --git a/mesa 3D driver/src/freedreno/rnn/rnndec.c b/mesa 3D driver/src/freedreno/rnn/rnndec.c index e2a738db81..8bf230cc4b 100644 --- a/mesa 3D driver/src/freedreno/rnn/rnndec.c +++ b/mesa 3D driver/src/freedreno/rnn/rnndec.c @@ -166,10 +166,11 @@ char *rnndec_decodeval(struct rnndeccontext *ctx, struct rnntypeinfo *ti, uint64 int bitfieldsnum; char *tmp; const char *ctmp; - uint64_t mask, value_orig; + uint64_t mask; + + uint64_t value_orig = value; if (!ti) goto failhex; - value_orig = value; value = (value & typeinfo_mask(ti)) >> ti->low; value <<= ti->shr; @@ -417,7 +418,7 @@ static struct rnndecaddrinfo *trymatch (struct rnndeccontext *ctx, struct rnndel offset = addr - (elems[i]->offset + elems[i]->stride * idx); int extraidx = (elems[i]->length != 1); int nindnum = (elems[i]->name ? 0 : indicesnum + extraidx); - uint64_t nind[nindnum]; + uint64_t nind[MAX2(nindnum, 1)]; if (!elems[i]->name) { for (j = 0; j < indicesnum; j++) nind[j] = indices[j]; diff --git a/mesa 3D driver/src/freedreno/vulkan/meson.build b/mesa 3D driver/src/freedreno/vulkan/meson.build index ca8d847183..9a1a46468e 100644 --- a/mesa 3D driver/src/freedreno/vulkan/meson.build +++ b/mesa 3D driver/src/freedreno/vulkan/meson.build @@ -50,52 +50,41 @@ libtu_files = files( 'tu_shader.c', 'tu_util.c', 'tu_util.h', + 'tu_perfetto.h', 'vk_format.h', ) tu_deps = [] tu_flags = [] -tu_link_with = [] tu_wsi = false if with_platform_x11 tu_deps += dep_xcb_dri3 - tu_flags += [ - '-DVK_USE_PLATFORM_XCB_KHR', - '-DVK_USE_PLATFORM_XLIB_KHR', - ] - libtu_files += files('tu_wsi_x11.c') tu_wsi = true endif if with_platform_wayland tu_deps += dep_wayland_client - tu_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR' - libtu_files += files('tu_wsi_wayland.c') tu_wsi = true endif if system_has_kms_drm and not with_platform_android - tu_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR' libtu_files += files('tu_wsi_display.c') tu_wsi = true endif if tu_wsi libtu_files += 'tu_wsi.c' - tu_link_with += libvulkan_wsi endif if with_platform_android - tu_flags += '-DVK_USE_PLATFORM_ANDROID_KHR' libtu_files += files('tu_android.c') tu_deps += [dep_android] endif if with_xlib_lease tu_deps += [dep_xlib_xrandr] - tu_flags += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT' endif if with_freedreno_kgsl @@ -106,9 +95,28 @@ else tu_deps += dep_libdrm endif +tu_tracepoints = custom_target( + 'tu_tracepoints.[ch]', + input: 'tu_tracepoints.py', + output: ['tu_tracepoints.c', 'tu_tracepoints.h', 'tu_tracepoints_perfetto.h'], + command: [ + prog_python, '@INPUT@', + '-p', join_paths(meson.source_root(), 'src/util/perf/'), + '--utrace-src', '@OUTPUT0@', + '--utrace-hdr', '@OUTPUT1@', + '--perfetto-hdr', '@OUTPUT2@', + ], + depend_files: u_trace_py, +) + +if with_perfetto + libtu_files += ['tu_perfetto.cc', 'tu_perfetto_util.c'] + tu_deps += dep_perfetto +endif + libvulkan_freedreno = shared_library( 'vulkan_freedreno', - [libtu_files, tu_entrypoints, freedreno_xml_header_files], + [libtu_files, tu_entrypoints, tu_tracepoints, freedreno_xml_header_files, sha1_h], 
include_directories : [ inc_include, inc_src, @@ -117,11 +125,9 @@ libvulkan_freedreno = shared_library( inc_gallium, inc_gallium_aux, inc_compiler, - inc_vulkan_wsi, inc_freedreno, ], link_with : [ - tu_link_with, libfreedreno_ir3, libfreedreno_layout, libfreedreno_perfcntrs, @@ -136,6 +142,7 @@ libvulkan_freedreno = shared_library( idep_nir, tu_deps, idep_vulkan_util, + idep_vulkan_wsi, idep_mesautil, ], c_args : [no_override_init_args, tu_flags], diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_android.c b/mesa 3D driver/src/freedreno/vulkan/tu_android.c index e59c9e605f..e285c1e7b9 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_android.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_android.c @@ -142,7 +142,7 @@ tu_gralloc_info_other(struct tu_device *device, */ if (gralloc_info->handle->numInts < 2) { - return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "VkNativeBufferANDROID::handle::numInts is %d, " "expected at least 2 for qcom gralloc", gralloc_info->handle->numFds); @@ -150,7 +150,7 @@ tu_gralloc_info_other(struct tu_device *device, uint32_t gmsm = ('g' << 24) | ('m' << 16) | ('s' << 8) | 'm'; if (handle_data[0] != gmsm) { - return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "private_handle_t::magic is %x, expected %x", handle_data[0], gmsm); } @@ -164,7 +164,7 @@ tu_gralloc_info_other(struct tu_device *device, */ *dma_buf = handle_fds[0]; } else { - return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE, + return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE, "VkNativeBufferANDROID::handle::numFds is %d, " "expected 1 (gbm_gralloc) or 2 (qcom gralloc)", gralloc_info->handle->numFds); @@ -358,7 +358,7 @@ format_supported_with_usage(VkDevice device_h, VkFormat format, result = tu_GetPhysicalDeviceImageFormatProperties2( phys_dev_h, &image_format_info, &image_format_props); if (result != VK_SUCCESS) { - return vk_errorf(device->instance, result, + return vk_errorf(device, result, "tu_GetPhysicalDeviceImageFormatProperties2 failed " "inside %s", __func__); @@ -385,7 +385,7 @@ setup_gralloc0_usage(struct tu_device *device, VkFormat format, * gralloc swapchains. 
*/ if (imageUsage != 0) { - return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED, "unsupported VkImageUsageFlags(0x%x) for gralloc " "swapchain", imageUsage); diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_clear_blit.c b/mesa 3D driver/src/freedreno/vulkan/tu_clear_blit.c index 5f80034e89..2e1f10ab3c 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_clear_blit.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_clear_blit.c @@ -19,6 +19,8 @@ #include "util/half_float.h" #include "compiler/nir/nir_builder.h" +#include "tu_tracepoints.h" + static uint32_t tu_pack_float32_for_unorm(float val, int bits) { @@ -28,27 +30,27 @@ tu_pack_float32_for_unorm(float val, int bits) /* r2d_ = BLIT_OP_SCALE operations */ static enum a6xx_2d_ifmt -format_to_ifmt(VkFormat format) +format_to_ifmt(enum pipe_format format) { - if (format == VK_FORMAT_D24_UNORM_S8_UINT || - format == VK_FORMAT_X8_D24_UNORM_PACK32) + if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + format == PIPE_FORMAT_Z24X8_UNORM) return R2D_UNORM8; /* get_component_bits doesn't work with depth/stencil formats: */ - if (format == VK_FORMAT_D16_UNORM || format == VK_FORMAT_D32_SFLOAT) + if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT) return R2D_FLOAT32; - if (format == VK_FORMAT_S8_UINT) + if (format == PIPE_FORMAT_S8_UINT) return R2D_INT8; /* use the size of the red channel to find the corresponding "ifmt" */ - bool is_int = vk_format_is_int(format); - switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) { + bool is_int = util_format_is_pure_integer(format); + switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) { case 4: case 5: case 8: return is_int ? R2D_INT8 : R2D_UNORM8; case 10: case 11: return is_int ? R2D_INT16 : R2D_FLOAT16; case 16: - if (vk_format_is_float(format)) + if (util_format_is_float(format)) return R2D_FLOAT16; return is_int ? 
R2D_INT16 : R2D_FLOAT32; case 32: @@ -80,38 +82,38 @@ r2d_coords(struct tu_cs *cs, } static void -r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) +r2d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val) { uint32_t clear_value[4] = {}; switch (format) { - case VK_FORMAT_X8_D24_UNORM_PACK32: - case VK_FORMAT_D24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: /* cleared as r8g8b8a8_unorm using special format */ clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24); clear_value[1] = clear_value[0] >> 8; clear_value[2] = clear_value[0] >> 16; clear_value[3] = val->depthStencil.stencil; break; - case VK_FORMAT_D16_UNORM: - case VK_FORMAT_D32_SFLOAT: + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z32_FLOAT: /* R2D_FLOAT32 */ clear_value[0] = fui(val->depthStencil.depth); break; - case VK_FORMAT_S8_UINT: + case PIPE_FORMAT_S8_UINT: clear_value[0] = val->depthStencil.stencil; break; - case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: + case PIPE_FORMAT_R9G9B9E5_FLOAT: /* cleared as UINT32 */ clear_value[0] = float3_to_rgb9e5(val->color.float32); break; default: - assert(!vk_format_is_depth_or_stencil(format)); - const struct util_format_description *desc = vk_format_description(format); + assert(!util_format_is_depth_or_stencil(format)); + const struct util_format_description *desc = util_format_description(format); enum a6xx_2d_ifmt ifmt = format_to_ifmt(format); assert(desc && (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN || - format == VK_FORMAT_B10G11R11_UFLOAT_PACK32)); + format == PIPE_FORMAT_R11G11B10_FLOAT)); for (unsigned i = 0; i < desc->nr_channels; i++) { const struct util_format_channel_description *ch = &desc->channel[i]; @@ -142,7 +144,7 @@ r2d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) static void r2d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - const struct tu_image_view *iview, + const struct fdl6_view *iview, uint32_t layer, VkFilter filter) { @@ -168,7 +170,7 @@ r2d_src_stencil(struct tu_cmd_buffer *cmd, { tu_cs_emit_pkt4(cs, REG_A6XX_SP_PS_2D_SRC_INFO, 5); tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS); - tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE); + tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE); tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer); /* SP_PS_2D_SRC_PITCH has shifted pitch field */ tu_cs_emit(cs, iview->stencil_PITCH << 9); @@ -177,17 +179,17 @@ r2d_src_stencil(struct tu_cmd_buffer *cmd, static void r2d_src_buffer(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat vk_format, + enum pipe_format format, uint64_t va, uint32_t pitch, uint32_t width, uint32_t height) { - struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR); + struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR); tu_cs_emit_regs(cs, A6XX_SP_PS_2D_SRC_INFO( - .color_format = format.fmt, - .color_swap = format.swap, - .srgb = vk_format_is_srgb(vk_format), + .color_format = fmt.fmt, + .color_swap = fmt.swap, + .srgb = util_format_is_srgb(format), .unk20 = 1, .unk22 = 1), A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height), @@ -196,7 +198,7 @@ r2d_src_buffer(struct tu_cmd_buffer *cmd, } static void -r2d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer) { tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4); tu_cs_emit(cs, iview->RB_2D_DST_INFO); @@ -216,15 
+218,15 @@ r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t la } static void -r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) +r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch) { - struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR); + struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR); tu_cs_emit_regs(cs, A6XX_RB_2D_DST_INFO( - .color_format = format.fmt, - .color_swap = format.swap, - .srgb = vk_format_is_srgb(vk_format)), + .color_format = fmt.fmt, + .color_swap = fmt.swap, + .srgb = util_format_is_srgb(format)), A6XX_RB_2D_DST(.qword = va), A6XX_RB_2D_DST_PITCH(pitch)); } @@ -232,24 +234,25 @@ r2d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch static void r2d_setup_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat vk_format, + enum pipe_format format, VkImageAspectFlags aspect_mask, unsigned blit_param, bool clear, bool ubwc, bool scissor) { - enum a6xx_format format = tu6_base_format(vk_format); - enum a6xx_2d_ifmt ifmt = format_to_ifmt(vk_format); + enum a6xx_format fmt = tu6_base_format(format); + enum a6xx_2d_ifmt ifmt = format_to_ifmt(format); + uint32_t unknown_8c01 = 0; - if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { - format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + if ((format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) { + fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; } /* note: the only format with partial clearing is D24S8 */ - if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { /* preserve stencil channel */ if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) unknown_8c01 = 0x08000041; @@ -265,10 +268,10 @@ r2d_setup_common(struct tu_cmd_buffer *cmd, .scissor = scissor, .rotate = blit_param, .solid_color = clear, - .d24s8 = format == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear, - .color_format = format, + .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear, + .color_format = fmt, .mask = 0xf, - .ifmt = vk_format_is_srgb(vk_format) ? R2D_UNORM8_SRGB : ifmt, + .ifmt = util_format_is_srgb(format) ? 
R2D_UNORM8_SRGB : ifmt, ).value; tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1); @@ -277,21 +280,21 @@ r2d_setup_common(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); tu_cs_emit(cs, blit_cntl); - if (format == FMT6_10_10_10_2_UNORM_DEST) - format = FMT6_16_16_16_16_FLOAT; + if (fmt == FMT6_10_10_10_2_UNORM_DEST) + fmt = FMT6_16_16_16_16_FLOAT; tu_cs_emit_regs(cs, A6XX_SP_2D_DST_FORMAT( - .sint = vk_format_is_sint(vk_format), - .uint = vk_format_is_uint(vk_format), - .color_format = format, - .srgb = vk_format_is_srgb(vk_format), + .sint = util_format_is_pure_sint(format), + .uint = util_format_is_pure_uint(format), + .color_format = fmt, + .srgb = util_format_is_srgb(format), .mask = 0xf)); } static void r2d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat vk_format, + enum pipe_format format, VkImageAspectFlags aspect_mask, unsigned blit_param, bool clear, @@ -302,7 +305,7 @@ r2d_setup(struct tu_cmd_buffer *cmd, tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM); - r2d_setup_common(cmd, cs, vk_format, aspect_mask, blit_param, clear, ubwc, false); + r2d_setup_common(cmd, cs, format, aspect_mask, blit_param, clear, ubwc, false); } static void @@ -694,7 +697,8 @@ r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool blit, } } - tu6_emit_msaa(cs, samples); + cmd->state.line_mode = RECTANGULAR; + tu6_emit_msaa(cs, samples, cmd->state.line_mode); } static void @@ -746,7 +750,7 @@ r3d_coords(struct tu_cs *cs, } static void -r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) +r3d_clear_value(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val) { tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3 + 4); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) | @@ -757,8 +761,8 @@ r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); switch (format) { - case VK_FORMAT_X8_D24_UNORM_PACK32: - case VK_FORMAT_D24_UNORM_S8_UINT: { + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: { /* cleared as r8g8b8a8_unorm using special format */ uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24); tu_cs_emit(cs, fui((tmp & 0xff) / 255.0f)); @@ -766,14 +770,14 @@ r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) tu_cs_emit(cs, fui((tmp >> 16 & 0xff) / 255.0f)); tu_cs_emit(cs, fui((val->depthStencil.stencil & 0xff) / 255.0f)); } break; - case VK_FORMAT_D16_UNORM: - case VK_FORMAT_D32_SFLOAT: + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z32_FLOAT: tu_cs_emit(cs, fui(val->depthStencil.depth)); tu_cs_emit(cs, 0); tu_cs_emit(cs, 0); tu_cs_emit(cs, 0); break; - case VK_FORMAT_S8_UINT: + case PIPE_FORMAT_S8_UINT: tu_cs_emit(cs, val->depthStencil.stencil & 0xff); tu_cs_emit(cs, 0); tu_cs_emit(cs, 0); @@ -781,7 +785,7 @@ r3d_clear_value(struct tu_cs *cs, VkFormat format, const VkClearValue *val) break; default: /* as color formats use clear value as-is */ - assert(!vk_format_is_depth_or_stencil(format)); + assert(!util_format_is_depth_or_stencil(format)); tu_cs_emit_array(cs, val->color.uint32, 4); break; } @@ -851,7 +855,7 @@ r3d_src_common(struct tu_cmd_buffer *cmd, static void r3d_src(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - const struct tu_image_view *iview, + const struct fdl6_view *iview, uint32_t layer, VkFilter filter) { @@ -864,23 +868,23 @@ r3d_src(struct tu_cmd_buffer *cmd, static void r3d_src_buffer(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - 
VkFormat vk_format, + enum pipe_format format, uint64_t va, uint32_t pitch, uint32_t width, uint32_t height) { uint32_t desc[A6XX_TEX_CONST_DWORDS]; - struct tu_native_format format = tu6_format_texture(vk_format, TILE6_LINEAR); + struct tu_native_format fmt = tu6_format_texture(format, TILE6_LINEAR); desc[0] = - COND(vk_format_is_srgb(vk_format), A6XX_TEX_CONST_0_SRGB) | - A6XX_TEX_CONST_0_FMT(format.fmt) | - A6XX_TEX_CONST_0_SWAP(format.swap) | + COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) | + A6XX_TEX_CONST_0_FMT(fmt.fmt) | + A6XX_TEX_CONST_0_SWAP(fmt.swap) | A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) | // XXX to swizzle into .w for stencil buffer_to_image - A6XX_TEX_CONST_0_SWIZ_Y(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) | - A6XX_TEX_CONST_0_SWIZ_Z(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) | - A6XX_TEX_CONST_0_SWIZ_W(vk_format == VK_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W); + A6XX_TEX_CONST_0_SWIZ_Y(format == PIPE_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Y) | + A6XX_TEX_CONST_0_SWIZ_Z(format == PIPE_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_Z) | + A6XX_TEX_CONST_0_SWIZ_W(format == PIPE_FORMAT_R8_UNORM ? A6XX_TEX_X : A6XX_TEX_W); desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); desc[2] = A6XX_TEX_CONST_2_PITCH(pitch) | @@ -898,12 +902,12 @@ static void r3d_src_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const struct tu_image_view *iview, - VkFormat format, + enum pipe_format format, uint32_t gmem_offset, uint32_t cpp) { uint32_t desc[A6XX_TEX_CONST_DWORDS]; - memcpy(desc, iview->descriptor, sizeof(desc)); + memcpy(desc, iview->view.descriptor, sizeof(desc)); /* patch the format so that depth/stencil get the right format */ desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK; @@ -925,7 +929,7 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd, } static void -r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer) { tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(0), 6); tu_cs_emit(cs, iview->RB_MRT_BUF_INFO); @@ -935,6 +939,11 @@ r3d_dst(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3); tu_cs_image_flag_ref(cs, iview, layer); + /* Use color format from RB_MRT_BUF_INFO. This register is relevant for + * FMT6_NV12_Y. 
+ */ + tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = iview->RB_MRT_BUF_INFO & 0xff)); + tu_cs_emit_regs(cs, A6XX_RB_RENDER_CNTL(.flag_mrts = iview->ubwc_enabled)); } @@ -950,12 +959,12 @@ r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t la } static void -r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch) +r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch) { - struct tu_native_format format = tu6_format_color(vk_format, TILE6_LINEAR); + struct tu_native_format fmt = tu6_format_color(format, TILE6_LINEAR); tu_cs_emit_regs(cs, - A6XX_RB_MRT_BUF_INFO(0, .color_format = format.fmt, .color_swap = format.swap), + A6XX_RB_MRT_BUF_INFO(0, .color_format = fmt.fmt, .color_swap = fmt.swap), A6XX_RB_MRT_PITCH(0, pitch), A6XX_RB_MRT_ARRAY_PITCH(0, 0), A6XX_RB_MRT_BASE(0, .qword = va), @@ -965,14 +974,14 @@ r3d_dst_buffer(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch } static uint8_t -aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask) +aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask) { uint8_t mask = 0xf; assert(aspect_mask); /* note: the only format with partial writing is D24S8, * clear/blit uses the _AS_R8G8B8A8 format to access it */ - if (vk_format == VK_FORMAT_D24_UNORM_S8_UINT) { + if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) { if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) mask = 0x7; if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) @@ -984,18 +993,18 @@ aspect_write_mask(VkFormat vk_format, VkImageAspectFlags aspect_mask) static void r3d_setup(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat vk_format, + enum pipe_format format, VkImageAspectFlags aspect_mask, unsigned blit_param, bool clear, bool ubwc, VkSampleCountFlagBits samples) { - enum a6xx_format format = tu6_base_format(vk_format); + enum a6xx_format fmt = tu6_base_format(format); - if ((vk_format == VK_FORMAT_D24_UNORM_S8_UINT || - vk_format == VK_FORMAT_X8_D24_UNORM_PACK32) && ubwc) { - format = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + if ((format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + format == PIPE_FORMAT_Z24X8_UNORM) && ubwc) { + fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; } if (!cmd->state.pass) { @@ -1033,14 +1042,14 @@ r3d_setup(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_SP_FS_RENDER_COMPONENTS(.rt0 = 0xf)); tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0, - .color_format = format, - .color_sint = vk_format_is_sint(vk_format), - .color_uint = vk_format_is_uint(vk_format))); + .color_format = fmt, + .color_sint = util_format_is_pure_sint(format), + .color_uint = util_format_is_pure_uint(format))); tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, - .component_enable = aspect_write_mask(vk_format, aspect_mask))); - tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(vk_format_is_srgb(vk_format))); - tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(vk_format_is_srgb(vk_format))); + .component_enable = aspect_write_mask(format, aspect_mask))); + tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(format))); + tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(format))); tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0)); tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0)); @@ -1081,22 +1090,22 @@ struct blit_ops { const VkOffset2D *dst, const VkOffset2D *src, const VkExtent2D *extent); - void (*clear_value)(struct tu_cs *cs, VkFormat format, const VkClearValue *val); + void (*clear_value)(struct tu_cs *cs, enum pipe_format format, const VkClearValue *val); void (*src)( struct tu_cmd_buffer 
*cmd, struct tu_cs *cs, - const struct tu_image_view *iview, + const struct fdl6_view *iview, uint32_t layer, VkFilter filter); void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat vk_format, + enum pipe_format format, uint64_t va, uint32_t pitch, uint32_t width, uint32_t height); - void (*dst)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); - void (*dst_buffer)(struct tu_cs *cs, VkFormat vk_format, uint64_t va, uint32_t pitch); + void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); + void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch); void (*setup)(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat vk_format, + enum pipe_format format, VkImageAspectFlags aspect_mask, unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */ bool clear, @@ -1147,76 +1156,53 @@ coords(const struct blit_ops *ops, * compression behavior, so no just returning R8_UINT/R16_UINT/R32_UINT for * everything. */ -static VkFormat -copy_format(VkFormat format, VkImageAspectFlags aspect_mask, bool copy_buffer) +static enum pipe_format +copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask, bool copy_buffer) { - if (vk_format_is_compressed(format)) { - switch (vk_format_get_blocksize(format)) { - case 1: return VK_FORMAT_R8_UINT; - case 2: return VK_FORMAT_R16_UINT; - case 4: return VK_FORMAT_R32_UINT; - case 8: return VK_FORMAT_R32G32_UINT; - case 16:return VK_FORMAT_R32G32B32A32_UINT; + if (vk_format_is_compressed(vk_format)) { + switch (vk_format_get_blocksize(vk_format)) { + case 1: return PIPE_FORMAT_R8_UINT; + case 2: return PIPE_FORMAT_R16_UINT; + case 4: return PIPE_FORMAT_R32_UINT; + case 8: return PIPE_FORMAT_R32G32_UINT; + case 16:return PIPE_FORMAT_R32G32B32A32_UINT; default: unreachable("unhandled format size"); } } - switch (format) { + enum pipe_format format = tu_vk_format_to_pipe_format(vk_format); + /* For SNORM formats, copy them as the equivalent UNORM format. If we treat * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81 * (also -1.0), when we're supposed to be memcpying the bits. See * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion. 
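* For example, R8_SNORM copied as R8_UNORM (the mapping that
* util_format_snorm_to_unorm() returns) reads 0x80 back as 128/255 and
* writes exactly 0x80 again, so the bit pattern survives the round trip.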
*/ - case VK_FORMAT_R8_SNORM: - return VK_FORMAT_R8_UNORM; - case VK_FORMAT_R8G8_SNORM: - return VK_FORMAT_R8G8_UNORM; - case VK_FORMAT_R8G8B8_SNORM: - return VK_FORMAT_R8G8B8_UNORM; - case VK_FORMAT_B8G8R8_SNORM: - return VK_FORMAT_B8G8R8_UNORM; - case VK_FORMAT_R8G8B8A8_SNORM: - return VK_FORMAT_R8G8B8A8_UNORM; - case VK_FORMAT_B8G8R8A8_SNORM: - return VK_FORMAT_B8G8R8A8_UNORM; - case VK_FORMAT_A8B8G8R8_SNORM_PACK32: - return VK_FORMAT_A8B8G8R8_UNORM_PACK32; - case VK_FORMAT_A2R10G10B10_SNORM_PACK32: - return VK_FORMAT_A2R10G10B10_UNORM_PACK32; - case VK_FORMAT_A2B10G10R10_SNORM_PACK32: - return VK_FORMAT_A2B10G10R10_UNORM_PACK32; - case VK_FORMAT_R16_SNORM: - return VK_FORMAT_R16_UNORM; - case VK_FORMAT_R16G16_SNORM: - return VK_FORMAT_R16G16_UNORM; - case VK_FORMAT_R16G16B16_SNORM: - return VK_FORMAT_R16G16B16_UNORM; - case VK_FORMAT_R16G16B16A16_SNORM: - return VK_FORMAT_R16G16B16A16_UNORM; + format = util_format_snorm_to_unorm(format); - case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: - return VK_FORMAT_R32_UINT; + switch (format) { + case PIPE_FORMAT_R9G9B9E5_FLOAT: + return PIPE_FORMAT_R32_UINT; - case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + case PIPE_FORMAT_R8_G8B8_420_UNORM: if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT) - return VK_FORMAT_R8G8_UNORM; + return PIPE_FORMAT_R8G8_UNORM; else - return VK_FORMAT_R8_UNORM; - case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - return VK_FORMAT_R8_UNORM; + return PIPE_FORMAT_Y8_UNORM; + case PIPE_FORMAT_R8_G8_B8_420_UNORM: + return PIPE_FORMAT_R8_UNORM; - case VK_FORMAT_D24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT && copy_buffer) - return VK_FORMAT_R8_UNORM; + return PIPE_FORMAT_R8_UNORM; else return format; - case VK_FORMAT_D32_SFLOAT_S8_UINT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) - return VK_FORMAT_S8_UINT; + return PIPE_FORMAT_S8_UINT; assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT); - return VK_FORMAT_D32_SFLOAT; + return PIPE_FORMAT_Z32_FLOAT; default: return format; @@ -1231,10 +1217,10 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd, { const struct blit_ops *ops = &r2d_ops; - ops->setup(cmd, cs, VK_FORMAT_D16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false, + ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false, VK_SAMPLE_COUNT_1_BIT); - ops->clear_value(cs, VK_FORMAT_D16_UNORM, value); - ops->dst_buffer(cs, VK_FORMAT_D16_UNORM, + ops->clear_value(cs, PIPE_FORMAT_Z16_UNORM, value); + ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM, image->bo->iova + image->bo_offset + image->lrz_offset, image->lrz_pitch * 2); ops->coords(cs, &(VkOffset2D) {}, NULL, &(VkExtent2D) {image->lrz_pitch, image->lrz_height}); @@ -1243,9 +1229,9 @@ tu6_clear_lrz(struct tu_cmd_buffer *cmd, } static void -tu_image_view_copy_blit(struct tu_image_view *iview, +tu_image_view_copy_blit(struct fdl6_view *iview, struct tu_image *image, - VkFormat format, + enum pipe_format format, const VkImageSubresourceLayers *subres, uint32_t layer, bool stencil_read, @@ -1254,46 +1240,51 @@ tu_image_view_copy_blit(struct tu_image_view *iview, VkImageAspectFlags aspect_mask = subres->aspectMask; /* always use the AS_R8G8B8A8 format for these */ - if (format == VK_FORMAT_D24_UNORM_S8_UINT || - format == VK_FORMAT_X8_D24_UNORM_PACK32) { + if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + format == PIPE_FORMAT_Z24X8_UNORM) { aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT; } - tu_image_view_init(iview, &(VkImageViewCreateInfo) { - .image = tu_image_to_handle(image), - 
.viewType = z_scale ? VK_IMAGE_VIEW_TYPE_3D : VK_IMAGE_VIEW_TYPE_2D, - .format = format, - /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */ - .components.r = stencil_read ? VK_COMPONENT_SWIZZLE_A : VK_COMPONENT_SWIZZLE_R, - .subresourceRange = { - .aspectMask = aspect_mask, - .baseMipLevel = subres->mipLevel, - .levelCount = 1, - .baseArrayLayer = subres->baseArrayLayer + layer, - .layerCount = 1, + const struct fdl_layout *layout = + &image->layout[tu6_plane_index(image->vk_format, aspect_mask)]; + + fdl6_view_init(iview, &layout, &(struct fdl_view_args) { + .iova = image->bo->iova + image->bo_offset, + .base_array_layer = subres->baseArrayLayer + layer, + .layer_count = 1, + .base_miplevel = subres->mipLevel, + .level_count = 1, + .format = tu_format_for_aspect(format, aspect_mask), + .swiz = { + /* image_to_buffer from d24s8 with stencil aspect mask writes out to r8 */ + stencil_read ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W }, + .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D, }, false); } static void -tu_image_view_copy(struct tu_image_view *iview, +tu_image_view_copy(struct fdl6_view *iview, struct tu_image *image, - VkFormat format, + enum pipe_format format, const VkImageSubresourceLayers *subres, uint32_t layer, bool stencil_read) { - format = copy_format(format, subres->aspectMask, false); tu_image_view_copy_blit(iview, image, format, subres, layer, stencil_read, false); } static void -tu_image_view_blit(struct tu_image_view *iview, +tu_image_view_blit(struct fdl6_view *iview, struct tu_image *image, const VkImageSubresourceLayers *subres, uint32_t layer) { - tu_image_view_copy_blit(iview, image, image->vk_format, subres, layer, false, false); + enum pipe_format format = + tu6_plane_format(image->vk_format, tu6_plane_index(image->vk_format, + subres->aspectMask)); + tu_image_view_copy_blit(iview, image, format, subres, layer, false, false); } static void @@ -1370,7 +1361,9 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, unreachable("unexpected D32_S8 aspect mask in blit_image"); } - ops->setup(cmd, cs, format, info->dstSubresource.aspectMask, + trace_start_blit(&cmd->trace, cs); + + ops->setup(cmd, cs, tu_vk_format_to_pipe_format(format), info->dstSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc, dst_image->layout[0].nr_samples); @@ -1394,12 +1387,16 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1)); } - struct tu_image_view dst, src; + struct fdl6_view dst, src; tu_image_view_blit(&dst, dst_image, &info->dstSubresource, MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z)); if (z_scale) { - tu_image_view_copy_blit(&src, src_image, src_image->vk_format, + enum pipe_format src_format = + tu6_plane_format(src_image->vk_format, + tu6_plane_index(src_image->vk_format, + info->srcSubresource.aspectMask)); + tu_image_view_copy_blit(&src, src_image, src_format, &info->srcSubresource, 0, false, true); ops->src(cmd, cs, &src, 0, filter); } else { @@ -1418,6 +1415,12 @@ tu6_blit_image(struct tu_cmd_buffer *cmd, } ops->teardown(cmd, cs); + + trace_end_blit(&cmd->trace, cs, + ops == &r3d_ops, + src_image->vk_format, + dst_image->vk_format, + layers); } VKAPI_ATTR void VKAPI_CALL @@ -1487,8 +1490,10 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, { struct tu_cs *cs = &cmd->cs; uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); - VkFormat src_format = + enum pipe_format src_format = 
copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, true); + enum pipe_format dst_format = + copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false); const struct blit_ops *ops = &r2d_ops; /* special case for buffer to stencil */ @@ -1497,9 +1502,9 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, ops = &r3d_ops; } - /* TODO: G8_B8R8_2PLANE_420_UNORM Y plane has different hardware format, - * which matters for UBWC. buffer_to_image/etc can fail because of this - */ + /* note: could use "R8_UNORM" when no UBWC */ + if (src_format == PIPE_FORMAT_Y8_UNORM) + ops = &r3d_ops; VkOffset3D offset = info->imageOffset; VkExtent3D extent = info->imageExtent; @@ -1508,16 +1513,15 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, copy_compressed(dst_image->vk_format, &offset, &extent, &src_width, &src_height); - uint32_t pitch = src_width * vk_format_get_blocksize(src_format); + uint32_t pitch = src_width * util_format_get_blocksize(src_format); uint32_t layer_size = src_height * pitch; - ops->setup(cmd, cs, - copy_format(dst_image->vk_format, info->imageSubresource.aspectMask, false), + ops->setup(cmd, cs, dst_format, info->imageSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc, dst_image->layout[0].nr_samples); - struct tu_image_view dst; - tu_image_view_copy(&dst, dst_image, dst_image->vk_format, &info->imageSubresource, offset.z, false); + struct fdl6_view dst; + tu_image_view_copy(&dst, dst_image, dst_format, &info->imageSubresource, offset.z, false); for (uint32_t i = 0; i < layers; i++) { ops->dst(cs, &dst, i); @@ -1525,7 +1529,7 @@ tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd, uint64_t src_va = tu_buffer_iova(src_buffer) + info->bufferOffset + layer_size * i; if ((src_va & 63) || (pitch & 63)) { for (uint32_t y = 0; y < extent.height; y++) { - uint32_t x = (src_va & 63) / vk_format_get_blocksize(src_format); + uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format); ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch, x + extent.width, 1); ops->coords(cs, &(VkOffset2D){offset.x, offset.y + y}, &(VkOffset2D){x}, @@ -1567,16 +1571,23 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, { struct tu_cs *cs = &cmd->cs; uint32_t layers = MAX2(info->imageExtent.depth, info->imageSubresource.layerCount); - VkFormat dst_format = + enum pipe_format dst_format = copy_format(src_image->vk_format, info->imageSubresource.aspectMask, true); + enum pipe_format src_format = + copy_format(src_image->vk_format, info->imageSubresource.aspectMask, false); + const struct blit_ops *ops = &r2d_ops; bool stencil_read = false; if (src_image->vk_format == VK_FORMAT_D24_UNORM_S8_UINT && info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) { + ops = &r3d_ops; stencil_read = true; } - const struct blit_ops *ops = stencil_read ? 
&r3d_ops : &r2d_ops; + /* note: could use "R8_UNORM" when no UBWC */ + if (dst_format == PIPE_FORMAT_Y8_UNORM) + ops = &r3d_ops; + VkOffset3D offset = info->imageOffset; VkExtent3D extent = info->imageExtent; uint32_t dst_width = info->bufferRowLength ?: extent.width; @@ -1584,14 +1595,14 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, copy_compressed(src_image->vk_format, &offset, &extent, &dst_width, &dst_height); - uint32_t pitch = dst_width * vk_format_get_blocksize(dst_format); + uint32_t pitch = dst_width * util_format_get_blocksize(dst_format); uint32_t layer_size = pitch * dst_height; ops->setup(cmd, cs, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, VK_SAMPLE_COUNT_1_BIT); - struct tu_image_view src; - tu_image_view_copy(&src, src_image, src_image->vk_format, &info->imageSubresource, offset.z, stencil_read); + struct fdl6_view src; + tu_image_view_copy(&src, src_image, src_format, &info->imageSubresource, offset.z, stencil_read); for (uint32_t i = 0; i < layers; i++) { ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST); @@ -1599,7 +1610,7 @@ tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd, uint64_t dst_va = tu_buffer_iova(dst_buffer) + info->bufferOffset + layer_size * i; if ((dst_va & 63) || (pitch & 63)) { for (uint32_t y = 0; y < extent.height; y++) { - uint32_t x = (dst_va & 63) / vk_format_get_blocksize(dst_format); + uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format); ops->dst_buffer(cs, dst_format, dst_va & ~63, 0); ops->coords(cs, &(VkOffset2D) {x}, &(VkOffset2D){offset.x, offset.y + y}, &(VkExtent2D) {extent.width, 1}); @@ -1643,7 +1654,7 @@ tu_CmdCopyImageToBuffer(VkCommandBuffer commandBuffer, */ static bool -is_swapped_format(VkFormat format) +is_swapped_format(enum pipe_format format) { struct tu_native_format linear = tu6_format_texture(format, TILE6_LINEAR); struct tu_native_format tiled = tu6_format_texture(format, TILE6_3); @@ -1673,7 +1684,7 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, if (dst_image->layout[0].nr_samples > 1) ops = &r3d_ops; - VkFormat format = VK_FORMAT_UNDEFINED; + enum pipe_format format = PIPE_FORMAT_NONE; VkOffset3D src_offset = info->srcOffset; VkOffset3D dst_offset = info->dstOffset; VkExtent3D extent = info->extent; @@ -1698,8 +1709,13 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, copy_compressed(src_image->vk_format, &src_offset, &extent, NULL, NULL); copy_compressed(dst_image->vk_format, &dst_offset, NULL, NULL, NULL); - VkFormat dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false); - VkFormat src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false); + enum pipe_format dst_format = copy_format(dst_image->vk_format, info->dstSubresource.aspectMask, false); + enum pipe_format src_format = copy_format(src_image->vk_format, info->srcSubresource.aspectMask, false); + + /* note: could use "R8_UNORM" when no UBWC */ + if (dst_format == PIPE_FORMAT_Y8_UNORM || + src_format == PIPE_FORMAT_Y8_UNORM) + ops = &r3d_ops; bool use_staging_blit = false; @@ -1737,53 +1753,50 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, use_staging_blit = true; } - struct tu_image_view dst, src; + struct fdl6_view dst, src; if (use_staging_blit) { tu_image_view_copy(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z, false); tu_image_view_copy(&src, src_image, src_format, &info->srcSubresource, src_offset.z, false); - struct tu_image staging_image = { - .vk_format = src_format, - .level_count = 1, - .layer_count = 
info->srcSubresource.layerCount, - .bo_offset = 0, - }; - - VkImageSubresourceLayers staging_subresource = { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .mipLevel = 0, - .baseArrayLayer = 0, - .layerCount = info->srcSubresource.layerCount, - }; - + struct fdl_layout staging_layout = { 0 }; VkOffset3D staging_offset = { 0 }; - staging_image.layout[0].tile_mode = TILE6_LINEAR; - staging_image.layout[0].ubwc = false; + staging_layout.tile_mode = TILE6_LINEAR; + staging_layout.ubwc = false; - fdl6_layout(&staging_image.layout[0], - vk_format_to_pipe_format(staging_image.vk_format), + fdl6_layout(&staging_layout, + src_format, src_image->layout[0].nr_samples, extent.width, extent.height, extent.depth, - staging_image.level_count, - staging_image.layer_count, + 1, + info->srcSubresource.layerCount, extent.depth > 1, NULL); + struct tu_bo *staging_bo; VkResult result = tu_get_scratch_bo(cmd->device, - staging_image.layout[0].size, - &staging_image.bo); + staging_layout.size, + &staging_bo); if (result != VK_SUCCESS) { cmd->record_result = result; return; } - struct tu_image_view staging; - tu_image_view_copy(&staging, &staging_image, src_format, - &staging_subresource, 0, false); + struct fdl6_view staging; + const struct fdl_layout *staging_layout_ptr = &staging_layout; + fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) { + .iova = staging_bo->iova, + .base_array_layer = 0, + .layer_count = 1, + .base_miplevel = 0, + .level_count = info->srcSubresource.layerCount, + .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT), + .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W }, + .type = FDL_VIEW_TYPE_2D, + }, false); ops->setup(cmd, cs, src_format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, dst_image->layout[0].nr_samples); @@ -1800,9 +1813,18 @@ tu_copy_image_to_image(struct tu_cmd_buffer *cmd, */ tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE); + tu_cs_emit_wfi(cs); - tu_image_view_copy(&staging, &staging_image, dst_format, - &staging_subresource, 0, false); + fdl6_view_init(&staging, &staging_layout_ptr, &(struct fdl_view_args) { + .iova = staging_bo->iova, + .base_array_layer = 0, + .layer_count = 1, + .base_miplevel = 0, + .level_count = info->srcSubresource.layerCount, + .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT), + .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W }, + .type = FDL_VIEW_TYPE_2D, + }, false); ops->setup(cmd, cs, dst_format, info->dstSubresource.aspectMask, 0, false, dst_image->layout[0].ubwc, @@ -1870,7 +1892,7 @@ copy_buffer(struct tu_cmd_buffer *cmd, { const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; - VkFormat format = block_size == 4 ? VK_FORMAT_R32_UINT : VK_FORMAT_R8_UNORM; + enum pipe_format format = block_size == 4 ? 
PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM; uint64_t blocks = size / block_size; ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false, @@ -1952,15 +1974,15 @@ tu_CmdFillBuffer(VkCommandBuffer commandBuffer, uint64_t dst_va = tu_buffer_iova(buffer) + dstOffset; uint32_t blocks = fillSize / 4; - ops->setup(cmd, cs, VK_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false, + ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false, VK_SAMPLE_COUNT_1_BIT); - ops->clear_value(cs, VK_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}}); + ops->clear_value(cs, PIPE_FORMAT_R32_UINT, &(VkClearValue){.color = {.uint32[0] = data}}); while (blocks) { uint32_t dst_x = (dst_va & 63) / 4; uint32_t width = MIN2(blocks, 0x4000 - dst_x); - ops->dst_buffer(cs, VK_FORMAT_R32_UINT, dst_va & ~63, 0); + ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0); ops->coords(cs, &(VkOffset2D) {dst_x}, NULL, &(VkExtent2D) {width, 1}); ops->run(cmd, cs); @@ -1986,8 +2008,9 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer, const struct blit_ops *ops = &r2d_ops; struct tu_cs *cs = &cmd->cs; - ops->setup(cmd, cs, dst_image->vk_format, VK_IMAGE_ASPECT_COLOR_BIT, - 0, false, dst_image->layout[0].ubwc, VK_SAMPLE_COUNT_1_BIT); + ops->setup(cmd, cs, tu_vk_format_to_pipe_format(dst_image->vk_format), + VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc, + VK_SAMPLE_COUNT_1_BIT); for (uint32_t i = 0; i < regionCount; ++i) { const VkImageResolve *info = &pRegions[i]; @@ -1998,7 +2021,7 @@ tu_CmdResolveImage(VkCommandBuffer commandBuffer, coords(ops, cs, &info->dstOffset, &info->srcOffset, &info->extent); - struct tu_image_view dst, src; + struct fdl6_view dst, src; tu_image_view_blit(&dst, dst_image, &info->dstSubresource, info->dstOffset.z); tu_image_view_blit(&src, src_image, &info->srcSubresource, info->srcOffset.z); @@ -2022,8 +2045,8 @@ static void resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, VkFormat format, - struct tu_image_view *src, - struct tu_image_view *dst, + const struct tu_image_view *src, + const struct tu_image_view *dst, uint32_t layer_mask, uint32_t layers, const VkRect2D *rect, @@ -2031,8 +2054,11 @@ resolve_sysmem(struct tu_cmd_buffer *cmd, { const struct blit_ops *ops = &r2d_ops; - ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, - 0, false, dst->ubwc_enabled, VK_SAMPLE_COUNT_1_BIT); + trace_start_sysmem_resolve(&cmd->trace, cs); + + ops->setup(cmd, cs, tu_vk_format_to_pipe_format(format), + VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled, + VK_SAMPLE_COUNT_1_BIT); ops->coords(cs, &rect->offset, &rect->offset, &rect->extent); for_each_layer(i, layer_mask, layers) { @@ -2040,20 +2066,22 @@ resolve_sysmem(struct tu_cmd_buffer *cmd, r2d_src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST); r2d_dst_stencil(cs, dst, i); } else { - ops->src(cmd, cs, src, i, VK_FILTER_NEAREST); - ops->dst(cs, dst, i); + ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST); + ops->dst(cs, &dst->view, i); } ops->run(cmd, cs); } ops->teardown(cmd, cs); + + trace_end_sysmem_resolve(&cmd->trace, cs, format); } void tu_resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_image_view *src, - struct tu_image_view *dst, + const struct tu_image_view *src, + const struct tu_image_view *dst, uint32_t layer_mask, uint32_t layers, const VkRect2D *rect) @@ -2081,9 +2109,14 @@ clear_image(struct tu_cmd_buffer *cmd, uint32_t level_count = tu_get_levelCount(image, range); uint32_t layer_count = tu_get_layerCount(image, 
range); struct tu_cs *cs = &cmd->cs; - VkFormat format = image->vk_format; - if (format == VK_FORMAT_D32_SFLOAT_S8_UINT || format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - format = copy_format(format, aspect_mask, false); + enum pipe_format format; + if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) { + format = PIPE_FORMAT_R32_UINT; + } else { + format = tu6_plane_format(image->vk_format, + tu6_plane_index(image->vk_format, + aspect_mask)); + } if (image->layout[0].depth0 > 1) { assert(layer_count == 1); @@ -2095,7 +2128,7 @@ clear_image(struct tu_cmd_buffer *cmd, ops->setup(cmd, cs, format, aspect_mask, 0, true, image->layout[0].ubwc, image->layout[0].nr_samples); if (image->vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) - ops->clear_value(cs, VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, clear_value); + ops->clear_value(cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value); else ops->clear_value(cs, format, clear_value); @@ -2108,7 +2141,7 @@ clear_image(struct tu_cmd_buffer *cmd, u_minify(image->layout[0].height0, range->baseMipLevel + j) }); - struct tu_image_view dst; + struct fdl6_view dst; tu_image_view_copy_blit(&dst, image, format, &(VkImageSubresourceLayers) { .aspectMask = aspect_mask, .mipLevel = range->baseMipLevel + j, @@ -2183,6 +2216,8 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, bool z_clear = false; bool s_clear = false; + trace_start_sysmem_clear_all(&cmd->trace, cs); + for (uint32_t i = 0; i < attachment_count; i++) { uint32_t a; if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { @@ -2313,24 +2348,27 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, r3d_run(cmd, cs); } } + + trace_end_sysmem_clear_all(&cmd->trace, + cs, mrt_count, rect_count); } static void -pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_value[4]) +pack_gmem_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4]) { switch (format) { - case VK_FORMAT_X8_D24_UNORM_PACK32: - case VK_FORMAT_D24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) | val->depthStencil.stencil << 24; return; - case VK_FORMAT_D16_UNORM: + case PIPE_FORMAT_Z16_UNORM: clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16); return; - case VK_FORMAT_D32_SFLOAT: + case PIPE_FORMAT_Z32_FLOAT: clear_value[0] = fui(val->depthStencil.depth); return; - case VK_FORMAT_S8_UINT: + case PIPE_FORMAT_S8_UINT: clear_value[0] = val->depthStencil.stencil; return; default: @@ -2339,33 +2377,33 @@ pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_v float tmp[4]; memcpy(tmp, val->color.float32, 4 * sizeof(float)); - if (vk_format_is_srgb(format)) { + if (util_format_is_srgb(format)) { for (int i = 0; i < 3; i++) tmp[i] = util_format_linear_to_srgb_float(tmp[i]); } #define PACK_F(type) util_format_##type##_pack_rgba_float \ ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1) - switch (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) { + switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) { case 4: PACK_F(r4g4b4a4_unorm); break; case 5: - if (vk_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6) + if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6) PACK_F(r5g6b5_unorm); else PACK_F(r5g5b5a1_unorm); break; case 8: - if (vk_format_is_snorm(format)) + if (util_format_is_snorm(format)) 
PACK_F(r8g8b8a8_snorm); - else if (vk_format_is_unorm(format)) + else if (util_format_is_unorm(format)) PACK_F(r8g8b8a8_unorm); else pack_int8(clear_value, val->color.uint32); break; case 10: - if (vk_format_is_int(format)) + if (util_format_is_pure_integer(format)) pack_int10_2(clear_value, val->color.uint32); else PACK_F(r10g10b10a2_unorm); @@ -2374,11 +2412,11 @@ pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_v clear_value[0] = float3_to_r11g11b10f(val->color.float32); break; case 16: - if (vk_format_is_snorm(format)) + if (util_format_is_snorm(format)) PACK_F(r16g16b16a16_snorm); - else if (vk_format_is_unorm(format)) + else if (util_format_is_unorm(format)) PACK_F(r16g16b16a16_unorm); - else if (vk_format_is_float(format)) + else if (util_format_is_float(format)) PACK_F(r16g16b16a16_float); else pack_int16(clear_value, val->color.uint32); @@ -2395,7 +2433,7 @@ pack_gmem_clear_value(const VkClearValue *val, VkFormat format, uint32_t clear_v static void clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat format, + enum pipe_format format, uint8_t clear_mask, uint32_t gmem_offset, const VkClearValue *value) @@ -2430,15 +2468,20 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd, const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[attachment]; + trace_start_gmem_clear(&cmd->trace, cs); + + enum pipe_format format = tu_vk_format_to_pipe_format(att->format); if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) - clear_gmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, 0xf, att->gmem_offset, value); + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf, att->gmem_offset, value); if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) - clear_gmem_attachment(cmd, cs, VK_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value); + clear_gmem_attachment(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf, att->gmem_offset_stencil, value); return; } - clear_gmem_attachment(cmd, cs, att->format, aspect_write_mask(att->format, mask), att->gmem_offset, value); + clear_gmem_attachment(cmd, cs, format, aspect_write_mask(format, mask), att->gmem_offset, value); + + trace_end_gmem_clear(&cmd->trace, cs, att->format, att->samples); } static void @@ -2528,20 +2571,23 @@ tu_CmdClearAttachments(VkCommandBuffer commandBuffer, static void clear_sysmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - VkFormat format, + VkFormat vk_format, VkImageAspectFlags clear_mask, const VkRenderPassBeginInfo *info, uint32_t a, bool separate_stencil) { + enum pipe_format format = tu_vk_format_to_pipe_format(vk_format); const struct tu_framebuffer *fb = cmd->state.framebuffer; - const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_image_view *iview = cmd->state.attachments[a]; const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views; const struct blit_ops *ops = &r2d_ops; if (cmd->state.pass->attachments[a].samples > 1) ops = &r3d_ops; - ops->setup(cmd, cs, format, clear_mask, 0, true, iview->ubwc_enabled, + trace_start_sysmem_clear(&cmd->trace, cs); + + ops->setup(cmd, cs, format, clear_mask, 0, true, iview->view.ubwc_enabled, cmd->state.pass->attachments[a].samples); ops->coords(cs, &info->renderArea.offset, NULL, &info->renderArea.extent); ops->clear_value(cs, format, &info->pClearValues[a]); @@ -2553,12 +2599,16 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd, else r2d_dst_stencil(cs, iview, i); } else { - ops->dst(cs, iview, i); + ops->dst(cs, &iview->view, i); } 
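/* One run() per layer: the dst()/r2d_dst_stencil() call above rebinds the
 * destination to the current array layer before each blit is emitted. */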
ops->run(cmd, cs); } ops->teardown(cmd, cs); + + trace_end_sysmem_clear(&cmd->trace, cs, + vk_format, ops == &r3d_ops, + cmd->state.pass->attachments[a].samples); } void @@ -2573,9 +2623,6 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, if (!attachment->clear_mask) return; - /* Wait for any flushes at the beginning of the renderpass to complete */ - tu_cs_emit_wfi(cs); - if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) { if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) { clear_sysmem_attachment(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT, @@ -2606,6 +2653,9 @@ tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS); tu6_emit_event_write(cmd, cs, PC_CCU_INVALIDATE_COLOR); } + + if (cmd->device->physical_device->info->a6xx.has_ccu_flush_bug) + tu_cs_emit_wfi(cs); } void @@ -2652,11 +2702,11 @@ tu_emit_blit(struct tu_cmd_buffer *cmd, tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset_stencil)); } else { - tu_cs_emit(cs, iview->RB_BLIT_DST_INFO); - tu_cs_image_ref_2d(cs, iview, 0, false); + tu_cs_emit(cs, iview->view.RB_BLIT_DST_INFO); + tu_cs_image_ref_2d(cs, &iview->view, 0, false); tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3); - tu_cs_image_flag_ref(cs, iview, 0); + tu_cs_image_flag_ref(cs, &iview->view, 0); tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(attachment->gmem_offset)); @@ -2706,43 +2756,46 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t a, bool force_load) { - const struct tu_image_view *iview = - cmd->state.framebuffer->attachments[a].attachment; + const struct tu_image_view *iview = cmd->state.attachments[a]; const struct tu_render_pass_attachment *attachment = &cmd->state.pass->attachments[a]; + trace_start_gmem_load(&cmd->trace, cs); + if (attachment->load || force_load) tu_emit_blit(cmd, cs, iview, attachment, false, false); if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load)) tu_emit_blit(cmd, cs, iview, attachment, false, true); + + trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load); } static void store_cp_blit(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_image_view *iview, + const struct tu_image_view *iview, uint32_t samples, bool separate_stencil, - VkFormat format, + enum pipe_format format, uint32_t gmem_offset, uint32_t cpp) { r2d_setup_common(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, - iview->ubwc_enabled, true); + iview->view.ubwc_enabled, true); if (separate_stencil) r2d_dst_stencil(cs, iview, 0); else - r2d_dst(cs, iview, 0); + r2d_dst(cs, &iview->view, 0); tu_cs_emit_regs(cs, A6XX_SP_PS_2D_SRC_INFO( .color_format = tu6_format_texture(format, TILE6_2).fmt, .tile_mode = TILE6_2, - .srgb = vk_format_is_srgb(format), + .srgb = util_format_is_srgb(format), .samples = tu_msaa_samples(samples), - .samples_average = !vk_format_is_int(format) && - !vk_format_is_depth_or_stencil(format), + .samples_average = !util_format_is_pure_integer(format) && + !util_format_is_depth_or_stencil(format), .unk20 = 1, .unk22 = 1), /* note: src size does not matter when not scaling */ @@ -2772,20 +2825,20 @@ store_3d_blit(struct tu_cmd_buffer *cmd, const struct tu_image_view *iview, uint32_t dst_samples, bool separate_stencil, - VkFormat format, + enum pipe_format format, const VkRect2D *render_area, uint32_t gmem_offset, uint32_t cpp) { r3d_setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, - iview->ubwc_enabled, dst_samples); + iview->view.ubwc_enabled, dst_samples); 
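/* Multisampled destinations take this draw-based path (see the
 * dst->samples > 1 check in tu_store_gmem_attachment below); r3d_setup
 * above configured the render target, and the calls that follow bind the
 * coords, the destination view and the GMEM source for the store. */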
r3d_coords(cs, &render_area->offset, &render_area->offset, &render_area->extent); if (separate_stencil) r3d_dst_stencil(cs, iview, 0); else - r3d_dst(cs, iview, 0); + r3d_dst(cs, &iview->view, 0); r3d_src_gmem(cmd, cs, iview, format, gmem_offset, cpp); @@ -2811,7 +2864,7 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_physical_device *phys_dev = cmd->device->physical_device; const VkRect2D *render_area = &cmd->state.render_area; struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a]; - struct tu_image_view *iview = cmd->state.framebuffer->attachments[a].attachment; + const struct tu_image_view *iview = cmd->state.attachments[a]; struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a]; if (!dst->store && !dst->store_stencil) @@ -2827,11 +2880,11 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, * required y padding in the layout (except for the last level) */ bool need_y2_align = - y2 != iview->extent.height || iview->need_y2_align; + y2 != iview->view.height || iview->view.need_y2_align; bool unaligned = x1 % phys_dev->info->gmem_align_w || - (x2 % phys_dev->info->gmem_align_w && x2 != iview->extent.width) || + (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) || y1 % phys_dev->info->gmem_align_h || (y2 % phys_dev->info->gmem_align_h && need_y2_align); /* D32_SFLOAT_S8_UINT is quite special format: it has two planes, @@ -2842,18 +2895,22 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, src->format == VK_FORMAT_D32_SFLOAT_S8_UINT && dst->format == VK_FORMAT_S8_UINT; + trace_start_gmem_store(&cmd->trace, cs); + /* use fast path when render area is aligned, except for unsupported resolve cases */ if (!unaligned && (a == gmem_a || blit_can_resolve(dst->format))) { if (dst->store) tu_emit_blit(cmd, cs, iview, src, true, resolve_d32s8_s8); if (dst->store_stencil) tu_emit_blit(cmd, cs, iview, src, true, true); + + trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false); return; } - VkFormat format = src->format; - if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) - format = VK_FORMAT_D32_SFLOAT; + enum pipe_format format = tu_vk_format_to_pipe_format(src->format); + if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + format = PIPE_FORMAT_Z32_FLOAT; if (dst->samples > 1) { /* If we hit this path, we have to disable draw states after every tile @@ -2871,7 +2928,7 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, render_area, src->gmem_offset, src->cpp); } if (dst->store_stencil) { - store_3d_blit(cmd, cs, iview, dst->samples, true, VK_FORMAT_S8_UINT, + store_3d_blit(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT, render_area, src->gmem_offset, src->samples); } } else { @@ -2882,8 +2939,10 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, src->gmem_offset, src->cpp); } if (dst->store_stencil) { - store_cp_blit(cmd, cs, iview, src->samples, true, VK_FORMAT_S8_UINT, + store_cp_blit(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT, src->gmem_offset_stencil, src->samples); } } + + trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned); } diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_cmd_buffer.c b/mesa 3D driver/src/freedreno/vulkan/tu_cmd_buffer.c index e8801b2340..70f1c6943e 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_cmd_buffer.c @@ -35,6 +35,8 @@ #include "tu_cs.h" +#include "tu_tracepoints.h" + void tu6_emit_event_write(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -68,7 +70,7 @@ tu6_emit_flushes(struct 
tu_cmd_buffer *cmd_buffer, enum tu_cmd_flush_bits flushes) { if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_FLUSHALL)) - flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_GPU_INVALIDATE; + flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE; if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_SYNCDRAW)) flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES | @@ -96,7 +98,9 @@ tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer, tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE); if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES) tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); - if (flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) + if ((flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) || + (cmd_buffer->device->physical_device->info->a6xx.has_ccu_flush_bug && + (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_FLUSH_DEPTH)))) tu_cs_emit_wfi(cs); if (flushes & TU_CMD_FLAG_WAIT_FOR_ME) tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); @@ -183,8 +187,6 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass, struct tu_cs *cs) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; - const uint32_t a = subpass->depth_stencil_attachment.attachment; if (a == VK_ATTACHMENT_UNUSED) { tu_cs_emit_regs(cs, @@ -207,21 +209,21 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, return; } - const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_image_view *iview = cmd->state.attachments[a]; const struct tu_render_pass_attachment *attachment = &cmd->state.pass->attachments[a]; enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format); tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value); - tu_cs_image_ref(cs, iview, 0); + tu_cs_image_ref(cs, &iview->view, 0); tu_cs_emit(cs, attachment->gmem_offset); tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3); - tu_cs_image_flag_ref(cs, iview, 0); + tu_cs_image_flag_ref(cs, &iview->view, 0); tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.bo = iview->image->bo, .bo_offset = iview->image->bo_offset + iview->image->lrz_offset), @@ -237,7 +239,7 @@ tu6_emit_zs(struct tu_cmd_buffer *cmd, tu_cs_image_stencil_ref(cs, iview, 0); tu_cs_emit(cs, attachment->gmem_offset_stencil); } else { - tu_cs_image_ref(cs, iview, 0); + tu_cs_image_ref(cs, &iview->view, 0); tu_cs_emit(cs, attachment->gmem_offset); } } else { @@ -258,18 +260,27 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, if (a == VK_ATTACHMENT_UNUSED) continue; - const struct tu_image_view *iview = fb->attachments[a].attachment; + const struct tu_image_view *iview = cmd->state.attachments[a]; tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6); - tu_cs_emit(cs, iview->RB_MRT_BUF_INFO); - tu_cs_image_ref(cs, iview, 0); + tu_cs_emit(cs, iview->view.RB_MRT_BUF_INFO); + tu_cs_image_ref(cs, &iview->view, 0); tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset); tu_cs_emit_regs(cs, - A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG)); + A6XX_SP_FS_MRT_REG(i, .dword = iview->view.SP_FS_MRT_REG)); tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3); - tu_cs_image_flag_ref(cs, iview, 0); + tu_cs_image_flag_ref(cs, &iview->view, 0); + } + + if (subpass->color_count) { + uint32_t a = subpass->color_attachments[0].attachment; + if (a != VK_ATTACHMENT_UNUSED) { + const struct tu_image_view *iview = cmd->state.attachments[a]; + enum a6xx_format fmt = iview->view.RB_MRT_BUF_INFO & 0xff; + 
tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt)); + } } tu_cs_emit_regs(cs, @@ -302,10 +313,11 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, } void -tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples) +tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples, + enum a5xx_line_mode line_mode) { const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples); - bool msaa_disable = samples == MSAA_ONE; + bool msaa_disable = (samples == MSAA_ONE) || (line_mode == BRESENHAM); tu_cs_emit_regs(cs, A6XX_SP_TP_RAS_MSAA_CNTL(samples), @@ -352,7 +364,6 @@ tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool binning) { - const struct tu_framebuffer *fb = cmd->state.framebuffer; /* doesn't RB_RENDER_CNTL set differently for binning pass: */ bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write; uint32_t cntl = 0; @@ -368,8 +379,8 @@ tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, if (a == VK_ATTACHMENT_UNUSED) continue; - const struct tu_image_view *iview = fb->attachments[a].attachment; - if (iview->ubwc_enabled) + const struct tu_image_view *iview = cmd->state.attachments[a]; + if (iview->view.ubwc_enabled) mrts_ubwc_enable |= 1 << i; } @@ -377,8 +388,8 @@ tu6_emit_render_cntl(struct tu_cmd_buffer *cmd, const uint32_t a = subpass->depth_stencil_attachment.attachment; if (a != VK_ATTACHMENT_UNUSED) { - const struct tu_image_view *iview = fb->attachments[a].attachment; - if (iview->ubwc_enabled) + const struct tu_image_view *iview = cmd->state.attachments[a]; + if (iview->view.ubwc_enabled) cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH; } @@ -605,8 +616,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, const uint32_t x1 = fb->tile0.width * tx; const uint32_t y1 = fb->tile0.height * ty; - const uint32_t x2 = x1 + fb->tile0.width - 1; - const uint32_t y2 = y1 + fb->tile0.height - 1; + const uint32_t x2 = MIN2(x1 + fb->tile0.width - 1, MAX_VIEWPORT_SIZE - 1); + const uint32_t y2 = MIN2(y1 + fb->tile0.height - 1, MAX_VIEWPORT_SIZE - 1); tu6_emit_window_scissor(cs, x1, y1, x2, y2); tu6_emit_window_offset(cs, x1, y1); @@ -647,8 +658,8 @@ tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd, uint32_t gmem_a) { const struct tu_framebuffer *fb = cmd->state.framebuffer; - struct tu_image_view *dst = fb->attachments[a].attachment; - struct tu_image_view *src = fb->attachments[gmem_a].attachment; + const struct tu_image_view *dst = cmd->state.attachments[a]; + const struct tu_image_view *src = cmd->state.attachments[gmem_a]; tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area); } @@ -779,7 +790,8 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0); tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f); tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44); - tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B600, 0x100000); + tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL, + phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL); tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0); @@ -985,9 +997,13 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_regs(cs, A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0)); + trace_start_binning_ib(&cmd->trace, cs); + /* emit IB to binning drawcmds: */ tu_cs_emit_call(cs, &cmd->draw_cs); + trace_end_binning_ib(&cmd->trace, cs); + /* switching from binning pass to GMEM pass will cause a switch from * PROGRAM_BINNING to 
PROGRAM, which invalidates const state (XS_CONST states) * so make sure these states are re-emitted @@ -1059,15 +1075,14 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, if (a == VK_ATTACHMENT_UNUSED) continue; - struct tu_image_view *iview = - cmd->state.framebuffer->attachments[a].attachment; + const struct tu_image_view *iview = cmd->state.attachments[a]; const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i]; uint32_t gmem_offset = att->gmem_offset; uint32_t cpp = att->cpp; - memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4); + memcpy(dst, iview->view.descriptor, A6XX_TEX_CONST_DWORDS * 4); if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) { /* note this works because spec says fb and input attachments @@ -1298,6 +1313,15 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_call(cs, &cmd->tile_store_cs); + if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) { + tu_cs_emit_wfi(cs); + tu_cs_emit_pkt7(&cmd->cs, CP_WAIT_FOR_ME, 0); + u_trace_clone_append(cmd->trace_renderpass_start, + cmd->trace_renderpass_end, + &cmd->trace, + cs, tu_copy_timestamp_buffer); + } + tu_cs_sanity_check(cs); } @@ -1334,13 +1358,22 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd) for (uint32_t ty = ty1; ty < ty2; ty++) { for (uint32_t tx = tx1; tx < tx2; tx++, slot++) { tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot); + + trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs); tu6_render_tile(cmd, &cmd->cs); + trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs); } } } } tu6_tile_render_end(cmd, &cmd->cs); + + trace_end_render_pass(&cmd->trace, &cmd->cs, fb); + + if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) + u_trace_disable_event_range(cmd->trace_renderpass_start, + cmd->trace_renderpass_end); } static void @@ -1348,9 +1381,15 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd) { tu6_sysmem_render_begin(cmd, &cmd->cs); + trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs); + tu_cs_emit_call(&cmd->cs, &cmd->draw_cs); + trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs); + tu6_sysmem_render_end(cmd, &cmd->cs); + + trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer); } static VkResult @@ -1361,10 +1400,17 @@ tu_create_cmd_buffer(struct tu_device *device, { struct tu_cmd_buffer *cmd_buffer; - cmd_buffer = vk_object_zalloc(&device->vk, NULL, sizeof(*cmd_buffer), - VK_OBJECT_TYPE_COMMAND_BUFFER); + cmd_buffer = vk_zalloc2(&device->vk.alloc, NULL, sizeof(*cmd_buffer), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (cmd_buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, NULL, cmd_buffer); + return result; + } cmd_buffer->device = device; cmd_buffer->pool = pool; @@ -1382,6 +1428,8 @@ tu_create_cmd_buffer(struct tu_device *device, cmd_buffer->queue_family_index = TU_QUEUE_GENERAL; } + u_trace_init(&cmd_buffer->trace, &device->trace_context); + tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096); tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096); tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048); @@ -1404,12 +1452,18 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) tu_cs_finish(&cmd_buffer->draw_epilogue_cs); tu_cs_finish(&cmd_buffer->sub_cs); - 
vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer); + u_trace_fini(&cmd_buffer->trace); + + vk_command_buffer_finish(&cmd_buffer->vk); + vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc, + cmd_buffer); } static VkResult tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) { + vk_command_buffer_reset(&cmd_buffer->vk); + cmd_buffer->record_result = VK_SUCCESS; tu_cs_reset(&cmd_buffer->cs); @@ -1421,8 +1475,12 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets)); memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set)); + cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET; } + u_trace_fini(&cmd_buffer->trace); + u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context); + cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL; return cmd_buffer->record_result; @@ -1450,7 +1508,11 @@ tu_AllocateCommandBuffers(VkDevice _device, result = tu_reset_cmd_buffer(cmd_buffer); cmd_buffer->level = pAllocateInfo->level; - vk_object_base_reset(&cmd_buffer->base); + vk_command_buffer_finish(&cmd_buffer->vk); + VkResult init_result = + vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (init_result != VK_SUCCESS) + result = init_result; pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer); } else { @@ -1537,10 +1599,7 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state)); cmd_buffer->state.index_size = 0xff; /* dirty restart index */ - - cmd_buffer->state.last_vs_params.first_instance = -1; - cmd_buffer->state.last_vs_params.params_offset = -1; - cmd_buffer->state.last_vs_params.vertex_offset = -1; + cmd_buffer->state.line_mode = RECTANGULAR; tu_cache_init(&cmd_buffer->state.cache); tu_cache_init(&cmd_buffer->state.renderpass_cache); @@ -1969,8 +2028,8 @@ tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) | CP_REG_RMW_0_SRC1_ADD); - tu_cs_emit_qw(cs, 0xffffffff); - tu_cs_emit_qw(cs, offset); + tu_cs_emit(cs, 0xffffffff); + tu_cs_emit(cs, offset); } } @@ -2022,8 +2081,8 @@ tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, tu_cs_emit_pkt7(cs, CP_REG_RMW, 3); tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) | CP_REG_RMW_0_SRC1_ADD); - tu_cs_emit_qw(cs, 0xffffffff); - tu_cs_emit_qw(cs, -offset); + tu_cs_emit(cs, 0xffffffff); + tu_cs_emit(cs, -offset); } tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3); @@ -2135,7 +2194,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS); cmd->state.pipeline = pipeline; - cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS | TU_CMD_DIRTY_LRZ; + cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS | + TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_VS_PARAMS; /* note: this also avoids emitting draw states before renderpass clears, * which may use the 3D clear path (for MSAA cases) @@ -2157,6 +2217,21 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]); } + if (cmd->state.line_mode != pipeline->line_mode) { + cmd->state.line_mode = pipeline->line_mode; + + /* We have to disable MSAA when bresenham lines are used, this is + * a hardware limitation and spec 
allows it: + * + * When Bresenham lines are being rasterized, sample locations may + * all be treated as being at the pixel center (this may affect + * attribute and depth interpolation). + */ + if (cmd->state.subpass && cmd->state.subpass->samples) { + tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode); + } + } + /* the vertex_buffers draw state always contains all the currently * bound vertex buffers. update its size to only emit the vbs which * are actually used by the pipeline @@ -2568,6 +2643,14 @@ tu_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, tu_stub(); } +VKAPI_ATTR void VKAPI_CALL +tu_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, + uint32_t lineStippleFactor, + uint16_t lineStipplePattern) +{ + tu_stub(); +} + static void tu_flush_for_access(struct tu_cache_state *cache, enum tu_cmd_access_mask src_mask, @@ -2575,26 +2658,16 @@ tu_flush_for_access(struct tu_cache_state *cache, { enum tu_cmd_flush_bits flush_bits = 0; - if (src_mask & TU_ACCESS_HOST_WRITE) { - /* Host writes are always visible to CP, so only invalidate GPU caches */ - cache->pending_flush_bits |= TU_CMD_FLAG_GPU_INVALIDATE; - } - if (src_mask & TU_ACCESS_SYSMEM_WRITE) { - /* Invalidate CP and 2D engine (make it do WFI + WFM if necessary) as - * well. - */ cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE; } if (src_mask & TU_ACCESS_CP_WRITE) { - /* Flush the CP write queue. However a WFI shouldn't be necessary as - * WAIT_MEM_WRITES should cover it. + /* Flush the CP write queue. */ cache->pending_flush_bits |= TU_CMD_FLAG_WAIT_MEM_WRITES | - TU_CMD_FLAG_GPU_INVALIDATE | - TU_CMD_FLAG_WAIT_FOR_ME; + TU_CMD_FLAG_ALL_INVALIDATE; } #define SRC_FLUSH(domain, flush, invalidate) \ @@ -2624,8 +2697,7 @@ tu_flush_for_access(struct tu_cache_state *cache, /* Treat host & sysmem write accesses the same, since the kernel implicitly * drains the queue before signalling completion to the host. */ - if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE | - TU_ACCESS_HOST_READ | TU_ACCESS_HOST_WRITE)) { + if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) { flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH; } @@ -2656,30 +2728,26 @@ tu_flush_for_access(struct tu_cache_state *cache, #undef DST_INCOHERENT_FLUSH - if (dst_mask & TU_ACCESS_WFI_READ) { - flush_bits |= cache->pending_flush_bits & - (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_IDLE); - } - - if (dst_mask & TU_ACCESS_WFM_READ) { - flush_bits |= cache->pending_flush_bits & - (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_WAIT_FOR_ME); - } - cache->flush_bits |= flush_bits; cache->pending_flush_bits &= ~flush_bits; } -static enum tu_cmd_access_mask -vk2tu_access(VkAccessFlags flags, bool gmem) +static void +tu_flush_for_stage(struct tu_cache_state *cache, + enum tu_stage src_stage, enum tu_stage dst_stage) { - enum tu_cmd_access_mask mask = 0; + /* As far as we know, flushes take place in the last stage so if there are + * any pending flushes then we have to move down the source stage, because + * the data only becomes available when the flush finishes. In particular + * this can matter when the CP writes something and we need to invalidate + * UCHE to read it. + */ + if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE)) + src_stage = TU_STAGE_PS; - /* If the GPU writes a buffer that is then read by an indirect draw - * command, we theoretically need to emit a WFI to wait for any cache - * flushes, and then a WAIT_FOR_ME to wait on the CP for the WFI to - * complete. 
Waiting for the WFI to complete is performed as part of the - * draw by the firmware, so we just need to execute the WFI. + /* Note: if the destination stage is the CP, then the CP also has to wait + * for any WFI's to finish. This is already done for draw calls, including + * before indirect param reads, for the most part, so we just need to WFI. * * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it. @@ -2692,13 +2760,14 @@ vk2tu_access(VkAccessFlags flags, bool gmem) * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit * comparisons, then this will have to be dealt with. */ - if (flags & - (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | - VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | - VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | - VK_ACCESS_MEMORY_READ_BIT)) { - mask |= TU_ACCESS_WFI_READ; - } + if (src_stage > dst_stage) + cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE; +} + +static enum tu_cmd_access_mask +vk2tu_access(VkAccessFlags flags, bool gmem) +{ + enum tu_cmd_access_mask mask = 0; if (flags & (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */ @@ -2717,13 +2786,13 @@ vk2tu_access(VkAccessFlags flags, bool gmem) if (flags & (VK_ACCESS_HOST_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT)) { - mask |= TU_ACCESS_HOST_READ; + mask |= TU_ACCESS_SYSMEM_READ; } if (flags & (VK_ACCESS_HOST_WRITE_BIT | VK_ACCESS_MEMORY_WRITE_BIT)) { - mask |= TU_ACCESS_HOST_WRITE; + mask |= TU_ACCESS_SYSMEM_WRITE; } if (flags & @@ -2792,13 +2861,6 @@ vk2tu_access(VkAccessFlags flags, bool gmem) } } - /* When the dst access is a transfer read/write, it seems we sometimes need - * to insert a WFI after any flushes, to guarantee that the flushes finish - * before the 2D engine starts. However the opposite (i.e. a WFI after - * CP_BLIT and before any subsequent flush) does not seem to be needed, and - * the blob doesn't emit such a WFI. 
- */ - if (flags & (VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_MEMORY_WRITE_BIT)) { @@ -2807,18 +2869,82 @@ vk2tu_access(VkAccessFlags flags, bool gmem) } else { mask |= TU_ACCESS_CCU_COLOR_WRITE; } - mask |= TU_ACCESS_WFI_READ; } if (flags & (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */ VK_ACCESS_MEMORY_READ_BIT)) { - mask |= TU_ACCESS_UCHE_READ | TU_ACCESS_WFI_READ; + mask |= TU_ACCESS_UCHE_READ; } return mask; } +static enum tu_stage +vk2tu_single_stage(VkPipelineStageFlags vk_stage, bool dst) +{ + switch (vk_stage) { + case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT: + case VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT: + case VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT: + return TU_STAGE_CP; + case VK_PIPELINE_STAGE_VERTEX_INPUT_BIT: + return TU_STAGE_FE; + case VK_PIPELINE_STAGE_VERTEX_SHADER_BIT: + case VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT: + case VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT: + case VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT: + return TU_STAGE_SP_VS; + case VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT: + case VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT: + return TU_STAGE_SP_PS; + case VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT: /* Yes, really */ + /* See comment in TU_STAGE_GRAS about early fragment tests */ + case VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT: + case VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT: + case VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT: + case VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT: + return TU_STAGE_PS; + + case VK_PIPELINE_STAGE_TRANSFER_BIT: + /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */ + return dst ? TU_STAGE_SP_PS : TU_STAGE_PS; + + case VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT: + case VK_PIPELINE_STAGE_ALL_COMMANDS_BIT: + /* Be conservative */ + return dst ? TU_STAGE_CP : TU_STAGE_PS; + + case VK_PIPELINE_STAGE_HOST_BIT: + return dst ? 
TU_STAGE_PS : TU_STAGE_CP; + } + + unreachable("unknown pipeline stage"); +} + +static enum tu_stage +vk2tu_src_stage(VkPipelineStageFlags vk_stages) +{ + enum tu_stage stage = TU_STAGE_CP; + u_foreach_bit (bit, vk_stages) { + enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false); + stage = MAX2(stage, new_stage); + } + + return stage; +} + +static enum tu_stage +vk2tu_dst_stage(VkPipelineStageFlags vk_stages) +{ + enum tu_stage stage = TU_STAGE_PS; + u_foreach_bit (bit, vk_stages) { + enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); + stage = MIN2(stage, new_stage); + } + + return stage; +} VKAPI_ATTR void VKAPI_CALL tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, @@ -2907,7 +3033,7 @@ tu_CreateCommandPool(VkDevice _device, pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool), VK_OBJECT_TYPE_COMMAND_POOL); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (pAllocator) pool->alloc = *pAllocator; @@ -3007,6 +3133,10 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE; tu_flush_for_access(cache, src_flags, dst_flags); + + enum tu_stage src_stage = vk2tu_src_stage(barrier->src_stage_mask); + enum tu_stage dst_stage = vk2tu_dst_stage(barrier->dst_stage_mask); + tu_flush_for_stage(cache, src_stage, dst_stage); } VKAPI_ATTR void VKAPI_CALL @@ -3018,11 +3148,33 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass); TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer); + const struct VkRenderPassAttachmentBeginInfo *pAttachmentInfo = + vk_find_struct_const(pRenderPassBegin->pNext, + RENDER_PASS_ATTACHMENT_BEGIN_INFO); + cmd->state.pass = pass; cmd->state.subpass = pass->subpasses; cmd->state.framebuffer = fb; cmd->state.render_area = pRenderPassBegin->renderArea; + cmd->state.attachments = + vk_alloc(&cmd->pool->alloc, pass->attachment_count * + sizeof(cmd->state.attachments[0]), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!cmd->state.attachments) { + cmd->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return; + } + + for (unsigned i = 0; i < pass->attachment_count; i++) { + cmd->state.attachments[i] = pAttachmentInfo ? + tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) : + cmd->state.framebuffer->attachments[i].attachment; + } + + trace_start_render_pass(&cmd->trace, &cmd->cs); + /* Note: because this is external, any flushes will happen before draw_cs * gets called. However deferred flushes could have to happen later as part * of the subpass. 
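The cmd->state.attachments array built above is what backs VK_KHR_imageless_framebuffer: attachment views now come either from the framebuffer or from a VkRenderPassAttachmentBeginInfo chained in by the application. A minimal application-side sketch (all handles hypothetical; this is standard Vulkan 1.2 usage, not turnip-specific):

   VkImageView attachments[] = { color_view, depth_view };

   const VkRenderPassAttachmentBeginInfo attachment_info = {
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_ATTACHMENT_BEGIN_INFO,
      .attachmentCount = 2,
      .pAttachments = attachments,
   };

   const VkRenderPassBeginInfo begin_info = {
      .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
      .pNext = &attachment_info,
      .renderPass = render_pass,
      /* framebuffer created with VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT and a
       * VkFramebufferAttachmentsCreateInfo instead of concrete views */
      .framebuffer = framebuffer,
      .renderArea = render_area,
      .clearValueCount = 2,
      .pClearValues = clear_values,
   };

   const VkSubpassBeginInfo subpass_info = {
      .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
      .contents = VK_SUBPASS_CONTENTS_INLINE,
   };
   vkCmdBeginRenderPass2(cmd_buffer, &begin_info, &subpass_info);
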
@@ -3039,7 +3191,7 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment; if (a != VK_ATTACHMENT_UNUSED) { const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a]; - struct tu_image *image = fb->attachments[a].attachment->image; + struct tu_image *image = cmd->state.attachments[a]->image; /* if image has lrz and it isn't a stencil-only clear: */ if (image->lrz_height && (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) { @@ -3048,19 +3200,27 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN; tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]); - tu6_emit_event_write(cmd, &cmd->cs, PC_CCU_FLUSH_COLOR_TS); + + /* Clearing writes via CCU color in the PS stage, and LRZ is read via + * UCHE in the earlier GRAS stage. + */ + cmd->state.cache.flush_bits |= + TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE | + TU_CMD_FLAG_WAIT_FOR_IDLE; } else { cmd->state.lrz.valid = false; } cmd->state.dirty |= TU_CMD_DIRTY_LRZ; } + cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace); + tu_emit_renderpass_begin(cmd, pRenderPassBegin); tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs); tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs); if (cmd->state.subpass->samples) - tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples); + tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode); tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false); tu_set_input_attachments(cmd, cmd->state.subpass); @@ -3129,7 +3289,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, tu6_emit_zs(cmd, cmd->state.subpass, cs); tu6_emit_mrt(cmd, cmd->state.subpass, cs); if (cmd->state.subpass->samples) - tu6_emit_msaa(cs, cmd->state.subpass->samples); + tu6_emit_msaa(cs, cmd->state.subpass->samples, cmd->state.line_mode); tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false); tu_set_input_attachments(cmd, cmd->state.subpass); @@ -3981,14 +4141,17 @@ tu6_emit_vs_params(struct tu_cmd_buffer *cmd, uint32_t vertex_offset, uint32_t first_instance) { - uint32_t offset = vs_params_offset(cmd); - - if (offset == cmd->state.last_vs_params.params_offset && + /* Beside re-emitting params when they are changed, we should re-emit + * them after constants are invalidated via HLSQ_INVALIDATE_CMD. + */ + if (!(cmd->state.dirty & (TU_CMD_DIRTY_DRAW_STATE | TU_CMD_DIRTY_VS_PARAMS)) && vertex_offset == cmd->state.last_vs_params.vertex_offset && first_instance == cmd->state.last_vs_params.first_instance) { return; } + uint32_t offset = vs_params_offset(cmd); + struct tu_cs cs; VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 
8 : 0), &cs); if (result != VK_SUCCESS) { @@ -4016,7 +4179,6 @@ tu6_emit_vs_params(struct tu_cmd_buffer *cmd, tu_cs_emit(&cs, 0); } - cmd->state.last_vs_params.params_offset = offset; cmd->state.last_vs_params.vertex_offset = vertex_offset; cmd->state.last_vs_params.first_instance = first_instance; @@ -4371,6 +4533,10 @@ static void tu_dispatch(struct tu_cmd_buffer *cmd, const struct tu_dispatch_info *info) { + if (!info->indirect && + (info->blocks[0] == 0 || info->blocks[1] == 0 || info->blocks[2] == 0)) + return; + struct tu_cs *cs = &cmd->cs; struct tu_pipeline *pipeline = cmd->state.compute_pipeline; struct tu_descriptor_state *descriptors_state = @@ -4414,6 +4580,8 @@ tu_dispatch(struct tu_cmd_buffer *cmd, A6XX_HLSQ_CS_KERNEL_GROUP_Y(1), A6XX_HLSQ_CS_KERNEL_GROUP_Z(1)); + trace_start_compute(&cmd->trace, cs); + if (info->indirect) { uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset; @@ -4432,6 +4600,11 @@ tu_dispatch(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2])); } + trace_end_compute(&cmd->trace, cs, + info->indirect != NULL, + local_size[0], local_size[1], local_size[2], + info->blocks[0], info->blocks[1], info->blocks[2]); + tu_cs_emit_wfi(cs); } @@ -4493,6 +4666,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, tu_cs_end(&cmd_buffer->tile_store_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); + cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace); + if (use_sysmem_rendering(cmd_buffer)) tu_cmd_render_sysmem(cmd_buffer); else @@ -4517,9 +4692,12 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, cmd_buffer->state.renderpass_cache.pending_flush_bits; tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true); + vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments); + cmd_buffer->state.pass = NULL; cmd_buffer->state.subpass = NULL; cmd_buffer->state.framebuffer = NULL; + cmd_buffer->state.attachments = NULL; cmd_buffer->state.has_tess = false; cmd_buffer->state.has_subpass_predication = false; cmd_buffer->state.disable_gmem = false; @@ -4625,6 +4803,10 @@ tu_barrier(struct tu_cmd_buffer *cmd, cmd->state.pass ? &cmd->state.renderpass_cache : &cmd->state.cache; tu_flush_for_access(cache, src_flags, dst_flags); + enum tu_stage src_stage = vk2tu_src_stage(info->srcStageMask); + enum tu_stage dst_stage = vk2tu_dst_stage(info->dstStageMask); + tu_flush_for_stage(cache, src_stage, dst_stage); + for (uint32_t i = 0; i < info->eventCount; i++) { TU_FROM_HANDLE(tu_event, event, info->pEvents[i]); diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_cs.c b/mesa 3D driver/src/freedreno/vulkan/tu_cs.c index 59f7f70229..8372af0080 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_cs.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_cs.c @@ -45,10 +45,12 @@ tu_cs_init(struct tu_cs *cs, * Initialize a command stream as a wrapper to an external buffer. 
*/ void -tu_cs_init_external(struct tu_cs *cs, uint32_t *start, uint32_t *end) +tu_cs_init_external(struct tu_cs *cs, struct tu_device *device, + uint32_t *start, uint32_t *end) { memset(cs, 0, sizeof(*cs)); + cs->device = device; cs->mode = TU_CS_MODE_EXTERNAL; cs->start = cs->reserved_end = cs->cur = start; cs->end = end; @@ -252,7 +254,7 @@ tu_cs_begin_sub_stream(struct tu_cs *cs, uint32_t size, struct tu_cs *sub_cs) if (result != VK_SUCCESS) return result; - tu_cs_init_external(sub_cs, cs->cur, cs->reserved_end); + tu_cs_init_external(sub_cs, cs->device, cs->cur, cs->reserved_end); tu_cs_begin(sub_cs); result = tu_cs_reserve_space(sub_cs, size); assert(result == VK_SUCCESS); diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_cs.h b/mesa 3D driver/src/freedreno/vulkan/tu_cs.h index e606ab465a..494d9d8fcb 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_cs.h +++ b/mesa 3D driver/src/freedreno/vulkan/tu_cs.h @@ -36,7 +36,8 @@ tu_cs_init(struct tu_cs *cs, uint32_t initial_size); void -tu_cs_init_external(struct tu_cs *cs, uint32_t *start, uint32_t *end); +tu_cs_init_external(struct tu_cs *cs, struct tu_device *device, + uint32_t *start, uint32_t *end); void tu_cs_finish(struct tu_cs *cs); @@ -79,7 +80,7 @@ tu_cs_draw_state(struct tu_cs *sub_cs, struct tu_cs *cs, uint32_t size) /* TODO: clean this up */ tu_cs_alloc(sub_cs, size, 1, &memory); - tu_cs_init_external(cs, memory.map, memory.map + size); + tu_cs_init_external(cs, sub_cs->device, memory.map, memory.map + size); tu_cs_begin(cs); tu_cs_reserve_space(cs, size); diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_descriptor_set.c b/mesa 3D driver/src/freedreno/vulkan/tu_descriptor_set.c index 5d085d1319..b99d8eafed 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_descriptor_set.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_descriptor_set.c @@ -152,7 +152,7 @@ tu_CreateDescriptorSetLayout( set_layout = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT); if (!set_layout) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); set_layout->flags = pCreateInfo->flags; @@ -166,7 +166,7 @@ tu_CreateDescriptorSetLayout( pCreateInfo->pBindings, pCreateInfo->bindingCount, &bindings); if (result != VK_SUCCESS) { vk_object_free(&device->vk, pAllocator, set_layout); - return vk_error(device->instance, result); + return vk_error(device, result); } set_layout->binding_count = num_bindings; @@ -388,7 +388,7 @@ tu_CreatePipelineLayout(VkDevice _device, layout = vk_object_alloc(&device->vk, pAllocator, sizeof(*layout), VK_OBJECT_TYPE_PIPELINE_LAYOUT); if (layout == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); layout->num_sets = pCreateInfo->setLayoutCount; layout->dynamic_offset_count = 0; @@ -448,7 +448,7 @@ tu_descriptor_set_create(struct tu_device *device, if (pool->host_memory_base) { if (pool->host_memory_end - pool->host_memory_ptr < mem_size) - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); set = (struct tu_descriptor_set*)pool->host_memory_ptr; pool->host_memory_ptr += mem_size; @@ -457,7 +457,7 @@ tu_descriptor_set_create(struct tu_device *device, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!set) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } memset(set, 0, mem_size); @@ -482,7 +482,7 @@ 
tu_descriptor_set_create(struct tu_device *device, if (!pool->host_memory_base && pool->entry_count == pool->max_entry_count) { vk_object_free(&device->vk, NULL, set); - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); } /* try to allocate linearly first, so that we don't spend @@ -511,7 +511,7 @@ tu_descriptor_set_create(struct tu_device *device, if (pool->size - offset < layout_size) { vk_object_free(&device->vk, NULL, set); - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); } set->mapped_ptr = (uint32_t*)(pool_base(pool) + offset); @@ -524,7 +524,7 @@ tu_descriptor_set_create(struct tu_device *device, pool->entries[index].set = set; pool->entry_count++; } else - return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_POOL_MEMORY); } if (layout->has_immutable_samplers) { @@ -635,7 +635,7 @@ tu_CreateDescriptorPool(VkDevice _device, pool = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_DESCRIPTOR_POOL); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) { pool->host_memory_base = (uint8_t*)pool + sizeof(struct tu_descriptor_pool); @@ -875,9 +875,9 @@ write_image_descriptor(uint32_t *dst, TU_FROM_HANDLE(tu_image_view, iview, image_info->imageView); if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) { - memcpy(dst, iview->storage_descriptor, sizeof(iview->storage_descriptor)); + memcpy(dst, iview->view.storage_descriptor, sizeof(iview->view.storage_descriptor)); } else { - memcpy(dst, iview->descriptor, sizeof(iview->descriptor)); + memcpy(dst, iview->view.descriptor, sizeof(iview->view.descriptor)); } } @@ -887,11 +887,10 @@ write_combined_image_sampler_descriptor(uint32_t *dst, const VkDescriptorImageInfo *image_info, bool has_sampler) { - TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); - write_image_descriptor(dst, descriptor_type, image_info); /* copy over sampler state */ if (has_sampler) { + TU_FROM_HANDLE(tu_sampler, sampler, image_info->sampler); memcpy(dst + A6XX_TEX_CONST_DWORDS, sampler->descriptor, sizeof(sampler->descriptor)); } } @@ -1074,7 +1073,7 @@ tu_CreateDescriptorUpdateTemplate( templ = vk_object_alloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE); if (!templ) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); templ->entry_count = entry_count; @@ -1254,7 +1253,7 @@ tu_CreateSamplerYcbcrConversion( conversion = vk_object_alloc(&device->vk, pAllocator, sizeof(*conversion), VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION); if (!conversion) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); conversion->format = pCreateInfo->format; conversion->ycbcr_model = pCreateInfo->ycbcrModel; diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_device.c b/mesa 3D driver/src/freedreno/vulkan/tu_device.c index 983b3c10f4..0811259dd4 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_device.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_device.c @@ -27,6 +27,7 @@ #include "tu_private.h" #include "tu_cs.h" +#include "git_sha1.h" #include #include @@ -37,6 +38,7 @@ #include "util/debug.h" #include "util/disk_cache.h" +#include "util/driconf.h" #include 
"util/u_atomic.h" #include "vk_format.h" #include "vk_util.h" @@ -128,6 +130,7 @@ get_device_extensions(const struct tu_physical_device *device, .KHR_external_semaphore = true, .KHR_external_semaphore_fd = true, .KHR_get_memory_requirements2 = true, + .KHR_imageless_framebuffer = true, .KHR_incremental_present = TU_HAS_SURFACE, .KHR_image_format_list = true, .KHR_maintenance1 = true, @@ -143,17 +146,25 @@ get_device_extensions(const struct tu_physical_device *device, .KHR_shader_draw_parameters = true, .KHR_shader_float_controls = true, .KHR_shader_float16_int8 = true, + .KHR_shader_subgroup_extended_types = true, .KHR_shader_terminate_invocation = true, .KHR_spirv_1_4 = true, .KHR_storage_buffer_storage_class = true, .KHR_swapchain = TU_HAS_SURFACE, + .KHR_uniform_buffer_standard_layout = true, .KHR_variable_pointers = true, .KHR_vulkan_memory_model = true, #ifndef TU_USE_KGSL .KHR_timeline_semaphore = true, #endif #ifdef VK_USE_PLATFORM_DISPLAY_KHR - .EXT_display_control = true, + /* This extension is supported by common code across drivers, but it is + * missing some core functionality and fails + * dEQP-VK.wsi.display_control.register_device_event. Once some variant of + * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12305 lands, + * then we can re-enable it. + */ + /* .EXT_display_control = true, */ #endif .EXT_external_memory_dma_buf = true, .EXT_image_drm_format_modifier = true, @@ -180,6 +191,7 @@ get_device_extensions(const struct tu_physical_device *device, .EXT_shader_viewport_index_layer = true, .EXT_vertex_attribute_divisor = true, .EXT_provoking_vertex = true, + .EXT_line_rasterization = true, #ifdef ANDROID .ANDROID_native_buffer = true, #endif @@ -194,13 +206,26 @@ tu_physical_device_init(struct tu_physical_device *device, { VkResult result = VK_SUCCESS; - device->name = fd_dev_name(&device->dev_id); + const char *fd_name = fd_dev_name(&device->dev_id); + if (strncmp(fd_name, "FD", 2) == 0) { + device->name = vk_asprintf(&instance->vk.alloc, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE, + "Turnip Adreno (TM) %s", &fd_name[2]); + } else { + device->name = vk_strdup(&instance->vk.alloc, fd_name, + VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE); + + } + if (!device->name) { + return vk_startup_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY, + "device name alloc fail"); + } const struct fd_dev_info *info = fd_dev_info(&device->dev_id); if (!info) { result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "device %s is unsupported", device->name); - return result; + goto fail_free_name; } switch (fd_dev_gen(&device->dev_id)) { case 6: @@ -212,12 +237,12 @@ tu_physical_device_init(struct tu_physical_device *device, default: result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "device %s is unsupported", device->name); - return result; + goto fail_free_name; } if (tu_device_get_cache_uuid(fd_dev_gpu_id(&device->dev_id), device->cache_uuid)) { result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED, "cannot generate UUID"); - return result; + goto fail_free_name; } /* The gpu id is already embedded in the uuid so we just pass "tu" @@ -238,23 +263,31 @@ tu_physical_device_init(struct tu_physical_device *device, struct vk_physical_device_dispatch_table dispatch_table; vk_physical_device_dispatch_table_from_entrypoints( &dispatch_table, &tu_physical_device_entrypoints, true); + vk_physical_device_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_physical_device_entrypoints, false); result = vk_physical_device_init(&device->vk, &instance->vk, 
&supported_extensions, &dispatch_table); if (result != VK_SUCCESS) - return result; + goto fail_free_cache; #if TU_HAS_SURFACE result = tu_wsi_init(device); if (result != VK_SUCCESS) { vk_startup_errorf(instance, result, "WSI init failure"); vk_physical_device_finish(&device->vk); - return result; + goto fail_free_cache; } #endif return VK_SUCCESS; + +fail_free_cache: + disk_cache_destroy(device->disk_cache); +fail_free_name: + vk_free(&instance->vk.alloc, (void *)device->name); + return result; } static void @@ -269,6 +302,8 @@ tu_physical_device_finish(struct tu_physical_device *device) if (device->master_fd != -1) close(device->master_fd); + vk_free(&device->instance->vk.alloc, (void *)device->name); + vk_physical_device_finish(&device->vk); } @@ -284,6 +319,7 @@ static const struct debug_control tu_debug_options[] = { { "perfc", TU_DEBUG_PERFC }, { "flushall", TU_DEBUG_FLUSHALL }, { "syncdraw", TU_DEBUG_SYNCDRAW }, + { "dontcare_as_load", TU_DEBUG_DONT_CARE_AS_LOAD }, { NULL, 0 } }; @@ -294,6 +330,33 @@ tu_get_debug_option_name(int id) return tu_debug_options[id].string; } +static const driOptionDescription tu_dri_options[] = { + DRI_CONF_SECTION_PERFORMANCE + DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0) + DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false) + DRI_CONF_VK_X11_ENSURE_MIN_IMAGE_COUNT(false) + DRI_CONF_VK_XWAYLAND_WAIT_READY(true) + DRI_CONF_SECTION_END + + DRI_CONF_SECTION_DEBUG + DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false) + DRI_CONF_VK_DONT_CARE_AS_LOAD(false) + DRI_CONF_SECTION_END +}; + +static void +tu_init_dri_options(struct tu_instance *instance) +{ + driParseOptionInfo(&instance->available_dri_options, tu_dri_options, + ARRAY_SIZE(tu_dri_options)); + driParseConfigFiles(&instance->dri_options, &instance->available_dri_options, 0, "turnip", NULL, NULL, + instance->vk.app_info.app_name, instance->vk.app_info.app_version, + instance->vk.app_info.engine_name, instance->vk.app_info.engine_version); + + if (driQueryOptionb(&instance->dri_options, "vk_dont_care_as_load")) + instance->debug_flags |= TU_DEBUG_DONT_CARE_AS_LOAD; +} + VKAPI_ATTR VkResult VKAPI_CALL tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -316,6 +379,8 @@ tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, struct vk_instance_dispatch_table dispatch_table; vk_instance_dispatch_table_from_entrypoints( &dispatch_table, &tu_instance_entrypoints, true); + vk_instance_dispatch_table_from_entrypoints( + &dispatch_table, &wsi_instance_entrypoints, false); result = vk_instance_init(&instance->vk, &tu_instance_extensions_supported, @@ -344,8 +409,14 @@ tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo, VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false)); + tu_init_dri_options(instance); + *pInstance = tu_instance_to_handle(instance); +#ifdef HAVE_PERFETTO + tu_perfetto_init(); +#endif + return VK_SUCCESS; } @@ -364,6 +435,9 @@ tu_DestroyInstance(VkInstance _instance, VG(VALGRIND_DESTROY_MEMPOOL(instance)); + driDestroyOptionCache(&instance->dri_options); + driDestroyOptionInfo(&instance->available_dri_options); + vk_instance_finish(&instance->vk); vk_free(&instance->vk.alloc, instance); } @@ -424,7 +498,80 @@ tu_EnumeratePhysicalDeviceGroups( return vk_outarray_status(&out); } -VKAPI_ATTR void VKAPI_CALL +static void +tu_get_physical_device_features_1_1(struct tu_physical_device *pdevice, + VkPhysicalDeviceVulkan11Features *features) +{ + features->storageBuffer16BitAccess = pdevice->info->a6xx.storage_16bit; + 
features->uniformAndStorageBuffer16BitAccess = false; + features->storagePushConstant16 = false; + features->storageInputOutput16 = false; + features->multiview = true; + features->multiviewGeometryShader = false; + features->multiviewTessellationShader = false; + features->variablePointersStorageBuffer = true; + features->variablePointers = true; + features->protectedMemory = false; + features->samplerYcbcrConversion = true; + features->shaderDrawParameters = true; +} + +static void +tu_get_physical_device_features_1_2(struct tu_physical_device *pdevice, + VkPhysicalDeviceVulkan12Features *features) +{ + features->samplerMirrorClampToEdge = true; + features->drawIndirectCount = true; + features->storageBuffer8BitAccess = false; + features->uniformAndStorageBuffer8BitAccess = false; + features->storagePushConstant8 = false; + features->shaderBufferInt64Atomics = false; + features->shaderSharedInt64Atomics = false; + features->shaderFloat16 = true; + features->shaderInt8 = false; + + features->descriptorIndexing = true; + features->shaderInputAttachmentArrayDynamicIndexing = false; + features->shaderUniformTexelBufferArrayDynamicIndexing = true; + features->shaderStorageTexelBufferArrayDynamicIndexing = true; + features->shaderUniformBufferArrayNonUniformIndexing = true; + features->shaderSampledImageArrayNonUniformIndexing = true; + features->shaderStorageBufferArrayNonUniformIndexing = true; + features->shaderStorageImageArrayNonUniformIndexing = true; + features->shaderInputAttachmentArrayNonUniformIndexing = false; + features->shaderUniformTexelBufferArrayNonUniformIndexing = true; + features->shaderStorageTexelBufferArrayNonUniformIndexing = true; + features->descriptorBindingUniformBufferUpdateAfterBind = false; + features->descriptorBindingSampledImageUpdateAfterBind = true; + features->descriptorBindingStorageImageUpdateAfterBind = true; + features->descriptorBindingStorageBufferUpdateAfterBind = true; + features->descriptorBindingUniformTexelBufferUpdateAfterBind = true; + features->descriptorBindingStorageTexelBufferUpdateAfterBind = true; + features->descriptorBindingUpdateUnusedWhilePending = true; + features->descriptorBindingPartiallyBound = true; + features->descriptorBindingVariableDescriptorCount = true; + features->runtimeDescriptorArray = true; + + features->samplerFilterMinmax = true; + features->scalarBlockLayout = true; + features->imagelessFramebuffer = true; + features->uniformBufferStandardLayout = true; + features->shaderSubgroupExtendedTypes = true; + features->separateDepthStencilLayouts = false; + features->hostQueryReset = true; + features->timelineSemaphore = true; + features->bufferDeviceAddress = false; + features->bufferDeviceAddressCaptureReplay = false; + features->bufferDeviceAddressMultiDevice = false; + features->vulkanMemoryModel = true; + features->vulkanMemoryModelDeviceScope = true; + features->vulkanMemoryModelAvailabilityVisibilityChains = true; + features->shaderOutputViewportIndex = true; + features->shaderOutputLayer = true; + features->subgroupBroadcastDynamicId = false; +} + +void tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2 *pFeatures) { @@ -478,144 +625,24 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, .inheritedQueries = true, }; + VkPhysicalDeviceVulkan11Features core_1_1 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES, + }; + tu_get_physical_device_features_1_1(pdevice, &core_1_1); + + VkPhysicalDeviceVulkan12Features core_1_2 = { + .sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES, + }; + tu_get_physical_device_features_1_2(pdevice, &core_1_2); + vk_foreach_struct(ext, pFeatures->pNext) { + if (vk_get_physical_device_core_1_1_feature_ext(ext, &core_1_1)) + continue; + if (vk_get_physical_device_core_1_2_feature_ext(ext, &core_1_2)) + continue; + switch (ext->sType) { - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES: { - VkPhysicalDeviceVulkan11Features *features = (void *) ext; - features->storageBuffer16BitAccess = pdevice->info->a6xx.storage_16bit; - features->uniformAndStorageBuffer16BitAccess = false; - features->storagePushConstant16 = false; - features->storageInputOutput16 = false; - features->multiview = true; - features->multiviewGeometryShader = false; - features->multiviewTessellationShader = false; - features->variablePointersStorageBuffer = true; - features->variablePointers = true; - features->protectedMemory = false; - features->samplerYcbcrConversion = true; - features->shaderDrawParameters = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES: { - VkPhysicalDeviceVulkan12Features *features = (void *) ext; - features->samplerMirrorClampToEdge = true; - features->drawIndirectCount = true; - features->storageBuffer8BitAccess = false; - features->uniformAndStorageBuffer8BitAccess = false; - features->storagePushConstant8 = false; - features->shaderBufferInt64Atomics = false; - features->shaderSharedInt64Atomics = false; - features->shaderFloat16 = true; - features->shaderInt8 = false; - - features->descriptorIndexing = true; - features->shaderInputAttachmentArrayDynamicIndexing = false; - features->shaderUniformTexelBufferArrayDynamicIndexing = true; - features->shaderStorageTexelBufferArrayDynamicIndexing = true; - features->shaderUniformBufferArrayNonUniformIndexing = true; - features->shaderSampledImageArrayNonUniformIndexing = true; - features->shaderStorageBufferArrayNonUniformIndexing = true; - features->shaderStorageImageArrayNonUniformIndexing = true; - features->shaderInputAttachmentArrayNonUniformIndexing = false; - features->shaderUniformTexelBufferArrayNonUniformIndexing = true; - features->shaderStorageTexelBufferArrayNonUniformIndexing = true; - features->descriptorBindingUniformBufferUpdateAfterBind = false; - features->descriptorBindingSampledImageUpdateAfterBind = true; - features->descriptorBindingStorageImageUpdateAfterBind = true; - features->descriptorBindingStorageBufferUpdateAfterBind = true; - features->descriptorBindingUniformTexelBufferUpdateAfterBind = true; - features->descriptorBindingStorageTexelBufferUpdateAfterBind = true; - features->descriptorBindingUpdateUnusedWhilePending = true; - features->descriptorBindingPartiallyBound = true; - features->descriptorBindingVariableDescriptorCount = true; - features->runtimeDescriptorArray = true; - - features->samplerFilterMinmax = true; - features->scalarBlockLayout = true; - features->imagelessFramebuffer = false; - features->uniformBufferStandardLayout = false; - features->shaderSubgroupExtendedTypes = false; - features->separateDepthStencilLayouts = false; - features->hostQueryReset = true; - features->timelineSemaphore = true; - features->bufferDeviceAddress = false; - features->bufferDeviceAddressCaptureReplay = false; - features->bufferDeviceAddressMultiDevice = false; - features->vulkanMemoryModel = true; - features->vulkanMemoryModelDeviceScope = true; - features->vulkanMemoryModelAvailabilityVisibilityChains = true; - features->shaderOutputViewportIndex = true; - 
features->shaderOutputLayer = true; - features->subgroupBroadcastDynamicId = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: { - VkPhysicalDeviceVariablePointersFeatures *features = (void *) ext; - features->variablePointersStorageBuffer = true; - features->variablePointers = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: { - VkPhysicalDeviceMultiviewFeatures *features = - (VkPhysicalDeviceMultiviewFeatures *) ext; - features->multiview = true; - features->multiviewGeometryShader = false; - features->multiviewTessellationShader = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: { - VkPhysicalDeviceShaderDrawParametersFeatures *features = - (VkPhysicalDeviceShaderDrawParametersFeatures *) ext; - features->shaderDrawParameters = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: { - VkPhysicalDeviceProtectedMemoryFeatures *features = - (VkPhysicalDeviceProtectedMemoryFeatures *) ext; - features->protectedMemory = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: { - VkPhysicalDevice16BitStorageFeatures *features = - (VkPhysicalDevice16BitStorageFeatures *) ext; - features->storageBuffer16BitAccess = pdevice->info->a6xx.storage_16bit; - features->uniformAndStorageBuffer16BitAccess = false; - features->storagePushConstant16 = false; - features->storageInputOutput16 = false; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { - VkPhysicalDeviceSamplerYcbcrConversionFeatures *features = - (VkPhysicalDeviceSamplerYcbcrConversionFeatures *) ext; - features->samplerYcbcrConversion = true; - break; - } - case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: { - VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features = - (VkPhysicalDeviceDescriptorIndexingFeaturesEXT *) ext; - features->shaderInputAttachmentArrayDynamicIndexing = false; - features->shaderUniformTexelBufferArrayDynamicIndexing = true; - features->shaderStorageTexelBufferArrayDynamicIndexing = true; - features->shaderUniformBufferArrayNonUniformIndexing = true; - features->shaderSampledImageArrayNonUniformIndexing = true; - features->shaderStorageBufferArrayNonUniformIndexing = true; - features->shaderStorageImageArrayNonUniformIndexing = true; - features->shaderInputAttachmentArrayNonUniformIndexing = false; - features->shaderUniformTexelBufferArrayNonUniformIndexing = true; - features->shaderStorageTexelBufferArrayNonUniformIndexing = true; - features->descriptorBindingUniformBufferUpdateAfterBind = false; - features->descriptorBindingSampledImageUpdateAfterBind = true; - features->descriptorBindingStorageImageUpdateAfterBind = true; - features->descriptorBindingStorageBufferUpdateAfterBind = true; - features->descriptorBindingUniformTexelBufferUpdateAfterBind = true; - features->descriptorBindingStorageTexelBufferUpdateAfterBind = true; - features->descriptorBindingUpdateUnusedWhilePending = true; - features->descriptorBindingPartiallyBound = true; - features->descriptorBindingVariableDescriptorCount = true; - features->runtimeDescriptorArray = true; - break; - } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: { VkPhysicalDeviceConditionalRenderingFeaturesEXT *features = (VkPhysicalDeviceConditionalRenderingFeaturesEXT *) ext; @@ -667,12 +694,6 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->customBorderColorWithoutFormat = true; 
          break;
       }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT: {
-         VkPhysicalDeviceHostQueryResetFeaturesEXT *features =
-            (VkPhysicalDeviceHostQueryResetFeaturesEXT *)ext;
-         features->hostQueryReset = true;
-         break;
-      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: {
          VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features = (void *)ext;
          features->extendedDynamicState = true;
@@ -730,14 +751,6 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          features->shaderTerminateInvocation = true;
          break;
       }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR: {
-         VkPhysicalDeviceVulkanMemoryModelFeaturesKHR *feature =
-            (VkPhysicalDeviceVulkanMemoryModelFeaturesKHR *)ext;
-         feature->vulkanMemoryModel = true;
-         feature->vulkanMemoryModelDeviceScope = true;
-         feature->vulkanMemoryModelAvailabilityVisibilityChains = true;
-         break;
-      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES: {
          VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *features =
            (VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *) ext;
@@ -757,6 +770,17 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          features->mutableDescriptorType = true;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: {
+         VkPhysicalDeviceLineRasterizationFeaturesEXT *features =
+            (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext;
+         features->rectangularLines = true;
+         features->bresenhamLines = true;
+         features->smoothLines = false;
+         features->stippledRectangularLines = false;
+         features->stippledBresenhamLines = false;
+         features->stippledSmoothLines = false;
+         break;
+      }
       default:
          break;
@@ -764,21 +788,137 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
       }
    }
 }
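The VULKAN_1_1/VULKAN_1_2 and per-extension feature cases removed above are now answered by the shared vk_get_physical_device_core_*_feature_ext() helpers. For orientation, a hedged sketch of the caller side (plain Vulkan 1.2 API, not part of this patch; physical_device is assumed to be a valid VkPhysicalDevice):

   /* Sketch: an application chaining the promoted-feature struct that the
    * loop above now fills in from the core_1_2 template. */
   VkPhysicalDeviceVulkan12Features feat12 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
   };
   VkPhysicalDeviceFeatures2 feat2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &feat12,
   };
   vkGetPhysicalDeviceFeatures2(physical_device, &feat2);
   /* feat12.hostQueryReset, feat12.vulkanMemoryModel, etc. now carry the
    * same answers the removed switch cases used to produce. */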
+
+static void
+tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
+                                      VkPhysicalDeviceVulkan11Properties *p)
+{
+   assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES);
+
+   memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
+   memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
+   memset(p->deviceLUID, 0, VK_LUID_SIZE);
+   p->deviceNodeMask = 0;
+   p->deviceLUIDValid = false;
+
+   p->subgroupSize = 128;
+   p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
+   p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
+                                    VK_SUBGROUP_FEATURE_VOTE_BIT |
+                                    VK_SUBGROUP_FEATURE_BALLOT_BIT;
+   p->subgroupQuadOperationsInAllStages = false;
+
+   p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES;
+   p->maxMultiviewViewCount = MAX_VIEWS;
+   p->maxMultiviewInstanceIndex = INT_MAX;
+   p->protectedNoFault = false;
+   /* Make sure everything is addressable by a signed 32-bit int, and
+    * our largest descriptors are 96 bytes.
+    */
+   p->maxPerSetDescriptors = (1ull << 31) / 96;
+   /* Our buffer size fields allow only this much */
+   p->maxMemoryAllocationSize = 0xFFFFFFFFull;
+
+}
+
+
+/* I have no idea what the maximum size is, but the hardware supports very
+ * large numbers of descriptors (at least 2^16). This limit is based on
+ * CP_LOAD_STATE6, which has a 28-bit field for the DWORD offset, so that
+ * we don't have to think about what to do if that overflows, but really
+ * nothing is likely to get close to this.
+ */
+static const size_t max_descriptor_set_size = (1 << 28) / A6XX_TEX_CONST_DWORDS;
+static const VkSampleCountFlags sample_counts =
+   VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT;
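To make the comment concrete: assuming A6XX_TEX_CONST_DWORDS is 16 (the value is defined elsewhere in the driver, so this is an assumption here), the 28-bit DWORD offset of CP_LOAD_STATE6 caps a descriptor set at 2^28 / 16 entries:

   /* Worked check of the limit above, under the 16-dwords-per-descriptor
    * assumption: 2^28 dwords / 16 dwords per descriptor = 16,777,216. */
   _Static_assert((1 << 28) / 16 == 16777216,
                  "28-bit dword offset, 16-dword descriptors");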
+
+static void
+tu_get_physical_device_properties_1_2(struct tu_physical_device *pdevice,
+                                      VkPhysicalDeviceVulkan12Properties *p)
+{
+   assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES);
+
+   p->driverID = VK_DRIVER_ID_MESA_TURNIP;
+   memset(p->driverName, 0, sizeof(p->driverName));
+   snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE_KHR,
+            "turnip Mesa driver");
+   memset(p->driverInfo, 0, sizeof(p->driverInfo));
+   snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE_KHR,
+            "Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
+   /* XXX: VK 1.2: Need to pass conformance. */
+   p->conformanceVersion = (VkConformanceVersionKHR) {
+      .major = 0,
+      .minor = 0,
+      .subminor = 0,
+      .patch = 0,
+   };
+
+   p->denormBehaviorIndependence =
+      VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
+   p->roundingModeIndependence =
+      VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
+
+   p->shaderDenormFlushToZeroFloat16 = true;
+   p->shaderDenormPreserveFloat16 = false;
+   p->shaderRoundingModeRTEFloat16 = true;
+   p->shaderRoundingModeRTZFloat16 = false;
+   p->shaderSignedZeroInfNanPreserveFloat16 = true;
+
+   p->shaderDenormFlushToZeroFloat32 = true;
+   p->shaderDenormPreserveFloat32 = false;
+   p->shaderRoundingModeRTEFloat32 = true;
+   p->shaderRoundingModeRTZFloat32 = false;
+   p->shaderSignedZeroInfNanPreserveFloat32 = true;
+
+   p->shaderDenormFlushToZeroFloat64 = false;
+   p->shaderDenormPreserveFloat64 = false;
+   p->shaderRoundingModeRTEFloat64 = false;
+   p->shaderRoundingModeRTZFloat64 = false;
+   p->shaderSignedZeroInfNanPreserveFloat64 = false;
+
+   p->shaderUniformBufferArrayNonUniformIndexingNative = true;
+   p->shaderSampledImageArrayNonUniformIndexingNative = true;
+   p->shaderStorageBufferArrayNonUniformIndexingNative = true;
+   p->shaderStorageImageArrayNonUniformIndexingNative = true;
+   p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
+   p->robustBufferAccessUpdateAfterBind = false;
+   p->quadDivergentImplicitLod = false;
+
+   p->maxUpdateAfterBindDescriptorsInAllPools = max_descriptor_set_size;
+   p->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
+   p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
+   p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
+   p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_descriptor_set_size;
+   p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_descriptor_set_size;
+   p->maxPerStageDescriptorUpdateAfterBindInputAttachments = max_descriptor_set_size;
+   p->maxPerStageUpdateAfterBindResources = max_descriptor_set_size;
+   p->maxDescriptorSetUpdateAfterBindSamplers = max_descriptor_set_size;
+   p->maxDescriptorSetUpdateAfterBindUniformBuffers = max_descriptor_set_size;
+   p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_UNIFORM_BUFFERS;
+   p->maxDescriptorSetUpdateAfterBindStorageBuffers = max_descriptor_set_size;
+   p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS;
+   p->maxDescriptorSetUpdateAfterBindSampledImages = max_descriptor_set_size;
+   p->maxDescriptorSetUpdateAfterBindStorageImages = max_descriptor_set_size;
+   p->maxDescriptorSetUpdateAfterBindInputAttachments = max_descriptor_set_size;
+
+   p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT;
+   p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT;
+   p->independentResolveNone = false;
+   p->independentResolve = false;
+
+   p->filterMinmaxSingleComponentFormats = true;
+   p->filterMinmaxImageComponentMapping = true;
+
+   p->maxTimelineSemaphoreValueDifference = UINT64_MAX;
+
+   p->framebufferIntegerColorSampleCounts = sample_counts;
+}
+
 VKAPI_ATTR void VKAPI_CALL
 tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
                                 VkPhysicalDeviceProperties2 *pProperties)
 {
    TU_FROM_HANDLE(tu_physical_device, pdevice, physicalDevice);
-   VkSampleCountFlags sample_counts =
-      VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT;
-
-   /* I have no idea what the maximum size is, but the hardware supports very
-    * large numbers of descriptors (at least 2^16). This limit is based on
-    * CP_LOAD_STATE6, which has a 28-bit field for the DWORD offset, so that
-    * we don't have to think about what to do if that overflows, but really
-    * nothing is likely to get close to this.
-    */
-   const size_t max_descriptor_set_size = (1 << 28) / A6XX_TEX_CONST_DWORDS;
 
    VkPhysicalDeviceLimits limits = {
       .maxImageDimension1D = (1 << 14),
@@ -844,7 +984,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       .maxSamplerLodBias = 4095.0 / 256.0, /* [-16, 15.99609375] */
       .maxSamplerAnisotropy = 16,
       .maxViewports = MAX_VIEWPORTS,
-      .maxViewportDimensions = { (1 << 14), (1 << 14) },
+      .maxViewportDimensions = { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
       .viewportBoundsRange = { INT16_MIN, INT16_MAX },
       .viewportSubPixelBits = 8,
       .minMemoryMapAlignment = 4096, /* A page */
@@ -882,7 +1022,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       .lineWidthRange = { 1.0, 1.0 },
       .pointSizeGranularity = 0.0625,
       .lineWidthGranularity = 0.0,
-      .strictLines = false, /* FINISHME */
+      .strictLines = true,
       .standardSampleLocations = true,
       .optimalBufferCopyOffsetAlignment = 128,
       .optimalBufferCopyRowPitchAlignment = 128,
@@ -892,8 +1032,8 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
    pProperties->properties = (VkPhysicalDeviceProperties) {
       .apiVersion = TU_API_VERSION,
       .driverVersion = vk_get_driver_version(),
-      .vendorID = 0, /* TODO */
-      .deviceID = 0,
+      .vendorID = 0x5143,
+      .deviceID = pdevice->dev_id.chip_id,
       .deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
       .limits = limits,
       .sparseProperties = { 0 },
@@ -902,8 +1042,23 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
    strcpy(pProperties->properties.deviceName, pdevice->name);
    memcpy(pProperties->properties.pipelineCacheUUID, pdevice->cache_uuid, VK_UUID_SIZE);
 
+   VkPhysicalDeviceVulkan11Properties core_1_1 = {
+      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES,
+   };
+   tu_get_physical_device_properties_1_1(pdevice, &core_1_1);
+
+   VkPhysicalDeviceVulkan12Properties core_1_2 = {
+      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES,
+   };
+   tu_get_physical_device_properties_1_2(pdevice, &core_1_2);
+
    vk_foreach_struct(ext, pProperties->pNext) {
+      if (vk_get_physical_device_core_1_1_property_ext(ext, &core_1_1))
+         continue;
+      if (vk_get_physical_device_core_1_2_property_ext(ext, &core_1_2))
+         continue;
+
       switch (ext->sType) {
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: {
          VkPhysicalDevicePushDescriptorPropertiesKHR *properties =
@@ -911,38 +1066,6 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
          break;
       }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: {
-         VkPhysicalDeviceIDProperties *properties =
-            (VkPhysicalDeviceIDProperties *) ext;
-         memcpy(properties->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
-         memcpy(properties->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
-         properties->deviceLUIDValid = false;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: {
-         VkPhysicalDeviceMultiviewProperties *properties =
-            (VkPhysicalDeviceMultiviewProperties *) ext;
-         properties->maxMultiviewViewCount = MAX_VIEWS;
-         properties->maxMultiviewInstanceIndex = INT_MAX;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: {
-         VkPhysicalDevicePointClippingProperties *properties =
-            (VkPhysicalDevicePointClippingProperties *) ext;
-         properties->pointClippingBehavior =
-            VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: {
-         VkPhysicalDeviceMaintenance3Properties *properties =
-            (VkPhysicalDeviceMaintenance3Properties *) ext;
-         /* Make sure everything is addressable by a signed 32-bit int, and
-          * our largest descriptors are 96 bytes. */
-         properties->maxPerSetDescriptors = (1ull << 31) / 96;
-         /* Our buffer size fields allow only this much */
-         properties->maxMemoryAllocationSize = 0xFFFFFFFFull;
-         break;
-      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
         VkPhysicalDeviceTransformFeedbackPropertiesEXT *properties =
            (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext;
@@ -974,24 +1097,6 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          properties->variableSampleLocations = true;
          break;
       }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES: {
-         VkPhysicalDeviceSamplerFilterMinmaxProperties *properties =
-            (VkPhysicalDeviceSamplerFilterMinmaxProperties *)ext;
-         properties->filterMinmaxImageComponentMapping = true;
-         properties->filterMinmaxSingleComponentFormats = true;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: {
-         VkPhysicalDeviceSubgroupProperties *properties =
-            (VkPhysicalDeviceSubgroupProperties *)ext;
-         properties->subgroupSize = 128;
-         properties->supportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
-         properties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
-                                           VK_SUBGROUP_FEATURE_VOTE_BIT |
-                                           VK_SUBGROUP_FEATURE_BALLOT_BIT;
-         properties->quadOperationsInAllStages = false;
-         break;
-      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: {
          VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props =
            (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext;
@@ -1003,72 +1108,12 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          props->maxCustomBorderColorSamplers = TU_BORDER_COLOR_COUNT;
          break;
       }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES: {
-         VkPhysicalDeviceDepthStencilResolveProperties *props =
-            (VkPhysicalDeviceDepthStencilResolveProperties *)ext;
-         props->independentResolve = false;
-         props->independentResolveNone = false;
-         props->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT;
-         props->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT;
-         break;
-      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
          VkPhysicalDevicePerformanceQueryPropertiesKHR *properties =
            (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
          properties->allowCommandBufferQueryCopies = false;
          break;
       }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT: {
-         VkPhysicalDeviceDescriptorIndexingPropertiesEXT *props =
-            (VkPhysicalDeviceDescriptorIndexingPropertiesEXT *)ext;
-         props->shaderUniformBufferArrayNonUniformIndexingNative = true;
-         props->shaderSampledImageArrayNonUniformIndexingNative = true;
-         props->shaderStorageBufferArrayNonUniformIndexingNative = true;
-         props->shaderStorageImageArrayNonUniformIndexingNative = true;
-         props->shaderInputAttachmentArrayNonUniformIndexingNative = false;
-         props->robustBufferAccessUpdateAfterBind = false;
-         props->quadDivergentImplicitLod = false;
-
-         props->maxUpdateAfterBindDescriptorsInAllPools = max_descriptor_set_size;
-         props->maxPerStageDescriptorUpdateAfterBindSamplers = max_descriptor_set_size;
-         props->maxPerStageDescriptorUpdateAfterBindUniformBuffers = max_descriptor_set_size;
-         props->maxPerStageDescriptorUpdateAfterBindStorageBuffers = max_descriptor_set_size;
-         props->maxPerStageDescriptorUpdateAfterBindSampledImages = max_descriptor_set_size;
-         props->maxPerStageDescriptorUpdateAfterBindStorageImages = max_descriptor_set_size;
-         props->maxPerStageDescriptorUpdateAfterBindInputAttachments = max_descriptor_set_size;
-         props->maxPerStageUpdateAfterBindResources = max_descriptor_set_size;
-         props->maxDescriptorSetUpdateAfterBindSamplers = max_descriptor_set_size;
-         props->maxDescriptorSetUpdateAfterBindUniformBuffers = max_descriptor_set_size;
-         props->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_UNIFORM_BUFFERS;
-         props->maxDescriptorSetUpdateAfterBindStorageBuffers = max_descriptor_set_size;
-         props->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS;
-         props->maxDescriptorSetUpdateAfterBindSampledImages = max_descriptor_set_size;
-         props->maxDescriptorSetUpdateAfterBindStorageImages = max_descriptor_set_size;
-         props->maxDescriptorSetUpdateAfterBindInputAttachments = max_descriptor_set_size;
-         break;
-      }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES: {
-         VkPhysicalDeviceFloatControlsProperties *properties =
-            (VkPhysicalDeviceFloatControlsProperties *) ext;
-         properties->denormBehaviorIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
-         properties->roundingModeIndependence = VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
-         properties->shaderSignedZeroInfNanPreserveFloat16 = true;
-         properties->shaderSignedZeroInfNanPreserveFloat32 = true;
-         properties->shaderSignedZeroInfNanPreserveFloat64 = false;
-         properties->shaderDenormPreserveFloat16 = false;
-         properties->shaderDenormPreserveFloat32 = false;
-         properties->shaderDenormPreserveFloat64 = false;
-         properties->shaderDenormFlushToZeroFloat16 = true;
-         properties->shaderDenormFlushToZeroFloat32 = true;
-         properties->shaderDenormFlushToZeroFloat64 = false;
-         properties->shaderRoundingModeRTEFloat16 = true;
-         properties->shaderRoundingModeRTEFloat32 = true;
-         properties->shaderRoundingModeRTEFloat64 = false;
-         properties->shaderRoundingModeRTZFloat16 = false;
-         properties->shaderRoundingModeRTZFloat32 = false;
-         properties->shaderRoundingModeRTZFloat64 = false;
-         break;
-      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT: {
          VkPhysicalDeviceRobustness2PropertiesEXT *props = (void *)ext;
          /* see write_buffer_descriptor() */
@@ -1077,12 +1122,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          props->robustUniformBufferAccessSizeAlignment = 16;
          break;
       }
-      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES: {
-         VkPhysicalDeviceTimelineSemaphorePropertiesKHR *props =
-            (VkPhysicalDeviceTimelineSemaphorePropertiesKHR *) ext;
-         props->maxTimelineSemaphoreValueDifference = UINT64_MAX;
-         break;
-      }
+
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: {
          VkPhysicalDeviceProvokingVertexPropertiesEXT *properties =
            (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext;
@@ -1090,6 +1130,13 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          properties->transformFeedbackPreservesTriangleFanProvokingVertex = false;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: {
+         VkPhysicalDeviceLineRasterizationPropertiesEXT *props =
+            (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext;
+         props->lineSubPixelPrecisionBits = 8;
+         break;
+      }
+
       default:
          break;
       }
@@ -1201,16 +1248,14 @@ tu_GetPhysicalDeviceMemoryProperties2(VkPhysicalDevice pdev,
 
 static VkResult
 tu_queue_init(struct tu_device *device,
               struct tu_queue *queue,
-              uint32_t queue_family_index,
               int idx,
-              VkDeviceQueueCreateFlags flags)
+              const VkDeviceQueueCreateInfo *create_info)
 {
-   vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE);
+   VkResult result = vk_queue_init(&queue->vk, &device->vk, create_info, idx);
+   if (result != VK_SUCCESS)
+      return result;
 
    queue->device = device;
-   queue->queue_family_index = queue_family_index;
-   queue->queue_idx = idx;
-   queue->flags = flags;
 
    list_inithead(&queue->queued_submits);
 
@@ -1227,12 +1272,173 @@ tu_queue_init(struct tu_device *device,
 static void
 tu_queue_finish(struct tu_queue *queue)
 {
-   vk_object_base_finish(&queue->base);
+   vk_queue_finish(&queue->vk);
 
    if (queue->fence >= 0)
       close(queue->fence);
 
    tu_drm_submitqueue_close(queue->device, queue->msm_queue_id);
 }
 
+uint64_t
+tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts)
+{
+   /* This is based on the 19.2MHz always-on rbbm timer.
+    *
+    * TODO we should probably query this value from the kernel..
+    */
+   return ts * (1000000000 / 19200000);
+}
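Note that 1000000000 / 19200000 truncates to 52 in integer arithmetic, while the exact tick period is 52.0833... ns, so the conversion above drifts short by roughly 0.16%. A hedged sketch of a drift-free variant, assuming a compiler with the __int128 extension (GCC/Clang):

   /* Sketch only: multiply first, divide once, with a 128-bit intermediate
    * so the result is exact for any 64-bit tick count. */
   static uint64_t ticks_to_ns_exact(uint64_t ts)
   {
      return (uint64_t)(((unsigned __int128)ts * 1000000000u) / 19200000u);
   }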
+
+static void*
+tu_trace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size)
+{
+   struct tu_device *device =
+      container_of(utctx, struct tu_device, trace_context);
+
+   struct tu_bo *bo = ralloc(NULL, struct tu_bo);
+   tu_bo_init_new(device, bo, size, false);
+
+   return bo;
+}
+
+static void
+tu_trace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
+{
+   struct tu_device *device =
+      container_of(utctx, struct tu_device, trace_context);
+   struct tu_bo *bo = timestamps;
+
+   tu_bo_finish(device, bo);
+   ralloc_free(bo);
+}
+
+static void
+tu_trace_record_ts(struct u_trace *ut, void *cs, void *timestamps,
+                   unsigned idx)
+{
+   struct tu_bo *bo = timestamps;
+   struct tu_cs *ts_cs = cs;
+
+   unsigned ts_offset = idx * sizeof(uint64_t);
+   tu_cs_emit_pkt7(ts_cs, CP_EVENT_WRITE, 4);
+   tu_cs_emit(ts_cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
+   tu_cs_emit_qw(ts_cs, bo->iova + ts_offset);
+   tu_cs_emit(ts_cs, 0x00000000);
+}
+
+static uint64_t
+tu_trace_read_ts(struct u_trace_context *utctx,
+                 void *timestamps, unsigned idx, void *flush_data)
+{
+   struct tu_device *device =
+      container_of(utctx, struct tu_device, trace_context);
+   struct tu_bo *bo = timestamps;
+   struct tu_u_trace_flush_data *trace_flush_data = flush_data;
+
+   /* Only need to stall on results for the first entry: */
+   if (idx == 0) {
+      tu_device_wait_u_trace(device, trace_flush_data->syncobj);
+   }
+
+   if (tu_bo_map(device, bo) != VK_SUCCESS) {
+      return U_TRACE_NO_TIMESTAMP;
+   }
+
+   uint64_t *ts = bo->map;
+
+   /* Don't translate the no-timestamp marker: */
+   if (ts[idx] == U_TRACE_NO_TIMESTAMP)
+      return U_TRACE_NO_TIMESTAMP;
+
+   return tu_device_ticks_to_ns(device, ts[idx]);
+}
+
+static void
+tu_trace_delete_flush_data(struct u_trace_context *utctx, void *flush_data)
+{
+   struct tu_device *device =
+      container_of(utctx, struct tu_device, trace_context);
+   struct tu_u_trace_flush_data *trace_flush_data = flush_data;
+
+   tu_u_trace_cmd_data_finish(device, trace_flush_data->cmd_trace_data,
+                              trace_flush_data->trace_count);
+   vk_free(&device->vk.alloc, trace_flush_data->syncobj);
+   vk_free(&device->vk.alloc, trace_flush_data);
+}
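These five callbacks are registered together; for orientation, this is the u_trace_context_init() call that a later hunk of this patch adds to tu_CreateDevice(), annotated:

   u_trace_context_init(&device->trace_context, device,
                        tu_trace_create_ts_buffer,   /* allocate a timestamp BO */
                        tu_trace_destroy_ts_buffer,  /* release a timestamp BO */
                        tu_trace_record_ts,          /* emit CP_EVENT_WRITE into the CS */
                        tu_trace_read_ts,            /* stall, map, convert ticks to ns */
                        tu_trace_delete_flush_data); /* free per-submit trace data */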
+
+void
+tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
+                         void *ts_from, uint32_t from_offset,
+                         void *ts_to, uint32_t to_offset,
+                         uint32_t count)
+{
+   struct tu_cs *cs = cmdstream;
+   struct tu_bo *bo_from = ts_from;
+   struct tu_bo *bo_to = ts_to;
+
+   tu_cs_emit_pkt7(cs, CP_MEMCPY, 5);
+   tu_cs_emit(cs, count * sizeof(uint64_t) / sizeof(uint32_t));
+   tu_cs_emit_qw(cs, bo_from->iova + from_offset * sizeof(uint64_t));
+   tu_cs_emit_qw(cs, bo_to->iova + to_offset * sizeof(uint64_t));
+}
+
+VkResult
+tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
+                            struct u_trace **trace_copy)
+{
+   *cs = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct tu_cs), 8,
+                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   if (*cs == NULL) {
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   tu_cs_init(*cs, cmdbuf->device, TU_CS_MODE_GROW,
+              list_length(&cmdbuf->trace.trace_chunks) * 6 + 3);
+
+   tu_cs_begin(*cs);
+
+   tu_cs_emit_wfi(*cs);
+   tu_cs_emit_pkt7(*cs, CP_WAIT_FOR_ME, 0);
+
+   *trace_copy = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct u_trace), 8,
+                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+
+   if (*trace_copy == NULL) {
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   u_trace_init(*trace_copy, cmdbuf->trace.utctx);
+   u_trace_clone_append(u_trace_begin_iterator(&cmdbuf->trace),
+                        u_trace_end_iterator(&cmdbuf->trace),
+                        *trace_copy, *cs,
+                        tu_copy_timestamp_buffer);
+
+   tu_cs_emit_wfi(*cs);
+
+   tu_cs_end(*cs);
+
+   return VK_SUCCESS;
+}
+
+void
+tu_u_trace_cmd_data_finish(struct tu_device *device,
+                           struct tu_u_trace_cmd_data *trace_data,
+                           uint32_t entry_count)
+{
+   for (uint32_t i = 0; i < entry_count; ++i) {
+      /* Free the trace only if we had to create a copy of it */
+      if (trace_data[i].timestamp_copy_cs != NULL) {
+         tu_cs_finish(trace_data[i].timestamp_copy_cs);
+         vk_free(&device->vk.alloc, trace_data[i].timestamp_copy_cs);
+
+         u_trace_fini(trace_data[i].trace);
+         vk_free(&device->vk.alloc, trace_data[i].trace);
+      }
+   }
+
+   vk_free(&device->vk.alloc, trace_data);
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_CreateDevice(VkPhysicalDevice physicalDevice,
                 const VkDeviceCreateInfo *pCreateInfo,
@@ -1246,24 +1452,6 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    bool perf_query_pools = false;
    bool robust_buffer_access2 = false;
 
-   /* Check enabled features */
-   if (pCreateInfo->pEnabledFeatures) {
-      VkPhysicalDeviceFeatures2 supported_features = {
-         .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
-      };
-      tu_GetPhysicalDeviceFeatures2(physicalDevice, &supported_features);
-      VkBool32 *supported_feature = (VkBool32 *) &supported_features.features;
-      VkBool32 *enabled_feature = (VkBool32 *) pCreateInfo->pEnabledFeatures;
-      unsigned num_features =
-         sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
-      for (uint32_t i = 0; i < num_features; i++) {
-         if (enabled_feature[i] && !supported_feature[i])
-            return vk_startup_errorf(physical_device->instance,
-                                     VK_ERROR_FEATURE_NOT_PRESENT,
-                                     "Missing feature bit %d\n", i);
-      }
-   }
-
    vk_foreach_struct_const(ext, pCreateInfo->pNext) {
       switch (ext->sType) {
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: {
@@ -1295,6 +1483,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    struct vk_device_dispatch_table dispatch_table;
    vk_device_dispatch_table_from_entrypoints(
       &dispatch_table, &tu_device_entrypoints, true);
+   vk_device_dispatch_table_from_entrypoints(
+      &dispatch_table, &wsi_device_entrypoints, false);
 
    result = vk_device_init(&device->vk, &physical_device->vk,
                            &dispatch_table, pCreateInfo, pAllocator);
@@ -1332,8 +1522,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
       device->queue_count[qfi] = queue_create->queueCount;
 
       for (unsigned q = 0; q < queue_create->queueCount; q++) {
-         result = tu_queue_init(device, &device->queues[qfi][q], qfi, q,
-                                queue_create->flags);
+         result = tu_queue_init(device, &device->queues[qfi][q], q,
+                                queue_create);
          if (result != VK_SUCCESS)
            goto fail_queues;
       }
@@ -1473,6 +1663,14 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
 
    mtx_init(&device->mutex, mtx_plain);
 
+   device->submit_count = 0;
+   u_trace_context_init(&device->trace_context, device,
+                        tu_trace_create_ts_buffer,
+                        tu_trace_destroy_ts_buffer,
+                        tu_trace_record_ts,
+                        tu_trace_read_ts,
+                        tu_trace_delete_flush_data);
+
    *pDevice = tu_device_to_handle(device);
    return VK_SUCCESS;
 
@@ -1488,7 +1686,8 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    tu_destroy_clear_blit_shaders(device);
 fail_global_bo_map:
    tu_bo_finish(device, &device->global_bo);
-
+   vk_free(&device->vk.alloc, device->bo_idx);
+   vk_free(&device->vk.alloc, device->bo_list);
 fail_global_bo:
    ir3_compiler_destroy(device->compiler);
 
@@ -1513,6 +1712,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    if (!device)
       return;
 
+   u_trace_context_fini(&device->trace_context);
+
    for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
       for (unsigned q = 0; q < device->queue_count[i]; q++)
          tu_queue_finish(&device->queues[i][q]);
@@ -1620,32 +1821,6 @@ tu_EnumerateInstanceLayerProperties(uint32_t *pPropertyCount,
    return VK_SUCCESS;
 }
 
-VKAPI_ATTR void VKAPI_CALL
-tu_GetDeviceQueue2(VkDevice _device,
-                   const VkDeviceQueueInfo2 *pQueueInfo,
-                   VkQueue *pQueue)
-{
-   TU_FROM_HANDLE(tu_device, device, _device);
-   struct tu_queue *queue;
-
-   queue =
-      &device->queues[pQueueInfo->queueFamilyIndex][pQueueInfo->queueIndex];
-   if (pQueueInfo->flags != queue->flags) {
-      /* From the Vulkan 1.1.70 spec:
-       *
-       * "The queue returned by vkGetDeviceQueue2 must have the same
-       * flags value from this structure as that used at device
-       * creation time in a VkDeviceQueueCreateInfo instance. If no
-       * matching flags were specified at device creation time then
-       * pQueue will return VK_NULL_HANDLE."
-       */
-      *pQueue = VK_NULL_HANDLE;
-      return;
-   }
-
-   *pQueue = tu_queue_to_handle(queue);
-}
-
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_QueueWaitIdle(VkQueue _queue)
 {
@@ -1685,22 +1860,6 @@ tu_QueueWaitIdle(VkQueue _queue)
    return VK_SUCCESS;
 }
 
-VKAPI_ATTR VkResult VKAPI_CALL
-tu_DeviceWaitIdle(VkDevice _device)
-{
-   TU_FROM_HANDLE(tu_device, device, _device);
-
-   if (tu_device_is_lost(device))
-      return VK_ERROR_DEVICE_LOST;
-
-   for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
-      for (unsigned q = 0; q < device->queue_count[i]; q++) {
-         tu_QueueWaitIdle(tu_queue_to_handle(&device->queues[i][q]));
-      }
-   }
-   return VK_SUCCESS;
-}
-
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_EnumerateInstanceExtensionProperties(const char *pLayerName,
                                         uint32_t *pPropertyCount,
@@ -1757,12 +1916,12 @@ tu_AllocateMemory(VkDevice _device,
    struct tu_memory_heap *mem_heap = &device->physical_device->heap;
    uint64_t mem_heap_used = p_atomic_read(&mem_heap->used);
    if (mem_heap_used > mem_heap->size)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
    mem = vk_object_alloc(&device->vk, pAllocator, sizeof(*mem),
                          VK_OBJECT_TYPE_DEVICE_MEMORY);
    if (mem == NULL)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    const VkImportMemoryFdInfoKHR *fd_info =
       vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR);
@@ -1798,7 +1957,7 @@ tu_AllocateMemory(VkDevice _device,
       if (mem_heap_used > mem_heap->size) {
         p_atomic_add(&mem_heap->used, -mem->bo.size);
         tu_bo_finish(device, &mem->bo);
-         result = vk_errorf(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+         result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                             "Out of heap memory");
       }
    }
@@ -2014,7 +2173,7 @@ tu_CreateEvent(VkDevice _device,
       vk_object_alloc(&device->vk, pAllocator, sizeof(*event),
                       VK_OBJECT_TYPE_EVENT);
    if (!event)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    VkResult result = tu_bo_init_new(device, &event->bo, 0x1000,
                                     TU_BO_ALLOC_NO_FLAGS);
@@ -2033,7 +2192,7 @@
    tu_bo_finish(device, &event->bo);
 fail_alloc:
    vk_object_free(&device->vk, pAllocator, event);
-   return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+   return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -2093,7 +2252,7 @@ tu_CreateBuffer(VkDevice _device,
    buffer = vk_object_alloc(&device->vk, pAllocator, sizeof(*buffer),
                             VK_OBJECT_TYPE_BUFFER);
    if (buffer == NULL)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    buffer->size = pCreateInfo->size;
    buffer->usage = pCreateInfo->usage;
@@ -2130,21 +2289,27 @@ tu_CreateFramebuffer(VkDevice _device,
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO);
 
-   size_t size = sizeof(*framebuffer) + sizeof(struct tu_attachment_info) *
-                                           pCreateInfo->attachmentCount;
+   bool imageless = pCreateInfo->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT;
+
+   size_t size = sizeof(*framebuffer);
+   if (!imageless)
+      size += sizeof(struct tu_attachment_info) * pCreateInfo->attachmentCount;
+
    framebuffer = vk_object_alloc(&device->vk, pAllocator, size,
                                  VK_OBJECT_TYPE_FRAMEBUFFER);
    if (framebuffer == NULL)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    framebuffer->attachment_count = pCreateInfo->attachmentCount;
    framebuffer->width = pCreateInfo->width;
    framebuffer->height = pCreateInfo->height;
    framebuffer->layers = pCreateInfo->layers;
-   for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
-      VkImageView _iview = pCreateInfo->pAttachments[i];
-      struct tu_image_view *iview = tu_image_view_from_handle(_iview);
-      framebuffer->attachments[i].attachment = iview;
+
+   if (!imageless) {
+      for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+         VkImageView _iview = pCreateInfo->pAttachments[i];
+         struct tu_image_view *iview = tu_image_view_from_handle(_iview);
+         framebuffer->attachments[i].attachment = iview;
+      }
    }
 
    tu_framebuffer_tiling_config(framebuffer, device, pass);
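With VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT the attachment views only arrive at render-pass begin time, which is why no tu_attachment_info array is allocated above. A hedged sketch of the application side (core Vulkan 1.2 API, not from this patch; render_pass and color_format are placeholders):

   VkFramebufferAttachmentImageInfo image_info = {
      .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENT_IMAGE_INFO,
      .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
      .width = 1920, .height = 1080, .layerCount = 1,
      .viewFormatCount = 1, .pViewFormats = &color_format,
   };
   VkFramebufferAttachmentsCreateInfo attachments_info = {
      .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENTS_CREATE_INFO,
      .attachmentImageInfoCount = 1,
      .pAttachmentImageInfos = &image_info,
   };
   VkFramebufferCreateInfo fb_info = {
      .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
      .pNext = &attachments_info,
      .flags = VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT,
      .renderPass = render_pass,
      .attachmentCount = 1, /* pAttachments stays NULL */
      .width = 1920, .height = 1080, .layers = 1,
   };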
@@ -2252,7 +2417,7 @@ tu_CreateSampler(VkDevice _device,
    sampler = vk_object_alloc(&device->vk, pAllocator, sizeof(*sampler),
                              VK_OBJECT_TYPE_SAMPLER);
    if (!sampler)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    tu_init_sampler(device, sampler, pCreateInfo);
    *pSampler = tu_sampler_to_handle(sampler);
@@ -2348,7 +2513,7 @@ tu_GetMemoryFdKHR(VkDevice _device,
 
    int prime_fd = tu_bo_export_dmabuf(device, &memory->bo);
    if (prime_fd < 0)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
    *pFd = prime_fd;
    return VK_SUCCESS;
diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_drm.c b/mesa 3D driver/src/freedreno/vulkan/tu_drm.c
index 5ddf290eea..ae8c037beb 100644
--- a/mesa 3D driver/src/freedreno/vulkan/tu_drm.c
+++ b/mesa 3D driver/src/freedreno/vulkan/tu_drm.c
@@ -34,9 +34,12 @@
 #include "drm-uapi/msm_drm.h"
 #include "util/timespec.h"
 #include "util/os_time.h"
+#include "util/perf/u_trace.h"
 
 #include "tu_private.h"
 
+#include "tu_cs.h"
+
 struct tu_binary_syncobj {
    uint32_t permanent, temporary;
 };
@@ -84,6 +87,7 @@ struct tu_queue_submit
    struct list_head link;
 
    VkCommandBuffer *cmd_buffers;
+   struct tu_u_trace_cmd_data *cmd_buffer_trace_data;
    uint32_t cmd_buffer_count;
 
    struct tu_syncobj **wait_semaphores;
@@ -112,6 +116,12 @@ struct tu_queue_submit
    uint32_t counter_pass_index;
 };
 
+struct tu_u_trace_syncobj
+{
+   uint32_t msm_queue_id;
+   uint32_t fence;
+};
+
 static int
 tu_drm_get_param(const struct tu_physical_device *dev,
                  uint32_t param,
@@ -165,6 +175,12 @@ tu_drm_get_gmem_base(const struct tu_physical_device *dev, uint64_t *base)
    return tu_drm_get_param(dev, MSM_PARAM_GMEM_BASE, base);
 }
 
+int
+tu_drm_get_timestamp(struct tu_physical_device *device, uint64_t *ts)
+{
+   return tu_drm_get_param(device, MSM_PARAM_TIMESTAMP, ts);
+}
+
 int
 tu_drm_submitqueue_new(const struct tu_device *dev,
                        int priority,
@@ -246,10 +262,8 @@ tu_bo_init(struct tu_device *dev,
       struct drm_msm_gem_submit_bo *new_ptr = vk_realloc(&dev->vk.alloc,
            dev->bo_list, new_len * sizeof(*dev->bo_list), 8,
            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-      if (!new_ptr) {
-         tu_gem_close(dev, gem_handle);
-         return VK_ERROR_OUT_OF_HOST_MEMORY;
-      }
+      if (!new_ptr)
+         goto fail_bo_list;
 
       dev->bo_list = new_ptr;
       dev->bo_list_size = new_len;
@@ -261,10 +275,8 @@ tu_bo_init(struct tu_device *dev,
       uint32_t *new_ptr = vk_realloc(&dev->vk.alloc,
            dev->bo_idx, new_len * sizeof(*dev->bo_idx), 8,
            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-      if (!new_ptr) {
-         tu_gem_close(dev, gem_handle);
-         return VK_ERROR_OUT_OF_HOST_MEMORY;
-      }
+      if (!new_ptr)
+         goto fail_bo_idx;
 
      dev->bo_idx = new_ptr;
      dev->bo_idx_size = new_len;
@@ -280,6 +292,12 @@ tu_bo_init(struct tu_device *dev,
    mtx_unlock(&dev->bo_mutex);
 
    return VK_SUCCESS;
+
+fail_bo_idx:
+   vk_free(&dev->vk.alloc, dev->bo_list);
+fail_bo_list:
+   tu_gem_close(dev, gem_handle);
+   return VK_ERROR_OUT_OF_HOST_MEMORY;
 }
 
 VkResult
@@ -300,7 +318,7 @@ tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size,
 
    int ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW, &req, sizeof(req));
    if (ret)
-      return vk_error(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
    return tu_bo_init(dev, bo, req.handle, size, flags & TU_BO_ALLOC_ALLOW_DUMP);
 }
@@ -315,13 +333,13 @@ tu_bo_init_dmabuf(struct tu_device *dev,
    off_t real_size = lseek(prime_fd, 0, SEEK_END);
    lseek(prime_fd, 0, SEEK_SET);
    if (real_size < 0 || (uint64_t) real_size < size)
-      return vk_error(dev->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
+      return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
 
    uint32_t gem_handle;
    int ret = drmPrimeFDToHandle(dev->fd, prime_fd, &gem_handle);
    if (ret)
-      return vk_error(dev->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE);
+      return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
 
    return tu_bo_init(dev, bo, gem_handle, size, false);
 }
@@ -344,13 +362,13 @@ tu_bo_map(struct tu_device *dev, struct tu_bo *bo)
 
    uint64_t offset = tu_gem_info(dev, bo->gem_handle, MSM_INFO_GET_OFFSET);
    if (!offset)
-      return vk_error(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
    /* TODO: Should we use the wrapper os_mmap() like Freedreno does? */
    void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE,
                     MAP_SHARED, dev->fd, offset);
    if (map == MAP_FAILED)
-      return vk_error(dev->instance, VK_ERROR_MEMORY_MAP_FAILED);
+      return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED);
 
    bo->map = map;
    return VK_SUCCESS;
@@ -561,7 +579,7 @@ sync_create(VkDevice _device,
       vk_object_alloc(&device->vk, pAllocator, sizeof(*sync),
                       fence ? VK_OBJECT_TYPE_FENCE : VK_OBJECT_TYPE_SEMAPHORE);
    if (!sync)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    if (binary) {
       struct drm_syncobj_create create = {};
@@ -640,7 +658,16 @@ sync_import(VkDevice _device, struct tu_syncobj *sync, bool temporary, bool sync
       *dst = handle.handle;
       close(fd);
    } else {
-      assert(temporary);
+      /* Note: SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT is always temporary, but the
+       * user doesn't have to specify the temporary bit because that's only
+       * needed for choosing a permanence when there's an option.
+ * + * "VK_SEMAPHORE_IMPORT_TEMPORARY_BIT specifies that the semaphore payload + * will be imported only temporarily, as described in Importing Semaphore + * Payloads, regardless of the permanence of handleType" + * + * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2701 + */ struct drm_syncobj_create create = {}; @@ -683,7 +710,7 @@ sync_export(VkDevice _device, struct tu_syncobj *sync, bool sync_fd, int *p_fd) }; int ret = drmIoctl(device->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &handle); if (ret) - return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE); + return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE); /* restore permanent payload on export */ sync_set_temporary(device, sync, 0); @@ -874,7 +901,7 @@ tu_queue_submit_create_locked(struct tu_queue *queue, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (new_submit->cmd_buffers == NULL) { - result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY) + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail_cmd_buffers; } @@ -885,7 +912,7 @@ tu_queue_submit_create_locked(struct tu_queue *queue, submit_info->waitSemaphoreCount * sizeof(*new_submit->wait_semaphores), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (new_submit->wait_semaphores == NULL) { - result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY) + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail_wait_semaphores; } new_submit->wait_semaphore_count = submit_info->waitSemaphoreCount; @@ -894,7 +921,7 @@ tu_queue_submit_create_locked(struct tu_queue *queue, submit_info->signalSemaphoreCount *sizeof(*new_submit->signal_semaphores), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (new_submit->signal_semaphores == NULL) { - result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY) + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail_signal_semaphores; } new_submit->signal_semaphore_count = submit_info->signalSemaphoreCount; @@ -923,6 +950,9 @@ tu_queue_submit_create_locked(struct tu_queue *queue, } } + bool u_trace_enabled = u_trace_context_tracing(&queue->device->trace_context); + bool has_trace_points = false; + uint32_t entry_count = 0; for (uint32_t j = 0; j < new_submit->cmd_buffer_count; ++j) { TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, new_submit->cmd_buffers[j]); @@ -931,6 +961,13 @@ tu_queue_submit_create_locked(struct tu_queue *queue, entry_count++; entry_count += cmdbuf->cs.entry_count; + + if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) { + if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) + entry_count++; + + has_trace_points = true; + } } new_submit->cmds = vk_zalloc(&queue->device->vk.alloc, @@ -938,17 +975,50 @@ tu_queue_submit_create_locked(struct tu_queue *queue, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (new_submit->cmds == NULL) { - result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY) + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); goto fail_cmds; } + if (has_trace_points) { + new_submit->cmd_buffer_trace_data = vk_zalloc(&queue->device->vk.alloc, + new_submit->cmd_buffer_count * sizeof(struct tu_u_trace_cmd_data), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (new_submit->cmd_buffer_trace_data == NULL) { + result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_cmd_trace_data; + } + + for (uint32_t i = 0; i < new_submit->cmd_buffer_count; ++i) { + TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, new_submit->cmd_buffers[i]); + + if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) && + 
+             u_trace_has_points(&cmdbuf->trace)) {
+            /* A single command buffer could be submitted several times, but we
+             * already baked timestamp iova addresses and trace points are
+             * single-use. Therefore we have to copy trace points and create
+             * a new timestamp buffer on every submit of a reusable command buffer.
+             */
+            if (tu_create_copy_timestamp_cs(cmdbuf,
+                   &new_submit->cmd_buffer_trace_data[i].timestamp_copy_cs,
+                   &new_submit->cmd_buffer_trace_data[i].trace) != VK_SUCCESS) {
+               result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
+               goto fail_copy_timestamp_cs;
+            }
+            assert(new_submit->cmd_buffer_trace_data[i].timestamp_copy_cs->entry_count == 1);
+         } else {
+            new_submit->cmd_buffer_trace_data[i].trace = &cmdbuf->trace;
+         }
+      }
+   }
+
    /* Allocate without wait timeline semaphores */
    new_submit->in_syncobjs = vk_zalloc(&queue->device->vk.alloc,
                                        (nr_in_syncobjs - new_submit->wait_timeline_count) *
                                        sizeof(*new_submit->in_syncobjs), 8,
                                        VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 
    if (new_submit->in_syncobjs == NULL) {
-      result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
       goto fail_in_syncobjs;
    }
 
@@ -958,7 +1028,7 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
                                         VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 
    if (new_submit->out_syncobjs == NULL) {
-      result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail_out_syncobjs;
    }
 
@@ -977,6 +1047,12 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
 fail_out_syncobjs:
    vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs);
 fail_in_syncobjs:
+   if (new_submit->cmd_buffer_trace_data)
+      tu_u_trace_cmd_data_finish(queue->device, new_submit->cmd_buffer_trace_data,
+                                 new_submit->cmd_buffer_count);
+fail_copy_timestamp_cs:
+   vk_free(&queue->device->vk.alloc, new_submit->cmd_buffer_trace_data);
+fail_cmd_trace_data:
    vk_free(&queue->device->vk.alloc, new_submit->cmds);
 fail_cmds:
 fail_signal_timelines:
@@ -1044,12 +1120,35 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
          cmds[entry_idx].nr_relocs = 0;
          cmds[entry_idx].relocs = 0;
       }
+
+      if (submit->cmd_buffer_trace_data) {
+         struct tu_cs *ts_cs = submit->cmd_buffer_trace_data[j].timestamp_copy_cs;
+         if (ts_cs) {
+            cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
+            cmds[entry_idx].submit_idx =
+               queue->device->bo_idx[ts_cs->entries[0].bo->gem_handle];
+
+            assert(cmds[entry_idx].submit_idx < queue->device->bo_count);
+
+            cmds[entry_idx].submit_offset = ts_cs->entries[0].offset;
+            cmds[entry_idx].size = ts_cs->entries[0].size;
+            cmds[entry_idx].pad = 0;
+            cmds[entry_idx].nr_relocs = 0;
+            cmds[entry_idx++].relocs = 0;
+         }
+      }
    }
 }
 
 static VkResult
 tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
 {
+   queue->device->submit_count++;
+
+#if HAVE_PERFETTO
+   tu_perfetto_submit(queue->device, queue->device->submit_count);
+#endif
+
    uint32_t flags = MSM_PIPE_3D0;
 
    if (submit->nr_in_syncobjs)
@@ -1116,6 +1215,27 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
       sem->timeline.highest_submitted = signal_value;
    }
 
+   if (submit->cmd_buffer_trace_data) {
+      struct tu_u_trace_flush_data *flush_data =
+         vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_flush_data),
+                  8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      flush_data->submission_id = queue->device->submit_count;
+      flush_data->syncobj =
+         vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
+                  8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      flush_data->syncobj->fence = req.fence;
+      flush_data->syncobj->msm_queue_id = queue->msm_queue_id;
+
+      flush_data->cmd_trace_data = submit->cmd_buffer_trace_data;
+      flush_data->trace_count = submit->cmd_buffer_count;
+      submit->cmd_buffer_trace_data = NULL;
+
+      for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
+         bool free_data = i == (submit->cmd_buffer_count - 1);
+         u_trace_flush(flush_data->cmd_trace_data[i].trace, flush_data, free_data);
+      }
+   }
+
    pthread_cond_broadcast(&queue->device->timeline_cond);
 
    return VK_SUCCESS;
@@ -1147,14 +1267,14 @@ tu_timeline_add_point_locked(struct tu_device *device,
                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 
    if (!(*point))
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    struct drm_syncobj_create create = {};
 
    int ret = drmIoctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create);
    if (ret) {
       vk_free(&device->vk.alloc, *point);
-      return vk_error(device->instance, VK_ERROR_DEVICE_LOST);
+      return vk_error(device, VK_ERROR_DEVICE_LOST);
    }
 
    (*point)->syncobj = create.handle;
@@ -1244,6 +1364,35 @@ tu_device_submit_deferred_locked(struct tu_device *dev)
 
    return result;
 }
 
+static inline void
+get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
+{
+   struct timespec t;
+   clock_gettime(CLOCK_MONOTONIC, &t);
+   tv->tv_sec = t.tv_sec + ns / 1000000000;
+   tv->tv_nsec = t.tv_nsec + ns % 1000000000;
+}
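get_abs_timeout() can leave tv_nsec as large as roughly 2e9, since the nanosecond remainder is added without carrying into tv_sec. Whether the msm ioctl tolerates the unnormalized form is an assumption here; if a consumer of drm_msm_timespec required tv_nsec < 1e9, a carry-folding sketch would be:

   /* Sketch only: same computation with the nanosecond carry folded into
    * tv_sec, so tv_nsec always stays below 1000000000. */
   static inline void
   get_abs_timeout_norm(struct drm_msm_timespec *tv, uint64_t ns)
   {
      struct timespec t;
      clock_gettime(CLOCK_MONOTONIC, &t);
      uint64_t nsec = (uint64_t)t.tv_nsec + ns % 1000000000;
      tv->tv_sec = t.tv_sec + ns / 1000000000 + nsec / 1000000000;
      tv->tv_nsec = nsec % 1000000000;
   }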
+
+VkResult
+tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
+{
+   struct drm_msm_wait_fence req = {
+      .fence = syncobj->fence,
+      .queueid = syncobj->msm_queue_id,
+   };
+   int ret;
+
+   get_abs_timeout(&req.timeout, 1000000000);
+
+   ret = drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &req, sizeof(req));
+   if (ret && (ret != -ETIMEDOUT)) {
+      fprintf(stderr, "wait-fence failed! %d (%s)", ret, strerror(errno));
+      return VK_TIMEOUT;
+   }
+
+   return VK_SUCCESS;
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_QueueSubmit(VkQueue _queue,
                uint32_t submitCount,
diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_formats.c b/mesa 3D driver/src/freedreno/vulkan/tu_formats.c
index abe8929c71..a84e0616b9 100644
--- a/mesa 3D driver/src/freedreno/vulkan/tu_formats.c
+++ b/mesa 3D driver/src/freedreno/vulkan/tu_formats.c
@@ -27,361 +27,162 @@
 
 #include "adreno_common.xml.h"
 #include "a6xx.xml.h"
+#include "fdl/fd6_format_table.h"
 
 #include "vk_format.h"
 #include "vk_util.h"
 
 #include "drm-uapi/drm_fourcc.h"
 
-#define TU6_FMT(vkfmt, hwfmt, swapfmt, valid) \
-   [VK_FORMAT_##vkfmt] = { \
-      .fmt = FMT6_##hwfmt, \
-      .swap = swapfmt, \
-      .supported = valid, \
+struct tu_native_format
+tu6_format_vtx(VkFormat vk_format)
+{
+   enum pipe_format format = vk_format_to_pipe_format(vk_format);
+   struct tu_native_format fmt = {
+      .fmt = fd6_vertex_format(format),
+      .swap = fd6_vertex_swap(format),
+   };
+   assert(fmt.fmt != FMT6_NONE);
+   return fmt;
+}
+
+bool
+tu6_format_vtx_supported(VkFormat vk_format)
+{
+   enum pipe_format format = vk_format_to_pipe_format(vk_format);
+   return fd6_vertex_format(format) != FMT6_NONE;
+}
+
+/* Map non-colorspace-converted YUV formats to RGB pipe formats where we can,
+ * since our hardware doesn't support colorspace conversion.
+ *
+ * Really, we should probably be returning the RGB formats in
+ * vk_format_to_pipe_format, but we don't have all the equivalent pipe formats
+ * for VK RGB formats yet, and we'd have to switch all consumers of that
+ * function at once.
+ */
+enum pipe_format
+tu_vk_format_to_pipe_format(VkFormat vk_format)
+{
+   switch (vk_format) {
+   case VK_FORMAT_G8B8G8R8_422_UNORM: /* YUYV */
+      return PIPE_FORMAT_R8G8_R8B8_UNORM;
+   case VK_FORMAT_B8G8R8G8_422_UNORM: /* UYVY */
+      return PIPE_FORMAT_G8R8_B8R8_UNORM;
+   case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
+      return PIPE_FORMAT_R8_G8B8_420_UNORM;
+   case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
+      return PIPE_FORMAT_R8_G8_B8_420_UNORM;
+   default:
+      return vk_format_to_pipe_format(vk_format);
    }
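A quick sanity check of the mapping above, with expected values taken directly from the switch (suitable for a unit test; the second assert exercises the default fall-through):

   assert(tu_vk_format_to_pipe_format(VK_FORMAT_G8B8G8R8_422_UNORM) ==
          PIPE_FORMAT_R8G8_R8B8_UNORM);
   assert(tu_vk_format_to_pipe_format(VK_FORMAT_B8G8R8A8_UNORM) ==
          vk_format_to_pipe_format(VK_FORMAT_B8G8R8A8_UNORM));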
-
-#define TU6_VTC(vk, fmt, swap) TU6_FMT(vk, fmt, swap, FMT_VERTEX | FMT_TEXTURE | FMT_COLOR)
-#define TU6_xTC(vk, fmt, swap) TU6_FMT(vk, fmt, swap, FMT_TEXTURE | FMT_COLOR)
-#define TU6_Vxx(vk, fmt, swap) TU6_FMT(vk, fmt, swap, FMT_VERTEX)
-#define TU6_xTx(vk, fmt, swap) TU6_FMT(vk, fmt, swap, FMT_TEXTURE)
-#define TU6_xxx(vk, fmt, swap) TU6_FMT(vk, NONE, WZYX, 0)
-
-static const struct tu_native_format tu6_format_table[] = {
-   TU6_xxx(UNDEFINED, x, x), /* 0 */
-
-   /* 8-bit packed */
-   TU6_xxx(R4G4_UNORM_PACK8, 4_4_UNORM, WZXY), /* 1 */
-
-   /* 16-bit packed */
-   TU6_xTC(R4G4B4A4_UNORM_PACK16, 4_4_4_4_UNORM, XYZW), /* 2 */
-   TU6_xTC(B4G4R4A4_UNORM_PACK16, 4_4_4_4_UNORM, ZYXW), /* 3 */
-   TU6_xTC(R5G6B5_UNORM_PACK16, 5_6_5_UNORM, WXYZ), /* 4 */
-   TU6_xTC(B5G6R5_UNORM_PACK16, 5_6_5_UNORM, WZYX), /* 5 */
-   TU6_xTC(R5G5B5A1_UNORM_PACK16, 5_5_5_1_UNORM, XYZW), /* 6 */
-   TU6_xTC(B5G5R5A1_UNORM_PACK16, 5_5_5_1_UNORM, ZYXW), /* 7 */
-   TU6_xTC(A1R5G5B5_UNORM_PACK16, 5_5_5_1_UNORM, WXYZ), /* 8 */
-
-   /* 8-bit R */
-   TU6_VTC(R8_UNORM, 8_UNORM, WZYX), /* 9 */
-   TU6_VTC(R8_SNORM, 8_SNORM, WZYX), /* 10 */
-   TU6_Vxx(R8_USCALED, 8_UINT, WZYX), /* 11 */
-   TU6_Vxx(R8_SSCALED, 8_SINT, WZYX), /* 12 */
-   TU6_VTC(R8_UINT, 8_UINT, WZYX), /* 13 */
-   TU6_VTC(R8_SINT, 8_SINT, WZYX), /* 14 */
-   TU6_xTC(R8_SRGB, 8_UNORM, WZYX), /* 15 */
-
-   /* 16-bit RG */
-   TU6_VTC(R8G8_UNORM, 8_8_UNORM, WZYX), /* 16 */
-   TU6_VTC(R8G8_SNORM, 8_8_SNORM, WZYX), /* 17 */
-   TU6_Vxx(R8G8_USCALED, 8_8_UINT, WZYX), /* 18 */
-   TU6_Vxx(R8G8_SSCALED, 8_8_SINT, WZYX), /* 19 */
-   TU6_VTC(R8G8_UINT, 8_8_UINT, WZYX), /* 20 */
-   TU6_VTC(R8G8_SINT, 8_8_SINT, WZYX), /* 21 */
-   TU6_xTC(R8G8_SRGB, 8_8_UNORM, WZYX), /* 22 */
-
-   /* 24-bit RGB */
-   TU6_Vxx(R8G8B8_UNORM, 8_8_8_UNORM, WZYX), /* 23 */
-   TU6_Vxx(R8G8B8_SNORM, 8_8_8_SNORM, WZYX), /* 24 */
-   TU6_Vxx(R8G8B8_USCALED, 8_8_8_UINT, WZYX), /* 25 */
-   TU6_Vxx(R8G8B8_SSCALED, 8_8_8_SINT, WZYX), /* 26 */
-   TU6_Vxx(R8G8B8_UINT, 8_8_8_UINT, WZYX), /* 27 */
-   TU6_Vxx(R8G8B8_SINT, 8_8_8_SINT, WZYX), /* 28 */
-   TU6_xxx(R8G8B8_SRGB, 8_8_8_UNORM, WZYX), /* 29 */
-
-   /* 24-bit BGR */
-   TU6_xxx(B8G8R8_UNORM, 8_8_8_UNORM, WXYZ), /* 30 */
-   TU6_xxx(B8G8R8_SNORM, 8_8_8_SNORM, WXYZ), /* 31 */
-   TU6_xxx(B8G8R8_USCALED, 8_8_8_UINT, WXYZ), /* 32 */
-   TU6_xxx(B8G8R8_SSCALED, 8_8_8_SINT, WXYZ), /* 33 */
-   TU6_xxx(B8G8R8_UINT, 8_8_8_UINT, WXYZ), /* 34 */
-   TU6_xxx(B8G8R8_SINT, 8_8_8_SINT, WXYZ), /* 35 */
-   TU6_xxx(B8G8R8_SRGB, 8_8_8_UNORM, WXYZ), /* 36 */
-
-   /* 32-bit RGBA */
-   TU6_VTC(R8G8B8A8_UNORM, 8_8_8_8_UNORM, WZYX), /* 37 */
-   TU6_VTC(R8G8B8A8_SNORM, 8_8_8_8_SNORM, WZYX), /* 38 */
-   TU6_Vxx(R8G8B8A8_USCALED, 8_8_8_8_UINT, WZYX), /* 39 */
-   TU6_Vxx(R8G8B8A8_SSCALED, 8_8_8_8_SINT, WZYX), /* 40 */
-   TU6_VTC(R8G8B8A8_UINT, 8_8_8_8_UINT, WZYX), /* 41 */
-   TU6_VTC(R8G8B8A8_SINT, 8_8_8_8_SINT, WZYX), /* 42 */
-   TU6_xTC(R8G8B8A8_SRGB, 8_8_8_8_UNORM, WZYX), /* 43 */
-
-   /* 32-bit BGRA */
-   TU6_VTC(B8G8R8A8_UNORM, 8_8_8_8_UNORM, WXYZ), /* 44 */
-   TU6_VTC(B8G8R8A8_SNORM, 8_8_8_8_SNORM, WXYZ), /* 45 */
-   TU6_Vxx(B8G8R8A8_USCALED, 8_8_8_8_UINT, WXYZ), /* 46 */
-   TU6_Vxx(B8G8R8A8_SSCALED, 8_8_8_8_SINT, WXYZ), /* 47 */
-   TU6_VTC(B8G8R8A8_UINT, 8_8_8_8_UINT, WXYZ), /* 48 */
-   TU6_VTC(B8G8R8A8_SINT, 8_8_8_8_SINT, WXYZ), /* 49 */
-   TU6_xTC(B8G8R8A8_SRGB, 8_8_8_8_UNORM, WXYZ), /* 50 */
-
-   /* 32-bit packed */
-   TU6_VTC(A8B8G8R8_UNORM_PACK32, 8_8_8_8_UNORM, WZYX), /* 51 */
-   TU6_VTC(A8B8G8R8_SNORM_PACK32, 8_8_8_8_SNORM, WZYX), /* 52 */
-   TU6_Vxx(A8B8G8R8_USCALED_PACK32, 8_8_8_8_UINT, WZYX), /* 53 */
-   TU6_Vxx(A8B8G8R8_SSCALED_PACK32, 8_8_8_8_SINT, WZYX), /* 54 */
-   TU6_VTC(A8B8G8R8_UINT_PACK32, 8_8_8_8_UINT, WZYX), /* 55 */
-   TU6_VTC(A8B8G8R8_SINT_PACK32, 8_8_8_8_SINT, WZYX), /* 56 */
-   TU6_xTC(A8B8G8R8_SRGB_PACK32, 8_8_8_8_UNORM, WZYX), /* 57 */
-   TU6_VTC(A2R10G10B10_UNORM_PACK32, 10_10_10_2_UNORM, WXYZ), /* 58 */
-   TU6_Vxx(A2R10G10B10_SNORM_PACK32, 10_10_10_2_SNORM, WXYZ), /* 59 */
-   TU6_Vxx(A2R10G10B10_USCALED_PACK32, 10_10_10_2_UINT, WXYZ), /* 60 */
-   TU6_Vxx(A2R10G10B10_SSCALED_PACK32, 10_10_10_2_SINT, WXYZ), /* 61 */
-   TU6_VTC(A2R10G10B10_UINT_PACK32, 10_10_10_2_UINT, WXYZ), /* 62 */
-   TU6_Vxx(A2R10G10B10_SINT_PACK32, 10_10_10_2_SINT, WXYZ), /* 63 */
-   TU6_VTC(A2B10G10R10_UNORM_PACK32, 10_10_10_2_UNORM, WZYX), /* 64 */
-   TU6_Vxx(A2B10G10R10_SNORM_PACK32, 10_10_10_2_SNORM, WZYX), /* 65 */
-   TU6_Vxx(A2B10G10R10_USCALED_PACK32, 10_10_10_2_UINT, WZYX), /* 66 */
-   TU6_Vxx(A2B10G10R10_SSCALED_PACK32, 10_10_10_2_SINT, WZYX), /* 67 */
-   TU6_VTC(A2B10G10R10_UINT_PACK32, 10_10_10_2_UINT, WZYX), /* 68 */
-   TU6_Vxx(A2B10G10R10_SINT_PACK32, 10_10_10_2_SINT, WZYX), /* 69 */
-
-   /* 16-bit R */
-   TU6_VTC(R16_UNORM, 16_UNORM, WZYX), /* 70 */
-   TU6_VTC(R16_SNORM, 16_SNORM, WZYX), /* 71 */
-   TU6_Vxx(R16_USCALED, 16_UINT, WZYX), /* 72 */
-   TU6_Vxx(R16_SSCALED, 16_SINT, WZYX), /* 73 */
-   TU6_VTC(R16_UINT, 16_UINT, WZYX), /* 74 */
-   TU6_VTC(R16_SINT, 16_SINT, WZYX), /* 75 */
-   TU6_VTC(R16_SFLOAT, 16_FLOAT, WZYX), /* 76 */
-
-   /* 32-bit RG */
-   TU6_VTC(R16G16_UNORM, 16_16_UNORM, WZYX), /* 77 */
-   TU6_VTC(R16G16_SNORM, 16_16_SNORM, WZYX), /* 78 */
-   TU6_Vxx(R16G16_USCALED, 16_16_UINT, WZYX), /* 79 */
-   TU6_Vxx(R16G16_SSCALED, 16_16_SINT, WZYX), /* 80 */
-   TU6_VTC(R16G16_UINT, 16_16_UINT, WZYX), /* 81 */
-   TU6_VTC(R16G16_SINT, 16_16_SINT, WZYX), /* 82 */
-   TU6_VTC(R16G16_SFLOAT, 16_16_FLOAT, WZYX), /* 83 */
-
-   /* 48-bit RGB */
-   TU6_Vxx(R16G16B16_UNORM, 16_16_16_UNORM, WZYX), /* 84 */
-   TU6_Vxx(R16G16B16_SNORM, 16_16_16_SNORM, WZYX), /* 85 */
-   TU6_Vxx(R16G16B16_USCALED, 16_16_16_UINT, WZYX), /* 86 */
-   TU6_Vxx(R16G16B16_SSCALED, 16_16_16_SINT, WZYX), /* 87 */
-   TU6_Vxx(R16G16B16_UINT, 16_16_16_UINT, WZYX), /* 88 */
-   TU6_Vxx(R16G16B16_SINT, 16_16_16_SINT, WZYX), /* 89 */
-   TU6_Vxx(R16G16B16_SFLOAT, 16_16_16_FLOAT, WZYX), /* 90 */
-
-   /* 64-bit RGBA */
-   TU6_VTC(R16G16B16A16_UNORM, 16_16_16_16_UNORM, WZYX), /* 91 */
-   TU6_VTC(R16G16B16A16_SNORM, 16_16_16_16_SNORM, WZYX), /* 92 */
-   TU6_Vxx(R16G16B16A16_USCALED, 16_16_16_16_UINT, WZYX), /* 93 */
-   TU6_Vxx(R16G16B16A16_SSCALED, 16_16_16_16_SINT, WZYX), /* 94 */
-   TU6_VTC(R16G16B16A16_UINT, 16_16_16_16_UINT, WZYX), /* 95 */
-   TU6_VTC(R16G16B16A16_SINT, 16_16_16_16_SINT, WZYX), /* 96 */
-   TU6_VTC(R16G16B16A16_SFLOAT, 16_16_16_16_FLOAT, WZYX), /* 97 */
-
-   /* 32-bit R */
-   TU6_VTC(R32_UINT, 32_UINT, WZYX), /* 98 */
-   TU6_VTC(R32_SINT, 32_SINT, WZYX), /* 99 */
-   TU6_VTC(R32_SFLOAT, 32_FLOAT, WZYX), /* 100 */
-
-   /* 64-bit RG */
-   TU6_VTC(R32G32_UINT, 32_32_UINT, WZYX), /* 101 */
-   TU6_VTC(R32G32_SINT, 32_32_SINT, WZYX), /* 102 */
-   TU6_VTC(R32G32_SFLOAT, 32_32_FLOAT, WZYX), /* 103 */
-
-   /* 96-bit RGB */
-   TU6_Vxx(R32G32B32_UINT, 32_32_32_UINT, WZYX), /* 104 */
-   TU6_Vxx(R32G32B32_SINT, 32_32_32_SINT, WZYX), /* 105 */
-   TU6_Vxx(R32G32B32_SFLOAT, 32_32_32_FLOAT, WZYX), /* 106 */
-
-   /* 128-bit RGBA */
-   TU6_VTC(R32G32B32A32_UINT, 32_32_32_32_UINT, WZYX), /* 107 */
-   TU6_VTC(R32G32B32A32_SINT, 32_32_32_32_SINT, WZYX), /* 108 */
-   TU6_VTC(R32G32B32A32_SFLOAT, 32_32_32_32_FLOAT, WZYX), /* 109 */
-
-   /* 64-bit R */
-   TU6_xxx(R64_UINT, 64_UINT, WZYX), /* 110 */
-   TU6_xxx(R64_SINT, 64_SINT, WZYX), /* 111 */
-   TU6_xxx(R64_SFLOAT, 64_FLOAT, WZYX), /* 112 */
-
-   /* 128-bit RG */
-   TU6_xxx(R64G64_UINT, 64_64_UINT, WZYX), /* 113 */
-   TU6_xxx(R64G64_SINT, 64_64_SINT, WZYX), /* 114 */
-   TU6_xxx(R64G64_SFLOAT, 64_64_FLOAT, WZYX), /* 115 */
-
-   /* 192-bit RGB */
-   TU6_xxx(R64G64B64_UINT, 64_64_64_UINT, WZYX), /* 116 */
-   TU6_xxx(R64G64B64_SINT, 64_64_64_SINT, WZYX), /* 117 */
-   TU6_xxx(R64G64B64_SFLOAT, 64_64_64_FLOAT, WZYX), /* 118 */
-
-   /* 256-bit RGBA */
-   TU6_xxx(R64G64B64A64_UINT, 64_64_64_64_UINT, WZYX), /* 119 */
-   TU6_xxx(R64G64B64A64_SINT, 64_64_64_64_SINT, WZYX), /* 120 */
-   TU6_xxx(R64G64B64A64_SFLOAT, 64_64_64_64_FLOAT, WZYX), /* 121 */
-
-   /* 32-bit packed float */
-   TU6_VTC(B10G11R11_UFLOAT_PACK32, 11_11_10_FLOAT, WZYX), /* 122 */
-   TU6_xTx(E5B9G9R9_UFLOAT_PACK32, 9_9_9_E5_FLOAT, WZYX), /* 123 */
-
-   /* depth/stencil
-    * X8_D24_UNORM/D24_UNORM_S8_UINT should be Z24_UNORM_S8_UINT_AS_R8G8B8A8
-    * but the format doesn't work on A630 when UBWC is disabled, so use
-    * 8_8_8_8_UNORM as the default and override it when UBWC is enabled
-    */
-   TU6_xTC(D16_UNORM, 16_UNORM, WZYX), /* 124 */
-   TU6_xTC(X8_D24_UNORM_PACK32, 8_8_8_8_UNORM, WZYX), /* 125 */
-   TU6_xTC(D32_SFLOAT, 32_FLOAT, WZYX), /* 126 */
-   TU6_xTC(S8_UINT, 8_UINT, WZYX), /* 127 */
-   TU6_xxx(D16_UNORM_S8_UINT, X8Z16_UNORM, WZYX), /* 128 */
-   TU6_xTC(D24_UNORM_S8_UINT, 8_8_8_8_UNORM, WZYX), /* 129 */
-   TU6_xTC(D32_SFLOAT_S8_UINT, NONE, WZYX), /* 130 */
-
-   /* compressed */
-   TU6_xTx(BC1_RGB_UNORM_BLOCK, DXT1, WZYX), /* 131 */
-   TU6_xTx(BC1_RGB_SRGB_BLOCK, DXT1, WZYX), /* 132 */
-   TU6_xTx(BC1_RGBA_UNORM_BLOCK, DXT1, WZYX), /* 133 */
-   TU6_xTx(BC1_RGBA_SRGB_BLOCK, DXT1, WZYX), /* 134 */
-   TU6_xTx(BC2_UNORM_BLOCK, DXT3, WZYX), /* 135 */
-   TU6_xTx(BC2_SRGB_BLOCK, DXT3, WZYX), /* 136 */
-   TU6_xTx(BC3_UNORM_BLOCK, DXT5, WZYX), /* 137 */
-   TU6_xTx(BC3_SRGB_BLOCK, DXT5, WZYX), /* 138 */
-   TU6_xTx(BC4_UNORM_BLOCK, RGTC1_UNORM, WZYX), /* 139 */
-   TU6_xTx(BC4_SNORM_BLOCK, RGTC1_SNORM, WZYX), /* 140 */
-   TU6_xTx(BC5_UNORM_BLOCK, RGTC2_UNORM, WZYX), /* 141 */
-   TU6_xTx(BC5_SNORM_BLOCK, RGTC2_SNORM, WZYX), /* 142 */
-   TU6_xTx(BC6H_UFLOAT_BLOCK, BPTC_UFLOAT, WZYX), /* 143 */
-   TU6_xTx(BC6H_SFLOAT_BLOCK, BPTC_FLOAT, WZYX), /* 144 */
-   TU6_xTx(BC7_UNORM_BLOCK, BPTC, WZYX), /* 145 */
-   TU6_xTx(BC7_SRGB_BLOCK, BPTC, WZYX), /* 146 */
-   TU6_xTx(ETC2_R8G8B8_UNORM_BLOCK, ETC2_RGB8, WZYX), /* 147 */
-   TU6_xTx(ETC2_R8G8B8_SRGB_BLOCK, ETC2_RGB8, WZYX), /* 148 */
-   TU6_xTx(ETC2_R8G8B8A1_UNORM_BLOCK, ETC2_RGB8A1, WZYX), /* 149 */
-   TU6_xTx(ETC2_R8G8B8A1_SRGB_BLOCK, ETC2_RGB8A1, WZYX), /* 150 */
-   TU6_xTx(ETC2_R8G8B8A8_UNORM_BLOCK, ETC2_RGBA8, WZYX), /* 151 */
-   TU6_xTx(ETC2_R8G8B8A8_SRGB_BLOCK, ETC2_RGBA8, WZYX), /* 152 */
-   TU6_xTx(EAC_R11_UNORM_BLOCK, ETC2_R11_UNORM, WZYX), /* 153 */
-   TU6_xTx(EAC_R11_SNORM_BLOCK, ETC2_R11_SNORM, WZYX), /* 154 */
-   TU6_xTx(EAC_R11G11_UNORM_BLOCK, ETC2_RG11_UNORM, WZYX), /* 155 */
-   TU6_xTx(EAC_R11G11_SNORM_BLOCK, ETC2_RG11_SNORM, WZYX), /* 156 */
-   TU6_xTx(ASTC_4x4_UNORM_BLOCK, ASTC_4x4, WZYX), /* 157 */
TU6_xTx(ASTC_4x4_SRGB_BLOCK, ASTC_4x4, WZYX), /* 158 */ - TU6_xTx(ASTC_5x4_UNORM_BLOCK, ASTC_5x4, WZYX), /* 159 */ - TU6_xTx(ASTC_5x4_SRGB_BLOCK, ASTC_5x4, WZYX), /* 160 */ - TU6_xTx(ASTC_5x5_UNORM_BLOCK, ASTC_5x5, WZYX), /* 161 */ - TU6_xTx(ASTC_5x5_SRGB_BLOCK, ASTC_5x5, WZYX), /* 162 */ - TU6_xTx(ASTC_6x5_UNORM_BLOCK, ASTC_6x5, WZYX), /* 163 */ - TU6_xTx(ASTC_6x5_SRGB_BLOCK, ASTC_6x5, WZYX), /* 164 */ - TU6_xTx(ASTC_6x6_UNORM_BLOCK, ASTC_6x6, WZYX), /* 165 */ - TU6_xTx(ASTC_6x6_SRGB_BLOCK, ASTC_6x6, WZYX), /* 166 */ - TU6_xTx(ASTC_8x5_UNORM_BLOCK, ASTC_8x5, WZYX), /* 167 */ - TU6_xTx(ASTC_8x5_SRGB_BLOCK, ASTC_8x5, WZYX), /* 168 */ - TU6_xTx(ASTC_8x6_UNORM_BLOCK, ASTC_8x6, WZYX), /* 169 */ - TU6_xTx(ASTC_8x6_SRGB_BLOCK, ASTC_8x6, WZYX), /* 170 */ - TU6_xTx(ASTC_8x8_UNORM_BLOCK, ASTC_8x8, WZYX), /* 171 */ - TU6_xTx(ASTC_8x8_SRGB_BLOCK, ASTC_8x8, WZYX), /* 172 */ - TU6_xTx(ASTC_10x5_UNORM_BLOCK, ASTC_10x5, WZYX), /* 173 */ - TU6_xTx(ASTC_10x5_SRGB_BLOCK, ASTC_10x5, WZYX), /* 174 */ - TU6_xTx(ASTC_10x6_UNORM_BLOCK, ASTC_10x6, WZYX), /* 175 */ - TU6_xTx(ASTC_10x6_SRGB_BLOCK, ASTC_10x6, WZYX), /* 176 */ - TU6_xTx(ASTC_10x8_UNORM_BLOCK, ASTC_10x8, WZYX), /* 177 */ - TU6_xTx(ASTC_10x8_SRGB_BLOCK, ASTC_10x8, WZYX), /* 178 */ - TU6_xTx(ASTC_10x10_UNORM_BLOCK, ASTC_10x10, WZYX), /* 179 */ - TU6_xTx(ASTC_10x10_SRGB_BLOCK, ASTC_10x10, WZYX), /* 180 */ - TU6_xTx(ASTC_12x10_UNORM_BLOCK, ASTC_12x10, WZYX), /* 181 */ - TU6_xTx(ASTC_12x10_SRGB_BLOCK, ASTC_12x10, WZYX), /* 182 */ - TU6_xTx(ASTC_12x12_UNORM_BLOCK, ASTC_12x12, WZYX), /* 183 */ - TU6_xTx(ASTC_12x12_SRGB_BLOCK, ASTC_12x12, WZYX), /* 184 */ -}; - -#undef TU6_FMT -#define TU6_FMT(vkfmt, hwfmt, swapfmt, valid) \ - case VK_FORMAT_##vkfmt: \ - fmt = (struct tu_native_format) { \ - .fmt = FMT6_##hwfmt, \ - .swap = swapfmt, \ - .supported = valid, \ - }; break; +} static struct tu_native_format -tu6_get_native_format(VkFormat format) +tu6_format_color_unchecked(enum pipe_format format, enum a6xx_tile_mode tile_mode) { - struct tu_native_format fmt = {}; + struct tu_native_format fmt = { + .fmt = fd6_color_format(format, tile_mode), + .swap = fd6_color_swap(format, tile_mode), + }; - if (format < ARRAY_SIZE(tu6_format_table)) { - fmt = tu6_format_table[format]; - } else { - switch (format) { - TU6_xTx(G8B8G8R8_422_UNORM, R8G8R8B8_422_UNORM, WZYX) - TU6_xTx(B8G8R8G8_422_UNORM, G8R8B8R8_422_UNORM, WZYX) - TU6_xTx(G8_B8_R8_3PLANE_420_UNORM, R8_G8_B8_3PLANE_420_UNORM, WZYX) - TU6_xTx(G8_B8R8_2PLANE_420_UNORM, R8_G8B8_2PLANE_420_UNORM, WZYX) - TU6_xTC(A4R4G4B4_UNORM_PACK16_EXT, 4_4_4_4_UNORM, WXYZ) - TU6_xTC(A4B4G4R4_UNORM_PACK16_EXT, 4_4_4_4_UNORM, WZYX) - default: - break; - } + switch (format) { + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + fmt.fmt = FMT6_8_8_8_8_UNORM; + break; + + default: + break; } - if (fmt.supported && vk_format_to_pipe_format(format) == PIPE_FORMAT_NONE) { - tu_finishme("vk_format %d missing matching pipe format.\n", format); - fmt.supported = false; + return fmt; +} + +bool +tu6_format_color_supported(enum pipe_format format) +{ + return tu6_format_color_unchecked(format, TILE6_LINEAR).fmt != FMT6_NONE; +} + +struct tu_native_format +tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode) +{ + struct tu_native_format fmt = tu6_format_color_unchecked(format, tile_mode); + assert(fmt.fmt != FMT6_NONE); + return fmt; +} + +static struct tu_native_format +tu6_format_texture_unchecked(enum pipe_format format, enum a6xx_tile_mode tile_mode) +{ + struct tu_native_format fmt = { + .fmt = 
fd6_texture_format(format, tile_mode), + .swap = fd6_texture_swap(format, tile_mode), + }; + + /* No texturing support for NPOT textures yet. See + * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5536 + */ + if (util_format_is_plain(format) && + !util_is_power_of_two_nonzero(util_format_get_blocksize(format))) { + fmt.fmt = FMT6_NONE; + } + + switch (format) { + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + /* freedreno uses Z24_UNORM_S8_UINT (sampling) or + * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 (blits) for this format, while we use + * FMT6_8_8_8_8_UNORM or FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 + */ + fmt.fmt = FMT6_8_8_8_8_UNORM; + break; + + default: + break; } return fmt; } struct tu_native_format -tu6_format_vtx(VkFormat format) +tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode) { - struct tu_native_format fmt = tu6_get_native_format(format); - assert(fmt.supported & FMT_VERTEX); + struct tu_native_format fmt = tu6_format_texture_unchecked(format, tile_mode); + assert(fmt.fmt != FMT6_NONE); return fmt; } -struct tu_native_format -tu6_format_color(VkFormat format, enum a6xx_tile_mode tile_mode) +bool +tu6_format_texture_supported(enum pipe_format format) { - struct tu_native_format fmt = tu6_get_native_format(format); - assert(fmt.supported & FMT_COLOR); - - if (fmt.fmt == FMT6_10_10_10_2_UNORM) - fmt.fmt = FMT6_10_10_10_2_UNORM_DEST; - - if (tile_mode) - fmt.swap = WZYX; - - return fmt; -} - -struct tu_native_format -tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode) -{ - struct tu_native_format fmt = tu6_get_native_format(format); - assert(fmt.supported & FMT_TEXTURE); - - if (!tile_mode) { - /* different from format table when used as linear src */ - if (format == VK_FORMAT_R5G5B5A1_UNORM_PACK16) - fmt.fmt = FMT6_1_5_5_5_UNORM, fmt.swap = WXYZ; - if (format == VK_FORMAT_B5G5R5A1_UNORM_PACK16) - fmt.fmt = FMT6_1_5_5_5_UNORM, fmt.swap = WZYX; - } else { - fmt.swap = WZYX; - } - - return fmt; + return tu6_format_texture_unchecked(format, TILE6_LINEAR).fmt != FMT6_NONE; } static void tu_physical_device_get_format_properties( struct tu_physical_device *physical_device, - VkFormat format, + VkFormat vk_format, VkFormatProperties *out_properties) { VkFormatFeatureFlags linear = 0, optimal = 0, buffer = 0; - const struct util_format_description *desc = vk_format_description(format); - const struct tu_native_format native_fmt = tu6_get_native_format(format); - if (!desc || !native_fmt.supported) { + enum pipe_format format = tu_vk_format_to_pipe_format(vk_format); + const struct util_format_description *desc = util_format_description(format); + + bool supported_vtx = tu6_format_vtx_supported(vk_format); + bool supported_color = tu6_format_color_supported(format); + bool supported_tex = tu6_format_texture_supported(format); + + if (format == PIPE_FORMAT_NONE || + !(supported_vtx || supported_color || supported_tex)) { goto end; } buffer |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT; - if (native_fmt.supported & FMT_VERTEX) + if (supported_vtx) buffer |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT; - if (native_fmt.supported & FMT_TEXTURE) { + if (supported_tex) { optimal |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT | VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | @@ -400,7 +201,7 @@ tu_physical_device_get_format_properties( if (desc->layout != UTIL_FORMAT_LAYOUT_SUBSAMPLED) optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT; - if 
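The new helpers above apply one pattern twice, once for color and once for texture formats: an _unchecked lookup that may return the FMT6_NONE sentinel, a _supported predicate built on it, and a checked getter that asserts. A condensed sketch of the pattern (the lookup_* names and the stub body are illustrative, not the patch's implementation):

static struct tu_native_format
lookup_unchecked(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   /* stand-in for the fd6_color_format()/fd6_texture_format() lookup */
   struct tu_native_format fmt = { .fmt = FMT6_NONE, .swap = WZYX };
   (void)format; (void)tile_mode;
   return fmt;
}

static bool
lookup_supported(enum pipe_format format)
{
   return lookup_unchecked(format, TILE6_LINEAR).fmt != FMT6_NONE;
}

static struct tu_native_format
lookup_checked(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   struct tu_native_format fmt = lookup_unchecked(format, tile_mode);
   assert(fmt.fmt != FMT6_NONE);  /* callers must pre-check support */
   return fmt;
}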
(!vk_format_is_int(format)) { + if (!vk_format_is_int(vk_format)) { optimal |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; if (physical_device->vk.supported_extensions.EXT_filter_cubic) @@ -408,8 +209,8 @@ tu_physical_device_get_format_properties( } } - if (native_fmt.supported & FMT_COLOR) { - assert(native_fmt.supported & FMT_TEXTURE); + if (supported_color) { + assert(supported_tex); optimal |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; @@ -420,7 +221,8 @@ tu_physical_device_get_format_properties( * after we enable shaderStorageImageReadWithoutFormat and there are * tests for these formats. */ - if (native_fmt.swap == WZYX) { + struct tu_native_format tex = tu6_format_texture(format, TILE6_LINEAR); + if (tex.swap == WZYX && tex.fmt != FMT6_1_5_5_5_UNORM) { optimal |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT; } @@ -428,12 +230,12 @@ tu_physical_device_get_format_properties( /* TODO: The blob also exposes these for R16G16_UINT/R16G16_SINT, but we * don't have any tests for those. */ - if (format == VK_FORMAT_R32_UINT || format == VK_FORMAT_R32_SINT) { + if (vk_format == VK_FORMAT_R32_UINT || vk_format == VK_FORMAT_R32_SINT) { optimal |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT; buffer |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT; } - if (!vk_format_is_int(format)) + if (!util_format_is_pure_integer(format)) optimal |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; } @@ -447,19 +249,21 @@ tu_physical_device_get_format_properties( * DEPTH_STENCIL_ATTACHMENT_BIT for the optimal features. */ linear = optimal; - if (tu6_pipe2depth(format) != (enum a6xx_depth_format)~0) + if (tu6_pipe2depth(vk_format) != (enum a6xx_depth_format)~0) optimal |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; - if (format == VK_FORMAT_G8B8G8R8_422_UNORM || - format == VK_FORMAT_B8G8R8G8_422_UNORM || - format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM || - format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { - /* no tiling for special UBWC formats - * TODO: NV12 can be UBWC but has a special UBWC format for accessing the Y plane aspect - * for 3plane, tiling/UBWC might be supported, but the blob doesn't use tiling - */ + if (!tiling_possible(vk_format) && + /* We don't actually support tiling for this format, but we need to + * fake it as it's required by VK_KHR_sampler_ycbcr_conversion. + */ + vk_format != VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { optimal = 0; + } + if (vk_format == VK_FORMAT_G8B8G8R8_422_UNORM || + vk_format == VK_FORMAT_B8G8R8G8_422_UNORM || + vk_format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM || + vk_format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { /* Disable buffer texturing of subsampled (422) and planar YUV textures. * The subsampling requirement comes from "If format is a block-compressed * format, then bufferFeatures must not support any features for the @@ -473,7 +277,7 @@ tu_physical_device_get_format_properties( /* D32_SFLOAT_S8_UINT is tiled as two images, so no linear format * blob enables some linear features, but its not useful, so don't bother. 
*/ - if (format == VK_FORMAT_D32_SFLOAT_S8_UINT) + if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) linear = 0; end: @@ -508,6 +312,7 @@ tu_GetPhysicalDeviceFormatProperties2( /* note: ubwc_possible() argument values to be ignored except for format */ if (pFormatProperties->formatProperties.optimalTilingFeatures && + tiling_possible(format) && ubwc_possible(format, VK_IMAGE_TYPE_2D, 0, 0, physical_device->info, VK_SAMPLE_COUNT_1_BIT)) { vk_outarray_append(&out, mod_props) { mod_props->drmFormatModifier = DRM_FORMAT_MOD_QCOM_COMPRESSED; @@ -548,7 +353,8 @@ tu_get_image_format_properties( /* falling back to linear/non-UBWC isn't possible with explicit modifier */ /* formats which don't support tiling */ - if (!format_props.optimalTilingFeatures) + if (!format_props.optimalTilingFeatures || + !tiling_possible(info->format)) return VK_ERROR_FORMAT_NOT_SUPPORTED; /* for mutable formats, its very unlikely to be possible to use UBWC */ @@ -707,7 +513,7 @@ tu_get_external_image_format_properties( VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; break; default: - return vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, "VkExternalMemoryTypeFlagBits(0x%x) unsupported for VkImageType(%d)", handleType, pImageFormatInfo->type); } @@ -717,7 +523,7 @@ tu_get_external_image_format_properties( compat_flags = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT; break; default: - return vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED, + return vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED, "VkExternalMemoryTypeFlagBits(0x%x) unsupported", handleType); } diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_image.c b/mesa 3D driver/src/freedreno/vulkan/tu_image.c index 804c5dd5b5..089d07eef9 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_image.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_image.c @@ -26,6 +26,7 @@ */ #include "tu_private.h" +#include "fdl/fd6_format_table.h" #include "util/debug.h" #include "util/u_atomic.h" @@ -50,23 +51,22 @@ tu6_plane_count(VkFormat format) } } -static VkFormat +enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane) { switch (format) { case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: - /* note: with UBWC, and Y plane UBWC is different from R8_UNORM */ - return plane ? VK_FORMAT_R8G8_UNORM : VK_FORMAT_R8_UNORM; + return plane ? PIPE_FORMAT_R8G8_UNORM : PIPE_FORMAT_Y8_UNORM; case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: - return VK_FORMAT_R8_UNORM; + return PIPE_FORMAT_R8_UNORM; case VK_FORMAT_D32_SFLOAT_S8_UINT: - return plane ? VK_FORMAT_S8_UINT : VK_FORMAT_D32_SFLOAT; + return plane ? 
PIPE_FORMAT_S8_UINT : PIPE_FORMAT_Z32_FLOAT; default: - return format; + return tu_vk_format_to_pipe_format(format); } } -static uint32_t +uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask) { switch (aspect_mask) { @@ -81,6 +81,30 @@ tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask) } } +enum pipe_format +tu_format_for_aspect(enum pipe_format format, VkImageAspectFlags aspect_mask) +{ + switch (format) { + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + if (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) + return PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + if (aspect_mask & VK_IMAGE_ASPECT_STENCIL_BIT) { + if (aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) + return PIPE_FORMAT_Z24_UNORM_S8_UINT; + else + return PIPE_FORMAT_X24S8_UINT; + } else { + return PIPE_FORMAT_Z24X8_UNORM; + } + case PIPE_FORMAT_Z24X8_UNORM: + if (aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) + return PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + return PIPE_FORMAT_Z24X8_UNORM; + default: + return format; + } +} + static void compose_swizzle(unsigned char *swiz, const VkComponentMapping *mapping) { @@ -136,11 +160,17 @@ tu6_texswiz(const VkComponentMapping *comps, case VK_FORMAT_D24_UNORM_S8_UINT: if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) { if (!has_z24uint_s8uint) { - /* using FMT6_8_8_8_8_UINT */ + /* using FMT6_8_8_8_8_UINT, so need to pick out the W channel and + * swizzle (0,0,1) in the rest (see "Conversion to RGBA"). + */ swiz[0] = A6XX_TEX_W; swiz[1] = A6XX_TEX_ZERO; + swiz[2] = A6XX_TEX_ZERO; + swiz[3] = A6XX_TEX_ONE; } else { - /* using FMT6_Z24_UINT_S8_UINT */ + /* using FMT6_Z24_UINT_S8_UINT, which is (d, s, 0, 1), so need to + * swizzle away the d. + */ swiz[0] = A6XX_TEX_Y; swiz[1] = A6XX_TEX_ZERO; } @@ -161,7 +191,7 @@ tu6_texswiz(const VkComponentMapping *comps, } void -tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer) { tu_cs_emit(cs, iview->PITCH); tu_cs_emit(cs, iview->layer_size >> 6); @@ -177,7 +207,7 @@ tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uin } void -tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src) +tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src) { tu_cs_emit_qw(cs, iview->base_addr + iview->layer_size * layer); /* SP_PS_2D_SRC_PITCH has shifted pitch field */ @@ -185,7 +215,7 @@ tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t } void -tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer) +tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer) { tu_cs_emit_qw(cs, iview->ubwc_addr + iview->ubwc_layer_size * layer); tu_cs_emit(cs, iview->FLAG_BUFFER_PITCH); @@ -198,7 +228,7 @@ tu_image_view_init(struct tu_image_view *iview, { TU_FROM_HANDLE(tu_image, image, pCreateInfo->image); const VkImageSubresourceRange *range = &pCreateInfo->subresourceRange; - VkFormat format = pCreateInfo->format; + VkFormat vk_format = pCreateInfo->format; VkImageAspectFlagBits aspect_mask = pCreateInfo->subresourceRange.aspectMask; const struct VkSamplerYcbcrConversionInfo *ycbcr_conversion = @@ -208,238 +238,83 @@ tu_image_view_init(struct tu_image_view *iview, iview->image = image; - memset(iview->descriptor, 0, sizeof(iview->descriptor)); + const struct fdl_layout *layouts[3]; - struct fdl_layout *layout = - &image->layout[tu6_plane_index(image->vk_format, 
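The two tu6_texswiz() comments above describe how stencil-only sampling of D24S8 works in each case: with FMT6_Z24_UINT_S8_UINT the texel comes back as (d, s, 0, 1), so stencil must be pulled from Y; without it, FMT6_8_8_8_8_UINT puts the stencil byte in W, and the remaining channels are filled per the spec's "Conversion to RGBA" rule. A sketch of just that selection (stencil_swizzle is a made-up name; the patch only overwrites the first components and leaves the rest to surrounding code, so filling all four here is an editorial simplification):

static void
stencil_swizzle(bool has_z24uint_s8uint, unsigned char swiz[4])
{
   if (has_z24uint_s8uint) {
      /* FMT6_Z24_UINT_S8_UINT samples as (d, s, 0, 1): stencil is in Y */
      swiz[0] = A6XX_TEX_Y;
   } else {
      /* FMT6_8_8_8_8_UINT: the stencil byte lands in W */
      swiz[0] = A6XX_TEX_W;
   }
   /* return (s, 0, 0, 1), matching "Conversion to RGBA" */
   swiz[1] = A6XX_TEX_ZERO;
   swiz[2] = A6XX_TEX_ZERO;
   swiz[3] = A6XX_TEX_ONE;
}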
aspect_mask)]; - - uint32_t width = u_minify(layout->width0, range->baseMipLevel); - uint32_t height = u_minify(layout->height0, range->baseMipLevel); - uint32_t storage_depth = tu_get_layerCount(image, range); - if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D) { - storage_depth = u_minify(image->layout[0].depth0, range->baseMipLevel); - } - - uint32_t depth = storage_depth; - if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_CUBE || - pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY) { - /* Cubes are treated as 2D arrays for storage images, so only divide the - * depth by 6 for the texture descriptor. - */ - depth /= 6; - } - - uint64_t base_addr = image->bo->iova + image->bo_offset + - fdl_surface_offset(layout, range->baseMipLevel, range->baseArrayLayer); - uint64_t ubwc_addr = image->bo->iova + image->bo_offset + - fdl_ubwc_offset(layout, range->baseMipLevel, range->baseArrayLayer); - - uint32_t pitch = fdl_pitch(layout, range->baseMipLevel); - uint32_t ubwc_pitch = fdl_ubwc_pitch(layout, range->baseMipLevel); - uint32_t layer_size = fdl_layer_stride(layout, range->baseMipLevel); + layouts[0] = &image->layout[tu6_plane_index(image->vk_format, aspect_mask)]; + enum pipe_format format; if (aspect_mask != VK_IMAGE_ASPECT_COLOR_BIT) - format = tu6_plane_format(format, tu6_plane_index(format, aspect_mask)); + format = tu6_plane_format(vk_format, tu6_plane_index(vk_format, aspect_mask)); + else + format = tu_vk_format_to_pipe_format(vk_format); - struct tu_native_format fmt = tu6_format_texture(format, layout->tile_mode); - /* note: freedreno layout assumes no TILE_ALL bit for non-UBWC color formats - * this means smaller mipmap levels have a linear tile mode. - * Depth/stencil formats have non-linear tile mode. - */ - fmt.tile_mode = fdl_tile_mode(layout, range->baseMipLevel); - - bool ubwc_enabled = fdl_ubwc_enabled(layout, range->baseMipLevel); - - bool is_d24s8 = (format == VK_FORMAT_D24_UNORM_S8_UINT || - format == VK_FORMAT_X8_D24_UNORM_PACK32); - - if (is_d24s8 && ubwc_enabled) - fmt.fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - - unsigned fmt_tex = fmt.fmt; - if (is_d24s8) { - if (aspect_mask & VK_IMAGE_ASPECT_DEPTH_BIT) - fmt_tex = FMT6_Z24_UNORM_S8_UINT; - if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT) - fmt_tex = has_z24uint_s8uint ? FMT6_Z24_UINT_S8_UINT : FMT6_8_8_8_8_UINT; - /* TODO: also use this format with storage descriptor ? 
*/ - } - - iview->descriptor[0] = - A6XX_TEX_CONST_0_TILE_MODE(fmt.tile_mode) | - COND(vk_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) | - A6XX_TEX_CONST_0_FMT(fmt_tex) | - A6XX_TEX_CONST_0_SAMPLES(tu_msaa_samples(layout->nr_samples)) | - A6XX_TEX_CONST_0_SWAP(fmt.swap) | - tu6_texswiz(&pCreateInfo->components, conversion, format, aspect_mask, has_z24uint_s8uint) | - A6XX_TEX_CONST_0_MIPLVLS(tu_get_levelCount(image, range) - 1); - iview->descriptor[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height); - iview->descriptor[2] = - A6XX_TEX_CONST_2_PITCHALIGN(layout->pitchalign - 6) | - A6XX_TEX_CONST_2_PITCH(pitch) | - A6XX_TEX_CONST_2_TYPE(tu6_tex_type(pCreateInfo->viewType, false)); - iview->descriptor[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(layer_size); - iview->descriptor[4] = base_addr; - iview->descriptor[5] = (base_addr >> 32) | A6XX_TEX_CONST_5_DEPTH(depth); - - if (layout->tile_all) - iview->descriptor[3] |= A6XX_TEX_CONST_3_TILE_ALL; - - if (format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM || - format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) { - /* chroma offset re-uses MIPLVLS bits */ - assert(tu_get_levelCount(image, range) == 1); - if (conversion) { - if (conversion->chroma_offsets[0] == VK_CHROMA_LOCATION_MIDPOINT) - iview->descriptor[0] |= A6XX_TEX_CONST_0_CHROMA_MIDPOINT_X; - if (conversion->chroma_offsets[1] == VK_CHROMA_LOCATION_MIDPOINT) - iview->descriptor[0] |= A6XX_TEX_CONST_0_CHROMA_MIDPOINT_Y; - } - - uint64_t base_addr[3]; - - iview->descriptor[3] |= A6XX_TEX_CONST_3_TILE_ALL; - if (ubwc_enabled) { - iview->descriptor[3] |= A6XX_TEX_CONST_3_FLAG; - /* no separate ubwc base, image must have the expected layout */ - for (uint32_t i = 0; i < 3; i++) { - base_addr[i] = image->bo->iova + image->bo_offset + - fdl_ubwc_offset(&image->layout[i], range->baseMipLevel, range->baseArrayLayer); - } + if (image->vk_format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM && + aspect_mask == VK_IMAGE_ASPECT_PLANE_0_BIT) { + if (vk_format == VK_FORMAT_R8_UNORM) { + /* The 0'th plane of this format has a different UBWC compression. */ + format = PIPE_FORMAT_Y8_UNORM; } else { - for (uint32_t i = 0; i < 3; i++) { - base_addr[i] = image->bo->iova + image->bo_offset + - fdl_surface_offset(&image->layout[i], range->baseMipLevel, range->baseArrayLayer); - } + /* If the user wants to reinterpret this plane, then they should've + * set MUTABLE_FORMAT_BIT which should disable UBWC and tiling. 
+ */ + assert(!layouts[0]->ubwc); } - - iview->descriptor[4] = base_addr[0]; - iview->descriptor[5] |= base_addr[0] >> 32; - iview->descriptor[6] = - A6XX_TEX_CONST_6_PLANE_PITCH(fdl_pitch(&image->layout[1], range->baseMipLevel)); - iview->descriptor[7] = base_addr[1]; - iview->descriptor[8] = base_addr[1] >> 32; - iview->descriptor[9] = base_addr[2]; - iview->descriptor[10] = base_addr[2] >> 32; - - assert(pCreateInfo->viewType != VK_IMAGE_VIEW_TYPE_3D); - return; } - if (ubwc_enabled) { - uint32_t block_width, block_height; - fdl6_get_ubwc_blockwidth(layout, &block_width, &block_height); - - iview->descriptor[3] |= A6XX_TEX_CONST_3_FLAG; - iview->descriptor[7] = ubwc_addr; - iview->descriptor[8] = ubwc_addr >> 32; - iview->descriptor[9] |= A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(layout->ubwc_layer_size >> 2); - iview->descriptor[10] |= - A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(ubwc_pitch) | - A6XX_TEX_CONST_10_FLAG_BUFFER_LOGW(util_logbase2_ceil(DIV_ROUND_UP(width, block_width))) | - A6XX_TEX_CONST_10_FLAG_BUFFER_LOGH(util_logbase2_ceil(DIV_ROUND_UP(height, block_height))); + if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && + (vk_format == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM || + vk_format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM)) { + layouts[1] = &image->layout[1]; + layouts[2] = &image->layout[2]; } - if (pCreateInfo->viewType == VK_IMAGE_VIEW_TYPE_3D) { - iview->descriptor[3] |= - A6XX_TEX_CONST_3_MIN_LAYERSZ(layout->slices[image->level_count - 1].size0); + struct fdl_view_args args = {}; + args.iova = image->bo->iova + image->bo_offset; + args.base_array_layer = range->baseArrayLayer; + args.base_miplevel = range->baseMipLevel; + args.layer_count = tu_get_layerCount(image, range); + args.level_count = tu_get_levelCount(image, range); + args.format = tu_format_for_aspect(format, aspect_mask); + vk_component_mapping_to_pipe_swizzle(pCreateInfo->components, args.swiz); + if (conversion) { + unsigned char conversion_swiz[4], create_swiz[4]; + memcpy(create_swiz, args.swiz, sizeof(create_swiz)); + vk_component_mapping_to_pipe_swizzle(conversion->components, + conversion_swiz); + util_format_compose_swizzles(create_swiz, conversion_swiz, args.swiz); } - iview->SP_PS_2D_SRC_INFO = A6XX_SP_PS_2D_SRC_INFO( - .color_format = fmt.fmt, - .tile_mode = fmt.tile_mode, - .color_swap = fmt.swap, - .flags = ubwc_enabled, - .srgb = vk_format_is_srgb(format), - .samples = tu_msaa_samples(layout->nr_samples), - .samples_average = layout->nr_samples > 1 && - !vk_format_is_int(format) && - !vk_format_is_depth_or_stencil(format), - .unk20 = 1, - .unk22 = 1).value; - iview->SP_PS_2D_SRC_SIZE = - A6XX_SP_PS_2D_SRC_SIZE(.width = width, .height = height).value; - - /* note: these have same encoding for MRT and 2D (except 2D PITCH src) */ - iview->PITCH = A6XX_RB_DEPTH_BUFFER_PITCH(pitch).value; - iview->FLAG_BUFFER_PITCH = A6XX_RB_DEPTH_FLAG_BUFFER_PITCH( - .pitch = ubwc_pitch, .array_pitch = layout->ubwc_layer_size >> 2).value; - - iview->base_addr = base_addr; - iview->ubwc_addr = ubwc_addr; - iview->layer_size = layer_size; - iview->ubwc_layer_size = layout->ubwc_layer_size; - - /* Don't set fields that are only used for attachments/blit dest if COLOR - * is unsupported. 
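util_format_compose_swizzles() above applies the YCbCr conversion's component mapping on top of the view's own mapping. Its effect can be sketched as below, assuming mesa's PIPE_SWIZZLE_X..W are 0..3 and ZERO/ONE/NONE pass through unchanged (compose_swizzles is an illustrative stand-in, not the real utility):

static void
compose_swizzles(const unsigned char view[4], const unsigned char conv[4],
                 unsigned char out[4])
{
   for (int i = 0; i < 4; i++) {
      /* conv[i] in X..W selects a channel of the view swizzle;
       * ZERO/ONE/NONE are kept as-is.
       */
      out[i] = conv[i] <= PIPE_SWIZZLE_W ? view[conv[i]] : conv[i];
   }
}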
- */ - if (!(fmt.supported & FMT_COLOR)) - return; - - struct tu_native_format cfmt = tu6_format_color(format, layout->tile_mode); - cfmt.tile_mode = fmt.tile_mode; - - if (is_d24s8 && ubwc_enabled) - cfmt.fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - - memset(iview->storage_descriptor, 0, sizeof(iview->storage_descriptor)); - - iview->storage_descriptor[0] = - A6XX_IBO_0_FMT(fmt.fmt) | - A6XX_IBO_0_TILE_MODE(fmt.tile_mode); - iview->storage_descriptor[1] = - A6XX_IBO_1_WIDTH(width) | - A6XX_IBO_1_HEIGHT(height); - iview->storage_descriptor[2] = - A6XX_IBO_2_PITCH(pitch) | - A6XX_IBO_2_TYPE(tu6_tex_type(pCreateInfo->viewType, true)); - iview->storage_descriptor[3] = A6XX_IBO_3_ARRAY_PITCH(layer_size); - - iview->storage_descriptor[4] = base_addr; - iview->storage_descriptor[5] = (base_addr >> 32) | A6XX_IBO_5_DEPTH(storage_depth); - - if (ubwc_enabled) { - iview->storage_descriptor[3] |= A6XX_IBO_3_FLAG | A6XX_IBO_3_UNK27; - iview->storage_descriptor[7] |= ubwc_addr; - iview->storage_descriptor[8] |= ubwc_addr >> 32; - iview->storage_descriptor[9] = A6XX_IBO_9_FLAG_BUFFER_ARRAY_PITCH(layout->ubwc_layer_size >> 2); - iview->storage_descriptor[10] = - A6XX_IBO_10_FLAG_BUFFER_PITCH(ubwc_pitch); + switch (pCreateInfo->viewType) { + case VK_IMAGE_VIEW_TYPE_1D: + case VK_IMAGE_VIEW_TYPE_1D_ARRAY: + args.type = FDL_VIEW_TYPE_1D; + break; + case VK_IMAGE_VIEW_TYPE_2D: + case VK_IMAGE_VIEW_TYPE_2D_ARRAY: + args.type = FDL_VIEW_TYPE_2D; + break; + case VK_IMAGE_VIEW_TYPE_CUBE: + case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: + args.type = FDL_VIEW_TYPE_CUBE; + break; + case VK_IMAGE_VIEW_TYPE_3D: + args.type = FDL_VIEW_TYPE_3D; + break; + default: + unreachable("unknown view type"); } - iview->extent.width = width; - iview->extent.height = height; - iview->need_y2_align = - (fmt.tile_mode == TILE6_LINEAR && range->baseMipLevel != image->level_count - 1); + STATIC_ASSERT((unsigned)VK_CHROMA_LOCATION_COSITED_EVEN == (unsigned)FDL_CHROMA_LOCATION_COSITED_EVEN); + STATIC_ASSERT((unsigned)VK_CHROMA_LOCATION_MIDPOINT == (unsigned)FDL_CHROMA_LOCATION_MIDPOINT); + if (conversion) { + args.chroma_offsets[0] = (enum fdl_chroma_location) conversion->chroma_offsets[0]; + args.chroma_offsets[1] = (enum fdl_chroma_location) conversion->chroma_offsets[1]; + } - iview->ubwc_enabled = ubwc_enabled; - - iview->RB_MRT_BUF_INFO = A6XX_RB_MRT_BUF_INFO(0, - .color_tile_mode = cfmt.tile_mode, - .color_format = cfmt.fmt, - .color_swap = cfmt.swap).value; - - iview->SP_FS_MRT_REG = A6XX_SP_FS_MRT_REG(0, - .color_format = cfmt.fmt, - .color_sint = vk_format_is_sint(format), - .color_uint = vk_format_is_uint(format)).value; - - iview->RB_2D_DST_INFO = A6XX_RB_2D_DST_INFO( - .color_format = cfmt.fmt, - .tile_mode = cfmt.tile_mode, - .color_swap = cfmt.swap, - .flags = ubwc_enabled, - .srgb = vk_format_is_srgb(format)).value; - - iview->RB_BLIT_DST_INFO = A6XX_RB_BLIT_DST_INFO( - .tile_mode = cfmt.tile_mode, - .samples = tu_msaa_samples(layout->nr_samples), - .color_format = cfmt.fmt, - .color_swap = cfmt.swap, - .flags = ubwc_enabled).value; + fdl6_view_init(&iview->view, layouts, &args, has_z24uint_s8uint); if (image->vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) { - layout = &image->layout[1]; + struct fdl_layout *layout = &image->layout[1]; iview->stencil_base_addr = image->bo->iova + image->bo_offset + fdl_surface_offset(layout, range->baseMipLevel, range->baseArrayLayer); iview->stencil_layer_size = fdl_layer_stride(layout, range->baseMipLevel); @@ -447,6 +322,17 @@ tu_image_view_init(struct tu_image_view *iview, } } +bool 
+tiling_possible(VkFormat format) +{ + if (format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM || + format == VK_FORMAT_G8B8G8R8_422_UNORM || + format == VK_FORMAT_B8G8R8G8_422_UNORM) + return false; + + return true; +} + bool ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage, const struct fd_dev_info *info, @@ -561,7 +447,7 @@ tu_CreateImage(VkDevice _device, image = vk_object_zalloc(&device->vk, alloc, sizeof(*image), VK_OBJECT_TYPE_IMAGE); if (!image) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); const VkExternalMemoryImageCreateInfo *external_info = vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); @@ -581,6 +467,12 @@ tu_CreateImage(VkDevice _device, ubwc_enabled = false; } + /* Force linear tiling for formats with "fake" optimalTilingFeatures */ + if (!tiling_possible(image->vk_format)) { + tile_mode = TILE6_LINEAR; + ubwc_enabled = false; + } + /* Mutable images can be reinterpreted as any other compatible format. * This is a problem with UBWC (compression for different formats is different), * but also tiling ("swap" affects how tiled formats are stored in memory) @@ -601,7 +493,7 @@ tu_CreateImage(VkDevice _device, if (fmt_list) { may_be_swapped = false; for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) { - if (tu6_format_texture(fmt_list->pViewFormats[i], TILE6_LINEAR).swap) { + if (tu6_format_texture(tu_vk_format_to_pipe_format(fmt_list->pViewFormats[i]), TILE6_LINEAR).swap) { may_be_swapped = true; break; } @@ -625,7 +517,7 @@ tu_CreateImage(VkDevice _device, for (uint32_t i = 0; i < tu6_plane_count(image->vk_format); i++) { struct fdl_layout *layout = &image->layout[i]; - VkFormat format = tu6_plane_format(image->vk_format, i); + enum pipe_format format = tu6_plane_format(image->vk_format, i); uint32_t width0 = pCreateInfo->extent.width; uint32_t height0 = pCreateInfo->extent.height; @@ -663,7 +555,7 @@ tu_CreateImage(VkDevice _device, layout->tile_mode = tile_mode; layout->ubwc = ubwc_enabled; - if (!fdl6_layout(layout, vk_format_to_pipe_format(format), + if (!fdl6_layout(layout, format, pCreateInfo->samples, width0, height0, pCreateInfo->extent.depth, @@ -730,7 +622,7 @@ tu_CreateImage(VkDevice _device, invalid_layout: vk_object_free(&device->vk, alloc, image); - return vk_error(device->instance, VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT); + return vk_error(device, VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT); } VKAPI_ATTR void VKAPI_CALL @@ -812,7 +704,7 @@ tu_CreateImageView(VkDevice _device, view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view), VK_OBJECT_TYPE_IMAGE_VIEW); if (view == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); tu_image_view_init(view, pCreateInfo, device->physical_device->info->a6xx.has_z24uint_s8uint); @@ -845,8 +737,8 @@ tu_buffer_view_init(struct tu_buffer_view *view, view->buffer = buffer; enum VkFormat vfmt = pCreateInfo->format; - enum pipe_format pfmt = vk_format_to_pipe_format(vfmt); - const struct tu_native_format fmt = tu6_format_texture(vfmt, TILE6_LINEAR); + enum pipe_format pfmt = tu_vk_format_to_pipe_format(vfmt); + const struct tu_native_format fmt = tu6_format_texture(pfmt, TILE6_LINEAR); uint32_t range; if (pCreateInfo->range == VK_WHOLE_SIZE) @@ -895,7 +787,7 @@ tu_CreateBufferView(VkDevice _device, view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view), 
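The mutable-image check above can be read as a predicate: without a format list the driver must assume any compatible view format (and thus any component swap) is possible; with a VkImageFormatListCreateInfo, tiling stays safe as long as every listed view format uses the identity WZYX swap. A sketch, assuming WZYX is the zero/identity swap value as in the a6xx headers and using the core 1.2 struct name (format_list_may_be_swapped is a made-up helper):

static bool
format_list_may_be_swapped(const VkImageFormatListCreateInfo *fmt_list)
{
   if (!fmt_list)
      return true;   /* unknown view formats: assume the worst */

   for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
      enum pipe_format pfmt =
         tu_vk_format_to_pipe_format(fmt_list->pViewFormats[i]);
      if (tu6_format_texture(pfmt, TILE6_LINEAR).swap != WZYX)
         return true;
   }
   return false;
}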
VK_OBJECT_TYPE_BUFFER_VIEW); if (!view) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); tu_buffer_view_init(view, device, pCreateInfo); diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_kgsl.c b/mesa 3D driver/src/freedreno/vulkan/tu_kgsl.c index 8249b55540..2017a0a130 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_kgsl.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_kgsl.c @@ -96,7 +96,7 @@ tu_bo_init_new(struct tu_device *dev, struct tu_bo *bo, uint64_t size, ret = safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUMEM_ALLOC_ID, &req); if (ret) { - return vk_errorf(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY, + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, "GPUMEM_ALLOC_ID failed (%s)", strerror(errno)); } @@ -129,7 +129,7 @@ tu_bo_init_dmabuf(struct tu_device *dev, ret = safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUOBJ_IMPORT, &req); if (ret) - return vk_errorf(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY, + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, "Failed to import dma-buf (%s)\n", strerror(errno)); struct kgsl_gpuobj_info info_req = { @@ -139,7 +139,7 @@ tu_bo_init_dmabuf(struct tu_device *dev, ret = safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUOBJ_INFO, &info_req); if (ret) - return vk_errorf(dev->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY, + return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, "Failed to get dma-buf info (%s)\n", strerror(errno)); *bo = (struct tu_bo) { @@ -169,7 +169,7 @@ tu_bo_map(struct tu_device *dev, struct tu_bo *bo) void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, dev->physical_device->local_fd, offset); if (map == MAP_FAILED) - return vk_error(dev->instance, VK_ERROR_MEMORY_MAP_FAILED); + return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED); bo->map = map; @@ -366,7 +366,7 @@ tu_QueueSubmit(VkQueue _queue, sizeof(cmds[0]) * max_entry_count, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (cmds == NULL) - return vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY); for (uint32_t i = 0; i < submitCount; ++i) { const VkSubmitInfo *submit = pSubmits + i; @@ -482,7 +482,7 @@ sync_create(VkDevice _device, vk_object_alloc(&device->vk, pAllocator, sizeof(*sync), fence ? 
VK_OBJECT_TYPE_FENCE : VK_OBJECT_TYPE_SEMAPHORE); if (!sync) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (signaled) tu_finishme("CREATE FENCE SIGNALED"); @@ -656,6 +656,20 @@ tu_device_submit_deferred_locked(struct tu_device *dev) return VK_SUCCESS; } +VkResult +tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj) +{ + tu_finishme("tu_device_wait_u_trace"); + return VK_SUCCESS; +} + +int +tu_drm_get_timestamp(struct tu_physical_device *device, uint64_t *ts) +{ + tu_finishme("tu_drm_get_timestamp"); + return 0; +} + #ifdef ANDROID VKAPI_ATTR VkResult VKAPI_CALL tu_QueueSignalReleaseImageANDROID(VkQueue _queue, diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_pass.c b/mesa 3D driver/src/freedreno/vulkan/tu_pass.c index feef006c3c..c9a6325ea4 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_pass.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_pass.c @@ -99,15 +99,6 @@ tu_render_pass_add_subpass_dep(struct tu_render_pass *pass, if (dep_invalid_for_gmem(dep)) pass->gmem_pixels = 0; - struct tu_subpass_barrier *src_barrier; - if (src == VK_SUBPASS_EXTERNAL) { - src_barrier = &pass->subpasses[0].start_barrier; - } else if (src == pass->subpass_count - 1) { - src_barrier = &pass->end_barrier; - } else { - src_barrier = &pass->subpasses[src + 1].start_barrier; - } - struct tu_subpass_barrier *dst_barrier; if (dst == VK_SUBPASS_EXTERNAL) { dst_barrier = &pass->end_barrier; @@ -115,9 +106,9 @@ tu_render_pass_add_subpass_dep(struct tu_render_pass *pass, dst_barrier = &pass->subpasses[dst].start_barrier; } - if (dep->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT) - src_barrier->src_stage_mask |= dep->srcStageMask; - src_barrier->src_access_mask |= dep->srcAccessMask; + dst_barrier->src_stage_mask |= dep->srcStageMask; + dst_barrier->dst_stage_mask |= dep->dstStageMask; + dst_barrier->src_access_mask |= dep->srcAccessMask; dst_barrier->dst_access_mask |= dep->dstAccessMask; } @@ -651,7 +642,7 @@ tu_CreateRenderPass2(VkDevice _device, pass = vk_object_zalloc(&device->vk, pAllocator, size, VK_OBJECT_TYPE_RENDER_PASS); if (pass == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); pass->attachment_count = pCreateInfo->attachmentCount; pass->subpass_count = pCreateInfo->subpassCount; @@ -671,9 +662,19 @@ tu_CreateRenderPass2(VkDevice _device, att->cpp = vk_format_get_blocksize(att->format) * att->samples; att->gmem_offset = -1; + VkAttachmentLoadOp loadOp = pCreateInfo->pAttachments[i].loadOp; + VkAttachmentLoadOp stencilLoadOp = pCreateInfo->pAttachments[i].stencilLoadOp; + + if (device->instance->debug_flags & TU_DEBUG_DONT_CARE_AS_LOAD) { + if (loadOp == VK_ATTACHMENT_LOAD_OP_DONT_CARE) + loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + if (stencilLoadOp == VK_ATTACHMENT_LOAD_OP_DONT_CARE) + stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + } + attachment_set_ops(att, - pCreateInfo->pAttachments[i].loadOp, - pCreateInfo->pAttachments[i].stencilLoadOp, + loadOp, + stencilLoadOp, pCreateInfo->pAttachments[i].storeOp, pCreateInfo->pAttachments[i].stencilStoreOp); } @@ -697,7 +698,7 @@ tu_CreateRenderPass2(VkDevice _device, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pass->subpass_attachments == NULL) { vk_object_free(&device->vk, pAllocator, pass); - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); } } else pass->subpass_attachments = NULL; @@ -775,6 +776,8 @@ 
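The TU_DEBUG_DONT_CARE_AS_LOAD hunk above rewrites DONT_CARE load ops into LOAD before attachment_set_ops() sees them, presumably as a debug aid for applications that rely on undefined attachment contents. The override reduces to the following (apply_dont_care_as_load and the plain flags parameter are illustrative):

static VkAttachmentLoadOp
apply_dont_care_as_load(VkAttachmentLoadOp op, uint64_t debug_flags)
{
   /* preserve attachment contents that the app declared DONT_CARE */
   if ((debug_flags & TU_DEBUG_DONT_CARE_AS_LOAD) &&
       op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
      return VK_ATTACHMENT_LOAD_OP_LOAD;
   return op;
}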
tu_CreateRenderPass2(VkDevice _device,
       if (a != VK_ATTACHMENT_UNUSED) {
          pass->attachments[a].gmem_offset = 0;
          update_samples(subpass, pCreateInfo->pAttachments[a].samples);
+
+         pass->attachments[a].clear_views |= subpass->multiview_mask;
       }
    }

diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_perfetto.cc b/mesa 3D driver/src/freedreno/vulkan/tu_perfetto.cc
new file mode 100644
index 0000000000..656b0b978d
--- /dev/null
+++ b/mesa 3D driver/src/freedreno/vulkan/tu_perfetto.cc
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2021 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <perfetto.h>
+
+#include "tu_perfetto.h"
+
+#include "util/u_perfetto.h"
+#include "util/hash_table.h"
+
+#include "tu_tracepoints.h"
+#include "tu_tracepoints_perfetto.h"
+
+static uint32_t gpu_clock_id;
+static uint64_t next_clock_sync_ns; /* cpu time of next clk sync */
+
+/**
+ * The timestamp at the point where we first emitted the clock_sync;
+ * this will be a *later* timestamp than the first GPU traces (since
+ * we capture the first clock_sync from the CPU *after* the first GPU
+ * tracepoints happen). To avoid confusing perfetto we need to drop
+ * the GPU traces with timestamps before this.
+ */
+static uint64_t sync_gpu_ts;
+
+struct TuRenderpassIncrementalState {
+   bool was_cleared = true;
+};
+
+struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
+   using IncrementalStateType = TuRenderpassIncrementalState;
+};
+
+class TuRenderpassDataSource : public perfetto::DataSource<TuRenderpassDataSource, TuRenderpassTraits> {
+public:
+   void OnSetup(const SetupArgs &) override
+   {
+      // Use this callback to apply any custom configuration to your data source
+      // based on the TraceConfig in SetupArgs.
+   }
+
+   void OnStart(const StartArgs &) override
+   {
+      // This notification can be used to initialize the GPU driver, enable
+      // counters, etc. StartArgs will contain the DataSourceDescriptor,
+      // which can be extended.
+      u_trace_perfetto_start();
+      PERFETTO_LOG("Tracing started");
+
+      /* Note: clock_ids below 128 are reserved; for custom clock sources,
+       * using the hash of a namespaced string is the recommended approach.
+       * See: https://perfetto.dev/docs/concepts/clock-sync
+       */
+      gpu_clock_id =
+         _mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;
+   }
+
+   void OnStop(const StopArgs &) override
+   {
+      PERFETTO_LOG("Tracing stopped");
+
+      // Undo any initialization done in OnStart.
+ u_trace_perfetto_stop(); + // TODO we should perhaps block until queued traces are flushed? + + Trace([](TuRenderpassDataSource::TraceContext ctx) { + auto packet = ctx.NewTracePacket(); + packet->Finalize(); + ctx.Flush(); + }); + } +}; + +PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource); +PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource); + +static void +send_descriptors(TuRenderpassDataSource::TraceContext &ctx, uint64_t ts_ns) +{ + PERFETTO_LOG("Sending renderstage descriptors"); + + auto packet = ctx.NewTracePacket(); + + packet->set_timestamp(0); + + auto event = packet->set_gpu_render_stage_event(); + event->set_gpu_id(0); + + auto spec = event->set_specifications(); + + for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) { + auto desc = spec->add_hw_queue(); + + desc->set_name(queues[i].name); + desc->set_description(queues[i].desc); + } + + for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) { + auto desc = spec->add_stage(); + + desc->set_name(stages[i].name); + if (stages[i].desc) + desc->set_description(stages[i].desc); + } +} + +static void +stage_start(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage) +{ + struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev); + + p->start_ts[stage] = ts_ns; +} + +typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*); + +static void +stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage, + uint32_t submission_id, const void* payload = nullptr, + trace_payload_as_extra_func payload_as_extra = nullptr) +{ + struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev); + + /* If we haven't managed to calibrate the alignment between GPU and CPU + * timestamps yet, then skip this trace, otherwise perfetto won't know + * what to do with it. + */ + if (!sync_gpu_ts) + return; + + TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) { + if (auto state = tctx.GetIncrementalState(); state->was_cleared) { + send_descriptors(tctx, p->start_ts[stage]); + state->was_cleared = false; + } + + auto packet = tctx.NewTracePacket(); + + packet->set_timestamp(p->start_ts[stage]); + packet->set_timestamp_clock_id(gpu_clock_id); + + auto event = packet->set_gpu_render_stage_event(); + event->set_event_id(0); // ??? 
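stage_end() above uses perfetto's incremental state to send the render-stage descriptors exactly once after each time the tracing service clears the data source's state, so every new session starts with a descriptor packet before any events. The control flow, reduced to plain C (names and the stub are illustrative; the real code does this inside the Trace lambda):

struct incremental_state {
   bool was_cleared;   /* reset to true when perfetto clears state */
};

static void
emit_descriptors(uint64_t ts_ns)
{
   /* stand-in: the real code emits the GpuRenderStageEvent
    * queue/stage specifications here.
    */
   (void)ts_ns;
}

static void
emit_stage_event(struct incremental_state *state, uint64_t start_ts)
{
   if (state->was_cleared) {
      emit_descriptors(start_ts);
      state->was_cleared = false;
   }
   /* ... then emit the GpuRenderStageEvent packet itself ... */
}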
+ event->set_hw_queue_id(DEFAULT_HW_QUEUE_ID); + event->set_duration(ts_ns - p->start_ts[stage]); + event->set_stage_id(stage); + event->set_context((uintptr_t)dev); + event->set_submission_id(submission_id); + + if (payload && payload_as_extra) { + payload_as_extra(event, payload); + } + }); +} + +#ifdef __cplusplus +extern "C" { +#endif + +void +tu_perfetto_init(void) +{ + util_perfetto_init(); + + perfetto::DataSourceDescriptor dsd; + dsd.set_name("gpu.renderstages.msm"); + TuRenderpassDataSource::Register(dsd); +} + +static void +sync_timestamp(struct tu_device *dev) +{ + uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count(); + uint64_t gpu_ts = 0; + + if (cpu_ts < next_clock_sync_ns) + return; + + if (tu_device_get_timestamp(dev, &gpu_ts)) { + PERFETTO_ELOG("Could not sync CPU and GPU clocks"); + return; + } + + /* convert GPU ts into ns: */ + gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts); + + TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) { + auto packet = tctx.NewTracePacket(); + + packet->set_timestamp(cpu_ts); + + auto event = packet->set_clock_snapshot(); + + { + auto clock = event->add_clocks(); + + clock->set_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME); + clock->set_timestamp(cpu_ts); + } + + { + auto clock = event->add_clocks(); + + clock->set_clock_id(gpu_clock_id); + clock->set_timestamp(gpu_ts); + } + + sync_gpu_ts = gpu_ts; + next_clock_sync_ns = cpu_ts + 30000000; + }); +} + +static void +emit_submit_id(uint32_t submission_id) +{ + TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) { + auto packet = tctx.NewTracePacket(); + + packet->set_timestamp(perfetto::base::GetBootTimeNs().count()); + + auto event = packet->set_vulkan_api_event(); + auto submit = event->set_vk_queue_submit(); + + submit->set_submission_id(submission_id); + }); +} + +void +tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id) +{ + sync_timestamp(dev); + emit_submit_id(submission_id); +} + +/* + * Trace callbacks, called from u_trace once the timestamps from GPU have been + * collected. 
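sync_timestamp() above rate-limits clock calibration: a ClockSnapshot pairing BOOTTIME with the custom GPU clock is emitted at most once per 30 ms (the 30000000 ns constant), and sync_gpu_ts records the last synced GPU time so earlier GPU events can be dropped. The rate limit in isolation (should_emit_clock_snapshot is a made-up name):

static uint64_t next_sync_ns;   /* CPU time of the next allowed sync */

static bool
should_emit_clock_snapshot(uint64_t cpu_ts_ns)
{
   if (cpu_ts_ns < next_sync_ns)
      return false;

   next_sync_ns = cpu_ts_ns + 30000000;   /* 30 ms between snapshots */
   return true;
}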
+ */ + +#define CREATE_EVENT_CALLBACK(event_name, stage) \ +void \ +tu_start_##event_name(struct tu_device *dev, uint64_t ts_ns, \ + const void *flush_data, \ + const struct trace_start_##event_name *payload) \ +{ \ + stage_start(dev, ts_ns, stage); \ +} \ + \ +void \ +tu_end_##event_name(struct tu_device *dev, uint64_t ts_ns, \ + const void *flush_data, \ + const struct trace_end_##event_name *payload) \ +{ \ + auto trace_flush_data = (const struct tu_u_trace_flush_data *) flush_data; \ + uint32_t submission_id = \ + tu_u_trace_flush_data_get_submit_id(trace_flush_data); \ + stage_end(dev, ts_ns, stage, submission_id, payload, \ + (trace_payload_as_extra_func) &trace_payload_as_extra_end_##event_name);\ +} + +CREATE_EVENT_CALLBACK(render_pass, SURFACE_STAGE_ID) +CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID) +CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID) +CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID) +CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID) +CREATE_EVENT_CALLBACK(compute, COMPUTE_STAGE_ID) +CREATE_EVENT_CALLBACK(gmem_clear, CLEAR_GMEM_STAGE_ID) +CREATE_EVENT_CALLBACK(sysmem_clear, CLEAR_SYSMEM_STAGE_ID) +CREATE_EVENT_CALLBACK(sysmem_clear_all, CLEAR_SYSMEM_STAGE_ID) +CREATE_EVENT_CALLBACK(gmem_load, GMEM_LOAD_STAGE_ID) +CREATE_EVENT_CALLBACK(gmem_store, GMEM_STORE_STAGE_ID) +CREATE_EVENT_CALLBACK(sysmem_resolve, SYSMEM_RESOLVE_STAGE_ID) + +#ifdef __cplusplus +} +#endif diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_perfetto.h b/mesa 3D driver/src/freedreno/vulkan/tu_perfetto.h new file mode 100644 index 0000000000..b6c5b1dcca --- /dev/null +++ b/mesa 3D driver/src/freedreno/vulkan/tu_perfetto.h @@ -0,0 +1,116 @@ +/* + * Copyright © 2021 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
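Each CREATE_EVENT_CALLBACK(name, stage) invocation above expands to a pair of u_trace callbacks. For example, CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID) produces roughly the following (abridged expansion for illustration; local variable names differ slightly from the macro body):

void
tu_start_blit(struct tu_device *dev, uint64_t ts_ns,
              const void *flush_data,
              const struct trace_start_blit *payload)
{
   stage_start(dev, ts_ns, BLIT_STAGE_ID);
}

void
tu_end_blit(struct tu_device *dev, uint64_t ts_ns,
            const void *flush_data,
            const struct trace_end_blit *payload)
{
   /* recover the submission id recorded at flush time */
   const struct tu_u_trace_flush_data *data = flush_data;
   stage_end(dev, ts_ns, BLIT_STAGE_ID,
             tu_u_trace_flush_data_get_submit_id(data), payload,
             (trace_payload_as_extra_func)
                &trace_payload_as_extra_end_blit);
}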
+ */ + +#ifndef TU_PERFETTO_H_ +#define TU_PERFETTO_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef HAVE_PERFETTO + +/** + * Render-stage id's + */ +enum tu_stage_id { + SURFACE_STAGE_ID, /* Surface is a sort of meta-stage for render-target info */ + BINNING_STAGE_ID, + GMEM_STAGE_ID, + BYPASS_STAGE_ID, + BLIT_STAGE_ID, + COMPUTE_STAGE_ID, + CLEAR_SYSMEM_STAGE_ID, + CLEAR_GMEM_STAGE_ID, + GMEM_LOAD_STAGE_ID, + GMEM_STORE_STAGE_ID, + SYSMEM_RESOLVE_STAGE_ID, + // TODO add the rest + + NUM_STAGES +}; + +static const struct { + const char *name; + const char *desc; +} stages[] = { + [SURFACE_STAGE_ID] = {"Surface"}, + [BINNING_STAGE_ID] = {"Binning", "Perform Visibility pass and determine target bins"}, + [GMEM_STAGE_ID] = {"Render", "Rendering to GMEM"}, + [BYPASS_STAGE_ID] = {"Render", "Rendering to system memory"}, + [BLIT_STAGE_ID] = {"Blit", "Performing a Blit operation"}, + [COMPUTE_STAGE_ID] = {"Compute", "Compute job"}, + [CLEAR_SYSMEM_STAGE_ID] = {"Clear Sysmem", ""}, + [CLEAR_GMEM_STAGE_ID] = {"Clear GMEM", "Per-tile (GMEM) clear"}, + [GMEM_LOAD_STAGE_ID] = {"GMEM Load", "Per tile system memory to GMEM load"}, + [GMEM_STORE_STAGE_ID] = {"GMEM Store", "Per tile GMEM to system memory store"}, + [SYSMEM_RESOLVE_STAGE_ID] = {"SysMem Resolve", "System memory MSAA resolve"}, + // TODO add the rest +}; + +/** + * Queue-id's + */ +enum { + DEFAULT_HW_QUEUE_ID, +}; + +static const struct { + const char *name; + const char *desc; +} queues[] = { + [DEFAULT_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"}, +}; + +struct tu_perfetto_state { + uint64_t start_ts[NUM_STAGES]; +}; + +void tu_perfetto_init(void); + +struct tu_device; +void tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id); + +/* Helpers */ + +struct tu_perfetto_state * +tu_device_get_perfetto_state(struct tu_device *dev); + +int +tu_device_get_timestamp(struct tu_device *dev, + uint64_t *ts); + +uint64_t +tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); + +struct tu_u_trace_flush_data; +uint32_t +tu_u_trace_flush_data_get_submit_id(const struct tu_u_trace_flush_data *data); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* TU_PERFETTO_H_ */ diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_perfetto_util.c b/mesa 3D driver/src/freedreno/vulkan/tu_perfetto_util.c new file mode 100644 index 0000000000..7d13678c64 --- /dev/null +++ b/mesa 3D driver/src/freedreno/vulkan/tu_perfetto_util.c @@ -0,0 +1,48 @@ +/* + * Copyright © 2021 Igalia S.L. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "tu_private.h" +#include "tu_perfetto.h" + +/* Including tu_private.h in tu_perfetto.cc doesn't work, so + * we need some helper methods to access tu_device. + */ + +struct tu_perfetto_state * +tu_device_get_perfetto_state(struct tu_device *dev) +{ + return &dev->perfetto; +} + +int +tu_device_get_timestamp(struct tu_device *dev, + uint64_t *ts) +{ + return tu_drm_get_timestamp(dev->physical_device, ts); +} + +uint32_t +tu_u_trace_flush_data_get_submit_id(const struct tu_u_trace_flush_data *data) +{ + return data->submission_id; +} diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_pipeline.c b/mesa 3D driver/src/freedreno/vulkan/tu_pipeline.c index fc4afd9e9f..7543e4503c 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_pipeline.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_pipeline.c @@ -257,6 +257,8 @@ struct tu_pipeline_builder uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1]; uint64_t binning_vs_iova; + uint32_t additional_cs_reserve_size; + struct tu_pvtmem_config pvtmem; bool rasterizer_discard; @@ -390,6 +392,32 @@ static const struct xs_config { }, }; +static uint32_t +tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs) +{ + const struct ir3_const_state *const_state = ir3_const_state(xs); + uint32_t base = const_state->offsets.immediate; + int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4); + + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, xs->constlen) - base; + + return MAX2(size, 0) * 4; +} + +/* We allocate fixed-length substreams for shader state, however some + * parts of the state may have unbound length. Their additional space + * requirements should be calculated here. 
+ */ +static uint32_t +tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs) +{ + uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs); + return size; +} + void tu6_emit_xs_config(struct tu_cs *cs, gl_shader_stage stage, /* xs->type, but xs may be NULL */ @@ -529,24 +557,19 @@ tu6_emit_xs(struct tu_cs *cs, const struct ir3_const_state *const_state = ir3_const_state(xs); uint32_t base = const_state->offsets.immediate; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); + unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs); - /* truncate size to avoid writing constants that shader - * does not use: - */ - size = MIN2(size + base, xs->constlen) - base; - - if (size > 0) { - tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4); + if (immediate_size > 0) { + tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size); tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) | - CP_LOAD_STATE6_0_NUM_UNIT(size)); + CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4)); tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - tu_cs_emit_array(cs, const_state->immediates, size * 4); + tu_cs_emit_array(cs, const_state->immediates, immediate_size); } if (const_state->constant_data_ubo != -1) { @@ -611,6 +634,12 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader, tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | A6XX_SP_CS_UNKNOWN_A9B1_UNK6); + if (cs->device->physical_device->info->a6xx.has_lpac) { + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1); + tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) | + A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6); + } + uint32_t local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); uint32_t work_group_id = @@ -625,6 +654,17 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader, A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); + + if (cs->device->physical_device->info->a6xx.has_lpac) { + tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2); + tu_cs_emit(cs, + A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | + A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); + } } static void @@ -657,11 +697,9 @@ tu6_emit_vs_system_values(struct tu_cs *cs, const uint32_t gs_primitiveid_regid = gs ? ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) : regid(63, 0); - const uint32_t hs_primitiveid_regid = hs ? + const uint32_t vs_primitiveid_regid = hs ? ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) : - regid(63, 0); - const uint32_t vs_primitiveid_regid = gs ? gs_primitiveid_regid : - hs_primitiveid_regid; + gs_primitiveid_regid; const uint32_t ds_primitiveid_regid = ds ? 
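tu_xs_get_immediates_packet_size_dwords() above clamps the immediates upload so it never writes past the shader's constlen. A small standalone example with made-up numbers shows the clamping arithmetic:

#include <assert.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   int base = 10;                       /* first immediate, in vec4 units */
   int size = DIV_ROUND_UP(24, 4);      /* 24 dwords of immediates -> 6 vec4s */
   size = MIN2(size + base, 12) - base; /* clamp to constlen = 12 -> 2 vec4s */
   assert(MAX2(size, 0) * 4 == 8);      /* 8 dwords are actually uploaded */
   return 0;
}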
ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) : regid(63, 0); @@ -1326,12 +1364,6 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); - if (VALIDREG(ij_regid[IJ_LINEAR_SAMPLE])) - tu_finishme("linear sample varying"); - - if (VALIDREG(ij_regid[IJ_LINEAR_CENTROID])) - tu_finishme("linear centroid varying"); - if (fs->num_sampler_prefetch > 0) { assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); /* also, it seems like ij_pix is *required* to be r0.x */ @@ -1392,16 +1424,17 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) else need_size = true; } - if (VALIDREG(ij_regid[IJ_LINEAR_PIXEL])) - need_size = true; tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1); tu_cs_emit(cs, CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | - COND(need_size, A6XX_GRAS_CNTL_SIZE) | - COND(need_size_persamp, A6XX_GRAS_CNTL_SIZE_PERSAMP) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | + COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | + COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2); @@ -1409,9 +1442,12 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs) CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | - COND(need_size, A6XX_RB_RENDER_CONTROL0_SIZE) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | + COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | - COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) | + COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | COND(fs->fragcoord_compmask != 0, A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); tu_cs_emit(cs, @@ -1934,7 +1970,7 @@ tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp static uint32_t tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info, - VkSampleCountFlagBits samples, + enum a5xx_line_mode line_mode, bool multiview) { uint32_t gras_su_cntl = 0; @@ -1953,8 +1989,7 @@ tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info, if (rast_info->depthBiasEnable) gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET; - if (samples > VK_SAMPLE_COUNT_1_BIT) - gras_su_cntl |= A6XX_GRAS_SU_CNTL_MSAA_ENABLE; + gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode); if (multiview) { gras_su_cntl |= @@ -2153,9 +2188,27 @@ tu_pipeline_allocate_cs(struct tu_device *dev, pvtmem_bytes = MAX2(pvtmem_bytes, builder->binning_variant->pvtmem_size); size += calc_pvtmem_size(dev, NULL, pvtmem_bytes) / 4; + + builder->additional_cs_reserve_size = 0; + for (unsigned i 
= 0; i < ARRAY_SIZE(builder->variants); i++) { + struct ir3_shader_variant *variant = builder->variants[i]; + if (variant) { + builder->additional_cs_reserve_size += + tu_xs_get_additional_cs_size_dwords(variant); + + if (variant->binning) { + builder->additional_cs_reserve_size += + tu_xs_get_additional_cs_size_dwords(variant->binning); + } + } + } + + size += builder->additional_cs_reserve_size; } else { size += compute->info.size / 4; size += calc_pvtmem_size(dev, NULL, compute->pvtmem_size) / 4; + + size += tu_xs_get_additional_cs_size_dwords(compute); } tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size); @@ -2563,11 +2616,11 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, tu6_emit_program_config(&prog_cs, builder); pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); - tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); + tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); tu6_emit_program(&prog_cs, builder, false, pipeline); pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); - tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); + tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs); tu6_emit_program(&prog_cs, builder, true, pipeline); pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); @@ -2596,7 +2649,11 @@ tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder, const struct ir3_shader_variant *vs = builder->variants[MESA_SHADER_VERTEX]; const struct ir3_shader_variant *bs = builder->binning_variant; - pipeline->num_vbs = vi_info->vertexBindingDescriptionCount; + /* Bindings may contain holes */ + for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) { + pipeline->num_vbs = + MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1); + } struct tu_cs vi_cs; tu_cs_begin_sub_stream(&pipeline->cs, @@ -2705,6 +2762,19 @@ tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder, if (depth_clip_state) depth_clip_disable = !depth_clip_state->depthClipEnable; + pipeline->line_mode = RECTANGULAR; + + if (tu6_primtype_line(pipeline->ia.primtype)) { + const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state = + vk_find_struct_const(rast_info->pNext, + PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT); + + if (rast_line_state && rast_line_state->lineRasterizationMode == + VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) { + pipeline->line_mode = BRESENHAM; + } + } + struct tu_cs cs; uint32_t cs_size = 9 + (builder->emit_msaa_state ? 11 : 0); pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size); @@ -2729,11 +2799,18 @@ tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder, A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f), A6XX_GRAS_SU_POINT_SIZE(1.0f)); + if (builder->device->physical_device->info->a6xx.has_shading_rate) { + tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00()); + tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10()); + tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20()); + tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30()); + } + /* If the sample count couldn't be determined from the subpass, we should emit it here. * This happens when the subpass doesn't use any color/depth attachment.
*/ if (builder->emit_msaa_state) - tu6_emit_msaa(&cs, builder->samples); + tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode); const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info = vk_find_struct_const(rast_info->pNext, @@ -2753,7 +2830,7 @@ tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder, } pipeline->gras_su_cntl = - tu6_gras_su_cntl(rast_info, builder->samples, builder->multiview_mask != 0); + tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0); if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2)) tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl)); @@ -3273,7 +3350,8 @@ tu_compute_pipeline_create(VkDevice device, pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64; struct tu_cs prog_cs; - tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs); + uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v); + tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs); tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova); pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs); @@ -3489,6 +3567,24 @@ tu_GetPipelineExecutableStatisticsKHR( } } + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "STP Count"); + WRITE_STR(stat->description, + "Number of STore Private instructions in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.stp_count; + } + + vk_outarray_append(&out, stat) { + WRITE_STR(stat->name, "LDP Count"); + WRITE_STR(stat->description, + "Number of LoaD Private instructions in the final generated " + "shader executable."); + stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; + stat->value.u64 = exe->stats.ldp_count; + } + return vk_outarray_status(&out); } diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_pipeline_cache.c b/mesa 3D driver/src/freedreno/vulkan/tu_pipeline_cache.c index b5b15542c1..5cfc79fbfb 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_pipeline_cache.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_pipeline_cache.c @@ -162,7 +162,7 @@ tu_pipeline_cache_grow(struct tu_pipeline_cache *cache) table = malloc(byte_size); if (table == NULL) - return vk_error(cache->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(cache, VK_ERROR_OUT_OF_HOST_MEMORY); cache->hash_table = table; cache->table_size = table_size; @@ -212,9 +212,9 @@ tu_pipeline_cache_load(struct tu_pipeline_cache *cache, return; if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE) return; - if (header.vendor_id != 0 /* TODO */) + if (header.vendor_id != 0x5143) return; - if (header.device_id != 0 /* TODO */) + if (header.device_id != device->physical_device->dev_id.chip_id) return; if (memcmp(header.uuid, device->physical_device->cache_uuid, VK_UUID_SIZE) != 0) @@ -257,7 +257,7 @@ tu_CreatePipelineCache(VkDevice _device, cache = vk_object_alloc(&device->vk, pAllocator, sizeof(*cache), VK_OBJECT_TYPE_PIPELINE_CACHE); if (cache == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (pAllocator) cache->alloc = *pAllocator; @@ -319,8 +319,8 @@ tu_GetPipelineCacheData(VkDevice _device, header = p; header->header_size = sizeof(*header); header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE; - header->vendor_id = 0 /* TODO */; - header->device_id = 0 /* TODO */; + 
header->vendor_id = 0x5143; + header->device_id = device->physical_device->dev_id.chip_id; memcpy(header->uuid, device->physical_device->cache_uuid, VK_UUID_SIZE); p += header->header_size; diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_private.h b/mesa 3D driver/src/freedreno/vulkan/tu_private.h index 9ae3d161ff..613424db7c 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_private.h +++ b/mesa 3D driver/src/freedreno/vulkan/tu_private.h @@ -53,12 +53,15 @@ #include "util/macros.h" #include "util/u_atomic.h" #include "util/u_dynarray.h" +#include "util/xmlconfig.h" +#include "util/perf/u_trace.h" #include "vk_alloc.h" #include "vk_debug_report.h" #include "vk_device.h" #include "vk_dispatch_table.h" #include "vk_extensions.h" #include "vk_instance.h" +#include "vk_log.h" #include "vk_physical_device.h" #include "vk_shader_module.h" #include "wsi_common.h" @@ -75,6 +78,7 @@ #include "tu_descriptor_set.h" #include "tu_util.h" +#include "tu_perfetto.h" /* Pre-declarations needed for WSI entrypoints */ struct wl_surface; @@ -90,12 +94,15 @@ typedef uint32_t xcb_window_t; #include "tu_entrypoints.h" #include "vk_format.h" +#include "vk_command_buffer.h" +#include "vk_queue.h" #define MAX_VBS 32 #define MAX_VERTEX_ATTRIBS 32 #define MAX_RTS 8 #define MAX_VSC_PIPES 32 #define MAX_VIEWPORTS 16 +#define MAX_VIEWPORT_SIZE (1 << 14) #define MAX_SCISSORS 16 #define MAX_DISCARD_RECTANGLES 4 #define MAX_PUSH_CONSTANTS_SIZE 128 @@ -130,25 +137,21 @@ typedef uint32_t xcb_window_t; struct tu_instance; VkResult -__vk_errorf(struct tu_instance *instance, - VkResult error, - bool force_print, - const char *file, - int line, - const char *format, - ...) PRINTFLIKE(6, 7); - -#define vk_error(instance, error) \ - __vk_errorf(instance, error, false, __FILE__, __LINE__, NULL); -#define vk_errorf(instance, error, format, ...) \ - __vk_errorf(instance, error, false, __FILE__, __LINE__, format, ##__VA_ARGS__); +__vk_startup_errorf(struct tu_instance *instance, + VkResult error, + bool force_print, + const char *file, + int line, + const char *format, + ...) PRINTFLIKE(6, 7); /* Prints startup errors if TU_DEBUG=startup is set or on a debug driver * build. */ #define vk_startup_errorf(instance, error, format, ...) \ - __vk_errorf(instance, error, instance->debug_flags & TU_DEBUG_STARTUP, \ - __FILE__, __LINE__, format, ##__VA_ARGS__) + __vk_startup_errorf(instance, error, \ + instance->debug_flags & TU_DEBUG_STARTUP, \ + __FILE__, __LINE__, format, ##__VA_ARGS__) void __tu_finishme(const char *file, int line, const char *format, ...) 
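The tu_pipeline_cache.c hunks above replace the old 0 /* TODO */ placeholders, so a cache blob is now only accepted if it was produced by the same driver for the same GPU: 0x5143 is Qualcomm's PCI vendor ID, and dev_id.chip_id identifies the Adreno revision. A minimal standalone sketch of the check being performed (the struct layout follows VkPipelineCacheHeaderVersionOne from the Vulkan spec; cache_header_matches and chip_id are illustrative names, not driver API):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Blob header layout mandated by the Vulkan spec
 * (VkPipelineCacheHeaderVersionOne). */
struct cache_header {
   uint32_t header_size;
   uint32_t header_version; /* VK_PIPELINE_CACHE_HEADER_VERSION_ONE == 1 */
   uint32_t vendor_id;      /* 0x5143: Qualcomm's PCI vendor ID */
   uint32_t device_id;      /* chip id of the GPU the blob was built for */
   uint8_t  uuid[16];       /* VK_UUID_SIZE */
};

/* Accept a blob only if it comes from the same driver build and GPU;
 * tu_pipeline_cache_load() simply returns (ignoring the blob) otherwise. */
static bool
cache_header_matches(const struct cache_header *h, uint32_t chip_id,
                     const uint8_t cache_uuid[16])
{
   return h->header_version == 1 &&
          h->vendor_id == 0x5143 &&
          h->device_id == chip_id &&
          memcmp(h->uuid, cache_uuid, 16) == 0;
}

The same two values are written back out in tu_GetPipelineCacheData() above, so blobs round-trip between runs on the same device.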
@@ -236,6 +239,7 @@ enum tu_debug_flags TU_DEBUG_PERFC = 1 << 9, TU_DEBUG_FLUSHALL = 1 << 10, TU_DEBUG_SYNCDRAW = 1 << 11, + TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12, }; struct tu_instance @@ -246,6 +250,9 @@ struct tu_instance int physical_device_count; struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES]; + struct driOptionCache dri_options; + struct driOptionCache available_dri_options; + enum tu_debug_flags debug_flags; }; @@ -291,15 +298,13 @@ struct tu_pipeline_key #define TU_MAX_QUEUE_FAMILIES 1 struct tu_syncobj; +struct tu_u_trace_syncobj; struct tu_queue { - struct vk_object_base base; + struct vk_queue vk; struct tu_device *device; - uint32_t queue_family_index; - int queue_idx; - VkDeviceQueueCreateFlags flags; uint32_t msm_queue_id; int fence; @@ -425,6 +430,14 @@ struct tu_device TU_GRALLOC_OTHER, } gralloc_type; #endif + + uint32_t submit_count; + + struct u_trace_context trace_context; + + #ifdef HAVE_PERFETTO + struct tu_perfetto_state perfetto; + #endif }; void tu_init_clear_blit_shaders(struct tu_device *dev); @@ -445,6 +458,12 @@ tu_device_is_lost(struct tu_device *device) VkResult tu_device_submit_deferred_locked(struct tu_device *dev); +VkResult +tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj); + +uint64_t +tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts); + enum tu_bo_alloc_flags { TU_BO_ALLOC_NO_FLAGS = 0, @@ -763,41 +782,23 @@ enum tu_cmd_access_mask { * the location of a cache entry in CCU, to avoid conflicts. We assume that * any access in a renderpass after or before an access by a transfer needs * a flush/invalidate, and use the _INCOHERENT variants to represent access - * by a transfer. + * by a renderpass. TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, - /* Accesses by the host */ - TU_ACCESS_HOST_READ = 1 << 10, - TU_ACCESS_HOST_WRITE = 1 << 11, - - /* Accesses by a GPU engine which bypasses any cache. e.g. writes via - * CP_EVENT_WRITE::BLIT and the CP are SYSMEM_WRITE. + /* Accesses which bypass any cache. e.g. writes via the host, + * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE. */ - TU_ACCESS_SYSMEM_READ = 1 << 12, - TU_ACCESS_SYSMEM_WRITE = 1 << 13, - - /* Set if a WFI is required. This can be required for: - * - 2D engine which (on some models) doesn't wait for flushes to complete - * before starting - * - CP draw indirect opcodes, where we need to wait for any flushes to - * complete but the CP implicitly waits for WFI's to complete and - * therefore we only need a WFI after the flushes. - */ - TU_ACCESS_WFI_READ = 1 << 14, - - /* Set if a CP_WAIT_FOR_ME is required due to the data being read by the CP - * without it waiting for any WFI. - */ - TU_ACCESS_WFM_READ = 1 << 15, + TU_ACCESS_SYSMEM_READ = 1 << 10, + TU_ACCESS_SYSMEM_WRITE = 1 << 11, /* Memory writes from the CP start in-order with draws and event writes, * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
*/ - TU_ACCESS_CP_WRITE = 1 << 16, + TU_ACCESS_CP_WRITE = 1 << 12, TU_ACCESS_READ = TU_ACCESS_UCHE_READ | @@ -805,10 +806,7 @@ enum tu_cmd_access_mask { TU_ACCESS_CCU_DEPTH_READ | TU_ACCESS_CCU_COLOR_INCOHERENT_READ | TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | - TU_ACCESS_HOST_READ | - TU_ACCESS_SYSMEM_READ | - TU_ACCESS_WFI_READ | - TU_ACCESS_WFM_READ, + TU_ACCESS_SYSMEM_READ, TU_ACCESS_WRITE = TU_ACCESS_UCHE_WRITE | @@ -816,7 +814,6 @@ enum tu_cmd_access_mask { TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | TU_ACCESS_CCU_DEPTH_WRITE | TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | - TU_ACCESS_HOST_WRITE | TU_ACCESS_SYSMEM_WRITE | TU_ACCESS_CP_WRITE, @@ -825,6 +822,57 @@ enum tu_cmd_access_mask { TU_ACCESS_WRITE, }; +/* Starting with a6xx, the pipeline is split into several "clusters" (really + * pipeline stages). Each stage has its own pair of register banks and can + * switch them independently, so that earlier stages can run ahead of later + * ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at + * the same time. + * + * As a result of this, we need to insert a WFI when an earlier stage depends + * on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any + * pending WFI's to complete before starting, and usually before reading + * indirect params even, so a WFI also acts as a full "pipeline stall". + * + * Note, the names of the stages come from CLUSTER_* in devcoredump. We + * include all the stages for completeness, even ones which do not read/write + * anything. + */ + +enum tu_stage { + /* This doesn't correspond to a cluster, but we need it for tracking + * indirect draw parameter reads etc. + */ + TU_STAGE_CP, + + /* - Fetch index buffer + * - Fetch vertex attributes, dispatch VS + */ + TU_STAGE_FE, + + /* Execute all geometry stages (VS thru GS) */ + TU_STAGE_SP_VS, + + /* Write to VPC, do primitive assembly. */ + TU_STAGE_PC_VS, + + /* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according + * to devcoredump so presumably this stage stalls for TU_STAGE_PS when + * early depth testing is enabled before dispatching fragments? However + * GRAS reads and writes LRZ directly. + */ + TU_STAGE_GRAS, + + /* Execute FS */ + TU_STAGE_SP_PS, + + /* - Fragment tests + * - Write color/depth + * - Streamout writes (???) + * - Varying interpolation (???) + */ + TU_STAGE_PS, +}; + enum tu_cmd_flush_bits { TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, @@ -845,18 +893,10 @@ enum tu_cmd_flush_bits { */ TU_CMD_FLAG_WAIT_MEM_WRITES, - TU_CMD_FLAG_GPU_INVALIDATE = + TU_CMD_FLAG_ALL_INVALIDATE = TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE, - - TU_CMD_FLAG_ALL_INVALIDATE = - TU_CMD_FLAG_GPU_INVALIDATE | - /* Treat the CP as a sort of "cache" which may need to be "invalidated" - * via waiting for UCHE/CCU flushes to land with WFI/WFM. 
- */ - TU_CMD_FLAG_WAIT_FOR_IDLE | - TU_CMD_FLAG_WAIT_FOR_ME, }; /* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty @@ -911,7 +951,6 @@ struct tu_lrz_state }; struct tu_vs_params { - uint32_t params_offset; uint32_t vertex_offset; uint32_t first_instance; }; @@ -981,11 +1020,14 @@ struct tu_cmd_state const struct tu_framebuffer *framebuffer; VkRect2D render_area; + const struct tu_image_view **attachments; + bool xfb_used; bool has_tess; bool has_subpass_predication; bool predication_active; bool disable_gmem; + enum a5xx_line_mode line_mode; struct tu_lrz_state lrz; @@ -1015,13 +1057,17 @@ enum tu_cmd_buffer_status struct tu_cmd_buffer { - struct vk_object_base base; + struct vk_command_buffer vk; struct tu_device *device; struct tu_cmd_pool *pool; struct list_head pool_link; + struct u_trace trace; + struct u_trace_iterator trace_renderpass_start; + struct u_trace_iterator trace_renderpass_end; + VkCommandBufferUsageFlags usage_flags; VkCommandBufferLevel level; enum tu_cmd_buffer_status status; @@ -1173,6 +1219,8 @@ struct tu_pipeline bool rb_depth_cntl_disable; + enum a5xx_line_mode line_mode; + /* draw states for the pipeline */ struct tu_draw_state load_state, rast_state, blend_state; @@ -1242,7 +1290,8 @@ tu6_emit_depth_bias(struct tu_cs *cs, float clamp, float slope_factor); -void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples); +void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples, + enum a5xx_line_mode line_mode); void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); @@ -1289,8 +1338,8 @@ struct tu_image_view; void tu_resolve_sysmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - struct tu_image_view *src, - struct tu_image_view *dst, + const struct tu_image_view *src, + const struct tu_image_view *dst, uint32_t layer_mask, uint32_t layers, const VkRect2D *rect); @@ -1324,26 +1373,25 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, uint32_t a, uint32_t gmem_a); -enum tu_supported_formats { - FMT_VERTEX = 1, - FMT_TEXTURE = 2, - FMT_COLOR = 4, -}; +enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); struct tu_native_format { enum a6xx_format fmt : 8; enum a3xx_color_swap swap : 8; enum a6xx_tile_mode tile_mode : 8; - enum tu_supported_formats supported : 8; }; +enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); +bool tu6_format_vtx_supported(VkFormat format); struct tu_native_format tu6_format_vtx(VkFormat format); -struct tu_native_format tu6_format_color(VkFormat format, enum a6xx_tile_mode tile_mode); -struct tu_native_format tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode); +bool tu6_format_color_supported(enum pipe_format format); +struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode); +bool tu6_format_texture_supported(enum pipe_format format); +struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode); static inline enum a6xx_format -tu6_base_format(VkFormat format) +tu6_base_format(enum pipe_format format) { /* note: tu6_format_color doesn't care about tiling for .fmt field */ return tu6_format_color(format, TILE6_LINEAR).fmt; @@ -1397,43 +1445,20 @@ tu_get_levelCount(const struct tu_image *image, : range->levelCount; } +enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane); + +uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask); + +enum pipe_format tu_format_for_aspect(enum pipe_format format, + 
VkImageAspectFlags aspect_mask); + struct tu_image_view { struct vk_object_base base; struct tu_image *image; /**< VkImageViewCreateInfo::image */ - uint64_t base_addr; - uint64_t ubwc_addr; - uint32_t layer_size; - uint32_t ubwc_layer_size; - - /* used to determine if fast gmem store path can be used */ - VkExtent2D extent; - bool need_y2_align; - - bool ubwc_enabled; - - uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; - - /* Descriptor for use as a storage image as opposed to a sampled image. - * This has a few differences for cube maps (e.g. type). - */ - uint32_t storage_descriptor[A6XX_TEX_CONST_DWORDS]; - - /* pre-filled register values */ - uint32_t PITCH; - uint32_t FLAG_BUFFER_PITCH; - - uint32_t RB_MRT_BUF_INFO; - uint32_t SP_FS_MRT_REG; - - uint32_t SP_PS_2D_SRC_INFO; - uint32_t SP_PS_2D_SRC_SIZE; - - uint32_t RB_2D_DST_INFO; - - uint32_t RB_BLIT_DST_INFO; + struct fdl6_view view; /* for d32s8 separate stencil */ uint64_t stencil_base_addr; @@ -1460,19 +1485,19 @@ struct tu_sampler { }; void -tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); +tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); void -tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src); +tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src); void -tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); +tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer); void tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); #define tu_image_view_stencil(iview, x) \ - ((iview->x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) + ((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT)) VkResult tu_gralloc_info(struct tu_device *device, @@ -1491,6 +1516,9 @@ tu_image_view_init(struct tu_image_view *iview, const VkImageViewCreateInfo *pCreateInfo, bool limited_z24s8); +bool +tiling_possible(VkFormat format); + bool ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage, const struct fd_dev_info *info, VkSampleCountFlagBits samples); @@ -1546,6 +1574,7 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb, struct tu_subpass_barrier { VkPipelineStageFlags src_stage_mask; + VkPipelineStageFlags dst_stage_mask; VkAccessFlags src_access_mask; VkAccessFlags dst_access_mask; bool incoherent_ccu_color, incoherent_ccu_depth; @@ -1670,6 +1699,10 @@ tu_physical_device_init(struct tu_physical_device *device, VkResult tu_enumerate_devices(struct tu_instance *instance); +int +tu_drm_get_timestamp(struct tu_physical_device *device, + uint64_t *ts); + int tu_drm_submitqueue_new(const struct tu_device *dev, int priority, @@ -1684,60 +1717,87 @@ tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_ int tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync); -#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \ - \ - static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ - { \ - return (struct __tu_type *) _handle; \ - } \ - \ - static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ - { \ - return (__VkType) _obj; \ - } -#define TU_DEFINE_NONDISP_HANDLE_CASTS(__tu_type, __VkType) \ - \ - static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \ - { \ - return (struct __tu_type *) 
(uintptr_t) _handle; \ - } \ - \ - static inline __VkType __tu_type##_to_handle(struct __tu_type *_obj) \ - { \ - return (__VkType)(uintptr_t) _obj; \ - } +void +tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream, + void *ts_from, uint32_t from_offset, + void *ts_to, uint32_t to_offset, + uint32_t count); + + +VkResult +tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs, + struct u_trace **trace_copy); + +struct tu_u_trace_cmd_data +{ + struct tu_cs *timestamp_copy_cs; + struct u_trace *trace; +}; + +void +tu_u_trace_cmd_data_finish(struct tu_device *device, + struct tu_u_trace_cmd_data *trace_data, + uint32_t entry_count); + +struct tu_u_trace_flush_data +{ + uint32_t submission_id; + struct tu_u_trace_syncobj *syncobj; + uint32_t trace_count; + struct tu_u_trace_cmd_data *cmd_trace_data; +}; #define TU_FROM_HANDLE(__tu_type, __name, __handle) \ - struct __tu_type *__name = __tu_type##_from_handle(__handle) + VK_FROM_HANDLE(__tu_type, __name, __handle) -TU_DEFINE_HANDLE_CASTS(tu_cmd_buffer, VkCommandBuffer) -TU_DEFINE_HANDLE_CASTS(tu_device, VkDevice) -TU_DEFINE_HANDLE_CASTS(tu_instance, VkInstance) -TU_DEFINE_HANDLE_CASTS(tu_physical_device, VkPhysicalDevice) -TU_DEFINE_HANDLE_CASTS(tu_queue, VkQueue) +VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer, + VK_OBJECT_TYPE_COMMAND_BUFFER) +VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE) +VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance, + VK_OBJECT_TYPE_INSTANCE) +VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice, + VK_OBJECT_TYPE_PHYSICAL_DEVICE) +VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, VkCommandPool) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, VkBuffer) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, VkBufferView) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, VkDescriptorPool) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, VkDescriptorSet) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, - VkDescriptorSetLayout) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, - VkDescriptorUpdateTemplate) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, VkDeviceMemory) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_event, VkEvent) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, VkFramebuffer) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image, VkImage) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, VkImageView); -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, VkPipelineCache) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, VkPipeline) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler) -TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, VkSamplerYcbcrConversion) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, base, VkCommandPool, + VK_OBJECT_TYPE_COMMAND_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer, + VK_OBJECT_TYPE_BUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView, + VK_OBJECT_TYPE_BUFFER_VIEW) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool, + VK_OBJECT_TYPE_DESCRIPTOR_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet, + VK_OBJECT_TYPE_DESCRIPTOR_SET) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base, + VkDescriptorSetLayout, + 
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base, + VkDescriptorUpdateTemplate, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory, + VK_OBJECT_TYPE_DEVICE_MEMORY) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer, + VK_OBJECT_TYPE_FRAMEBUFFER) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, base, VkImage, VK_OBJECT_TYPE_IMAGE) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, base, VkImageView, + VK_OBJECT_TYPE_IMAGE_VIEW); +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache, + VK_OBJECT_TYPE_PIPELINE_CACHE) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline, + VK_OBJECT_TYPE_PIPELINE) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout, + VK_OBJECT_TYPE_PIPELINE_LAYOUT) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool, + VK_OBJECT_TYPE_QUERY_POOL) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass, + VK_OBJECT_TYPE_RENDER_PASS) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler, + VK_OBJECT_TYPE_SAMPLER) +VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion, + VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION) /* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */ #define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x)) diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_query.c b/mesa 3D driver/src/freedreno/vulkan/tu_query.c index 11be8f9dcf..48da4f1ae2 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_query.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_query.c @@ -265,7 +265,7 @@ tu_CreateQueryPool(VkDevice _device, vk_object_alloc(&device->vk, pAllocator, pool_size, VK_OBJECT_TYPE_QUERY_POOL); if (!pool) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id, @@ -426,7 +426,7 @@ wait_for_available(struct tu_device *device, struct tu_query_pool *pool, if (query_is_available(slot)) return VK_SUCCESS; } - return vk_error(device->instance, VK_TIMEOUT); + return vk_error(device, VK_TIMEOUT); } /* Writes a query value to a buffer from the CPU. 
*/ diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_shader.c b/mesa 3D driver/src/freedreno/vulkan/tu_shader.c index 8060fd6831..a0baf0921b 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_shader.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_shader.c @@ -38,8 +38,6 @@ tu_spirv_to_nir(struct tu_device *dev, { /* TODO these are made-up */ const struct spirv_to_nir_options spirv_options = { - .frag_coord_is_sysval = true, - .ubo_addr_format = nir_address_format_vec2_index_32bit_offset, .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset, @@ -112,6 +110,11 @@ tu_spirv_to_nir(struct tu_device *dev, assert(nir->info.stage == stage); nir_validate_shader(nir, "after spirv_to_nir"); + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .point_coord = true, + }; + NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + if (unlikely(dev->physical_device->instance->debug_flags & TU_DEBUG_NIR)) { fprintf(stderr, "translated nir:\n"); nir_print_shader(nir, stderr); @@ -342,7 +345,7 @@ build_bindless(nir_builder *b, nir_deref_instr *deref, bool is_sampler, const struct glsl_type *glsl_type = glsl_without_array(var->type); uint32_t idx = var->data.index * 2; - BITSET_SET_RANGE(b->shader->info.textures_used, idx * 2, ((idx * 2) + (bind_layout->array_size * 2)) - 1); + BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx * 2, ((idx * 2) + (bind_layout->array_size * 2)) - 1); /* D24S8 workaround: stencil of D24S8 will be sampled as uint */ if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT) diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_tracepoints.py b/mesa 3D driver/src/freedreno/vulkan/tu_tracepoints.py new file mode 100644 index 0000000000..0017f1bb7f --- /dev/null +++ b/mesa 3D driver/src/freedreno/vulkan/tu_tracepoints.py @@ -0,0 +1,158 @@ +# +# Copyright © 2021 Igalia S.L. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +import argparse +import sys + +# +# TODO can we do this with less boilerplate? 
+# +parser = argparse.ArgumentParser() +parser.add_argument('-p', '--import-path', required=True) +parser.add_argument('--utrace-src', required=True) +parser.add_argument('--utrace-hdr', required=True) +parser.add_argument('--perfetto-hdr', required=True) +args = parser.parse_args() +sys.path.insert(0, args.import_path) + + +from u_trace import Header, HeaderScope +from u_trace import ForwardDecl +from u_trace import Tracepoint +from u_trace import TracepointArg as Arg +from u_trace import TracepointArgStruct as ArgStruct +from u_trace import utrace_generate +from u_trace import utrace_generate_perfetto_utils + +# +# Tracepoint definitions: +# + +Header('util/u_dump.h') +Header('vk_format.h') +Header('freedreno/vulkan/tu_private.h', scope=HeaderScope.SOURCE) + +ForwardDecl('struct tu_device') + +Tracepoint('start_render_pass', + tp_perfetto='tu_start_render_pass' +) +Tracepoint('end_render_pass', + args=[ArgStruct(type='const struct tu_framebuffer *', var='fb')], + tp_struct=[Arg(type='uint16_t', name='width', var='fb->width', c_format='%u'), + Arg(type='uint16_t', name='height', var='fb->height', c_format='%u'), + Arg(type='uint8_t', name='MRTs', var='fb->attachment_count', c_format='%u'), + # Arg(type='uint8_t', name='samples', var='fb->samples', c_format='%u'), + Arg(type='uint16_t', name='numberOfBins', var='fb->tile_count.width * fb->tile_count.height', c_format='%u'), + Arg(type='uint16_t', name='binWidth', var='fb->tile0.width', c_format='%u'), + Arg(type='uint16_t', name='binHeight', var='fb->tile0.height', c_format='%u')], + tp_perfetto='tu_end_render_pass') + +Tracepoint('start_binning_ib', + tp_perfetto='tu_start_binning_ib') +Tracepoint('end_binning_ib', + tp_perfetto='tu_end_binning_ib') + +Tracepoint('start_resolve', + tp_perfetto='tu_start_resolve') +Tracepoint('end_resolve', + tp_perfetto='tu_end_resolve') + +Tracepoint('start_draw_ib_sysmem', + tp_perfetto='tu_start_draw_ib_sysmem') +Tracepoint('end_draw_ib_sysmem', + tp_perfetto='tu_end_draw_ib_sysmem') + +Tracepoint('start_draw_ib_gmem', + tp_perfetto='tu_start_draw_ib_gmem') +Tracepoint('end_draw_ib_gmem', + tp_perfetto='tu_end_draw_ib_gmem') + +Tracepoint('start_gmem_clear', + tp_perfetto='tu_start_gmem_clear') +Tracepoint('end_gmem_clear', + args=[Arg(type='enum VkFormat', var='format', c_format='%s', to_prim_type='vk_format_description({})->short_name'), + Arg(type='uint8_t', var='samples', c_format='%u')], + tp_perfetto='tu_end_gmem_clear') + +Tracepoint('start_sysmem_clear', + tp_perfetto='tu_start_sysmem_clear') +Tracepoint('end_sysmem_clear', + args=[Arg(type='enum VkFormat', var='format', c_format='%s', to_prim_type='vk_format_description({})->short_name'), + Arg(type='uint8_t', var='uses_3d_ops', c_format='%u'), + Arg(type='uint8_t', var='samples', c_format='%u')], + tp_perfetto='tu_end_sysmem_clear') + +Tracepoint('start_sysmem_clear_all', + tp_perfetto='tu_start_sysmem_clear_all') +Tracepoint('end_sysmem_clear_all', + args=[Arg(type='uint8_t', var='mrt_count', c_format='%u'), + Arg(type='uint8_t', var='rect_count', c_format='%u')], + tp_perfetto='tu_end_sysmem_clear_all') + +Tracepoint('start_gmem_load', + tp_perfetto='tu_start_gmem_load') +Tracepoint('end_gmem_load', + args=[Arg(type='enum VkFormat', var='format', c_format='%s', to_prim_type='vk_format_description({})->short_name'), + Arg(type='uint8_t', var='force_load', c_format='%u')], + tp_perfetto='tu_end_gmem_load') + +Tracepoint('start_gmem_store', + tp_perfetto='tu_start_gmem_store') +Tracepoint('end_gmem_store', + args=[Arg(type='enum VkFormat', 
var='format', c_format='%s', to_prim_type='vk_format_description({})->short_name'), + Arg(type='uint8_t', var='fast_path', c_format='%u'), + Arg(type='uint8_t', var='unaligned', c_format='%u')], + tp_perfetto='tu_end_gmem_store') + +Tracepoint('start_sysmem_resolve', + tp_perfetto='tu_start_sysmem_resolve') +Tracepoint('end_sysmem_resolve', + args=[Arg(type='enum VkFormat', var='format', c_format='%s', to_prim_type='vk_format_description({})->short_name')], + tp_perfetto='tu_end_sysmem_resolve') + +Tracepoint('start_blit', + tp_perfetto='tu_start_blit', +) +Tracepoint('end_blit', + # TODO: add source megapixels count and target megapixels count arguments + args=[Arg(type='uint8_t', var='uses_3d_blit', c_format='%u'), + Arg(type='enum VkFormat', var='src_format', c_format='%s', to_prim_type='vk_format_description({})->short_name'), + Arg(type='enum VkFormat', var='dst_format', c_format='%s', to_prim_type='vk_format_description({})->short_name'), + Arg(type='uint8_t', var='layers', c_format='%u')], + tp_perfetto='tu_end_blit') + +Tracepoint('start_compute', + tp_perfetto='tu_start_compute') +Tracepoint('end_compute', + args=[Arg(type='uint8_t', var='indirect', c_format='%u'), + Arg(type='uint16_t', var='local_size_x', c_format='%u'), + Arg(type='uint16_t', var='local_size_y', c_format='%u'), + Arg(type='uint16_t', var='local_size_z', c_format='%u'), + Arg(type='uint16_t', var='num_groups_x', c_format='%u'), + Arg(type='uint16_t', var='num_groups_y', c_format='%u'), + Arg(type='uint16_t', var='num_groups_z', c_format='%u')], + tp_perfetto='tu_end_compute') + +utrace_generate(cpath=args.utrace_src, hpath=args.utrace_hdr, ctx_param='struct tu_device *dev') +utrace_generate_perfetto_utils(hpath=args.perfetto_hdr) diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_util.c b/mesa 3D driver/src/freedreno/vulkan/tu_util.c index b13d1397a5..cf6c0bfea8 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_util.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_util.c @@ -47,13 +47,13 @@ void PRINTFLIKE(3, 4) } VkResult -__vk_errorf(struct tu_instance *instance, - VkResult error, - bool always_print, - const char *file, - int line, - const char *format, - ...) +__vk_startup_errorf(struct tu_instance *instance, + VkResult error, + bool always_print, + const char *file, + int line, + const char *format, + ...) 
{ va_list ap; char buffer[256]; diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_util.h b/mesa 3D driver/src/freedreno/vulkan/tu_util.h index 9c42cfdbad..2ad7f86e36 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_util.h +++ b/mesa 3D driver/src/freedreno/vulkan/tu_util.h @@ -85,6 +85,20 @@ tu6_rop(VkLogicOp op) return lookup[op]; } +static inline bool +tu6_primtype_line(enum pc_di_primtype type) +{ + switch(type) { + case DI_PT_LINELIST: + case DI_PT_LINESTRIP: + case DI_PT_LINE_ADJ: + case DI_PT_LINESTRIP_ADJ: + return true; + default: + return false; + } +} + static inline enum pc_di_primtype tu6_primtype(VkPrimitiveTopology topology) { diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_wsi.c b/mesa 3D driver/src/freedreno/vulkan/tu_wsi.c index 4101f9a0f0..e7b0440f74 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_wsi.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_wsi.c @@ -45,12 +45,14 @@ tu_wsi_init(struct tu_physical_device *physical_device) tu_physical_device_to_handle(physical_device), tu_wsi_proc_addr, &physical_device->instance->vk.alloc, - physical_device->master_fd, NULL, + physical_device->master_fd, + &physical_device->instance->dri_options, false); if (result != VK_SUCCESS) return result; physical_device->wsi_device.supports_modifiers = true; + physical_device->vk.wsi_device = &physical_device->wsi_device; return VK_SUCCESS; } @@ -58,171 +60,11 @@ tu_wsi_init(struct tu_physical_device *physical_device) void tu_wsi_finish(struct tu_physical_device *physical_device) { + physical_device->vk.wsi_device = NULL; wsi_device_finish(&physical_device->wsi_device, &physical_device->instance->vk.alloc); } -VKAPI_ATTR void VKAPI_CALL -tu_DestroySurfaceKHR(VkInstance _instance, - VkSurfaceKHR _surface, - const VkAllocationCallbacks *pAllocator) -{ - TU_FROM_HANDLE(tu_instance, instance, _instance); - ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface); - - vk_free2(&instance->vk.alloc, pAllocator, surface); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceSurfaceSupportKHR(VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - VkSurfaceKHR surface, - VkBool32 *pSupported) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_surface_support( - &device->wsi_device, queueFamilyIndex, surface, pSupported); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceSurfaceCapabilitiesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - VkSurfaceCapabilitiesKHR *pSurfaceCapabilities) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_surface_capabilities(&device->wsi_device, surface, - pSurfaceCapabilities); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceSurfaceCapabilities2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, - VkSurfaceCapabilities2KHR *pSurfaceCapabilities) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_surface_capabilities2( - &device->wsi_device, pSurfaceInfo, pSurfaceCapabilities); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceSurfaceCapabilities2EXT( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - VkSurfaceCapabilities2EXT *pSurfaceCapabilities) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_surface_capabilities2ext( - &device->wsi_device, surface, pSurfaceCapabilities); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceSurfaceFormatsKHR(VkPhysicalDevice physicalDevice, - 
VkSurfaceKHR surface, - uint32_t *pSurfaceFormatCount, - VkSurfaceFormatKHR *pSurfaceFormats) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats( - &device->wsi_device, surface, pSurfaceFormatCount, pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceSurfaceFormats2KHR( - VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceSurfaceInfo2KHR *pSurfaceInfo, - uint32_t *pSurfaceFormatCount, - VkSurfaceFormat2KHR *pSurfaceFormats) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo, - pSurfaceFormatCount, - pSurfaceFormats); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceSurfacePresentModesKHR(VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t *pPresentModeCount, - VkPresentModeKHR *pPresentModes) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_surface_present_modes( - &device->wsi_device, surface, pPresentModeCount, pPresentModes); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_CreateSwapchainKHR(VkDevice _device, - const VkSwapchainCreateInfoKHR *pCreateInfo, - const VkAllocationCallbacks *pAllocator, - VkSwapchainKHR *pSwapchain) -{ - TU_FROM_HANDLE(tu_device, device, _device); - const VkAllocationCallbacks *alloc; - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - return wsi_common_create_swapchain(&device->physical_device->wsi_device, - tu_device_to_handle(device), - pCreateInfo, alloc, pSwapchain); -} - -VKAPI_ATTR void VKAPI_CALL -tu_DestroySwapchainKHR(VkDevice _device, - VkSwapchainKHR swapchain, - const VkAllocationCallbacks *pAllocator) -{ - TU_FROM_HANDLE(tu_device, device, _device); - const VkAllocationCallbacks *alloc; - - if (pAllocator) - alloc = pAllocator; - else - alloc = &device->vk.alloc; - - wsi_common_destroy_swapchain(_device, swapchain, alloc); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetSwapchainImagesKHR(VkDevice device, - VkSwapchainKHR swapchain, - uint32_t *pSwapchainImageCount, - VkImage *pSwapchainImages) -{ - return wsi_common_get_images(swapchain, pSwapchainImageCount, - pSwapchainImages); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_AcquireNextImageKHR(VkDevice device, - VkSwapchainKHR swapchain, - uint64_t timeout, - VkSemaphore semaphore, - VkFence fence, - uint32_t *pImageIndex) -{ - VkAcquireNextImageInfoKHR acquire_info = { - .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR, - .swapchain = swapchain, - .timeout = timeout, - .semaphore = semaphore, - .fence = fence, - .deviceMask = 0, - }; - - return tu_AcquireNextImage2KHR(device, &acquire_info, pImageIndex); -} - VKAPI_ATTR VkResult VKAPI_CALL tu_AcquireNextImage2KHR(VkDevice _device, const VkAcquireNextImageInfoKHR *pAcquireInfo, @@ -247,42 +89,11 @@ VKAPI_ATTR VkResult VKAPI_CALL tu_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo) { TU_FROM_HANDLE(tu_queue, queue, _queue); + + u_trace_context_process(&queue->device->trace_context, true); + return wsi_common_queue_present( &queue->device->physical_device->wsi_device, - tu_device_to_handle(queue->device), _queue, queue->queue_family_index, + tu_device_to_handle(queue->device), _queue, queue->vk.queue_family_index, pPresentInfo); } - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetDeviceGroupPresentCapabilitiesKHR( - VkDevice device, VkDeviceGroupPresentCapabilitiesKHR *pCapabilities) -{ - memset(pCapabilities->presentMask, 0, sizeof(pCapabilities->presentMask)); - 
pCapabilities->presentMask[0] = 0x1; - pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetDeviceGroupSurfacePresentModesKHR( - VkDevice device, - VkSurfaceKHR surface, - VkDeviceGroupPresentModeFlagsKHR *pModes) -{ - *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR; - - return VK_SUCCESS; -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDevicePresentRectanglesKHR(VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t *pRectCount, - VkRect2D *pRects) -{ - TU_FROM_HANDLE(tu_physical_device, device, physicalDevice); - - return wsi_common_get_present_rectangles(&device->wsi_device, surface, - pRectCount, pRects); -} diff --git a/mesa 3D driver/src/freedreno/vulkan/tu_wsi_display.c b/mesa 3D driver/src/freedreno/vulkan/tu_wsi_display.c index 877a02a159..fce8d7a7d6 100644 --- a/mesa 3D driver/src/freedreno/vulkan/tu_wsi_display.c +++ b/mesa 3D driver/src/freedreno/vulkan/tu_wsi_display.c @@ -34,231 +34,8 @@ #include "util/debug.h" #include "wsi_common_display.h" -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPropertiesKHR *properties) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceDisplayProperties2KHR(VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayProperties2KHR *properties) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_properties2( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceDisplayPlanePropertiesKHR( - VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPlanePropertiesKHR *properties) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetPhysicalDeviceDisplayPlaneProperties2KHR( - VkPhysicalDevice physical_device, - uint32_t *property_count, - VkDisplayPlaneProperties2KHR *properties) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_get_physical_device_display_plane_properties2( - physical_device, - &pdevice->wsi_device, - property_count, - properties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device, - uint32_t plane_index, - uint32_t *display_count, - VkDisplayKHR *displays) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_get_display_plane_supported_displays( - physical_device, - &pdevice->wsi_device, - plane_index, - display_count, - displays); -} - - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - uint32_t *property_count, - VkDisplayModePropertiesKHR *properties) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_get_display_mode_properties(physical_device, - &pdevice->wsi_device, - display, - property_count, - properties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetDisplayModeProperties2KHR(VkPhysicalDevice 
physical_device, - VkDisplayKHR display, - uint32_t *property_count, - VkDisplayModeProperties2KHR *properties) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_get_display_mode_properties2(physical_device, - &pdevice->wsi_device, - display, - property_count, - properties); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_CreateDisplayModeKHR(VkPhysicalDevice physical_device, - VkDisplayKHR display, - const VkDisplayModeCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, - VkDisplayModeKHR *mode) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_display_create_display_mode(physical_device, - &pdevice->wsi_device, - display, - create_info, - allocator, - mode); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device, - VkDisplayModeKHR mode_khr, - uint32_t plane_index, - VkDisplayPlaneCapabilitiesKHR *capabilities) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_get_display_plane_capabilities(physical_device, - &pdevice->wsi_device, - mode_khr, - plane_index, - capabilities); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetDisplayPlaneCapabilities2KHR(VkPhysicalDevice physical_device, - const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo, - VkDisplayPlaneCapabilities2KHR *capabilities) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_get_display_plane_capabilities2(physical_device, - &pdevice->wsi_device, - pDisplayPlaneInfo, - capabilities); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_CreateDisplayPlaneSurfaceKHR( - VkInstance _instance, - const VkDisplaySurfaceCreateInfoKHR *create_info, - const VkAllocationCallbacks *allocator, - VkSurfaceKHR *surface) -{ - TU_FROM_HANDLE(tu_instance, instance, _instance); - const VkAllocationCallbacks *alloc; - - if (allocator) - alloc = allocator; - else - alloc = &instance->vk.alloc; - - return wsi_create_display_surface(_instance, alloc, - create_info, surface); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_ReleaseDisplayEXT(VkPhysicalDevice physical_device, - VkDisplayKHR display) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_release_display(physical_device, - &pdevice->wsi_device, - display); -} - -#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT -VKAPI_ATTR VkResult VKAPI_CALL -tu_AcquireXlibDisplayEXT(VkPhysicalDevice physical_device, - Display *dpy, - VkDisplayKHR display) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_acquire_xlib_display(physical_device, - &pdevice->wsi_device, - dpy, - display); -} - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetRandROutputDisplayEXT(VkPhysicalDevice physical_device, - Display *dpy, - RROutput output, - VkDisplayKHR *display) -{ - TU_FROM_HANDLE(tu_physical_device, pdevice, physical_device); - - return wsi_get_randr_output_display(physical_device, - &pdevice->wsi_device, - dpy, - output, - display); -} -#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */ - /* VK_EXT_display_control */ -VKAPI_ATTR VkResult VKAPI_CALL -tu_DisplayPowerControlEXT(VkDevice _device, - VkDisplayKHR display, - const VkDisplayPowerInfoEXT *display_power_info) -{ - TU_FROM_HANDLE(tu_device, device, _device); - - return wsi_display_power_control(_device, - &device->physical_device->wsi_device, - display, - display_power_info); -} - VKAPI_ATTR VkResult VKAPI_CALL tu_RegisterDeviceEventEXT(VkDevice _device, const VkDeviceEventInfoEXT *device_event_info, @@ -333,19 +110,3 @@ tu_RegisterDisplayEventEXT(VkDevice 
_device, return ret; } - -VKAPI_ATTR VkResult VKAPI_CALL -tu_GetSwapchainCounterEXT(VkDevice _device, - VkSwapchainKHR swapchain, - VkSurfaceCounterFlagBitsEXT flag_bits, - uint64_t *value) -{ - TU_FROM_HANDLE(tu_device, device, _device); - - return wsi_get_swapchain_counter(_device, - &device->physical_device->wsi_device, - swapchain, - flag_bits, - value); -} - diff --git a/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.c b/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.c index 42b64b34c1..97b4da0afd 100644 --- a/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.c @@ -74,6 +74,8 @@ struct cso_context { boolean has_compute_shader; boolean has_streamout; + uint32_t max_fs_samplerviews : 16; + unsigned saved_state; /**< bitmask of CSO_BIT_x flags */ unsigned saved_compute_state; /**< bitmask of CSO_BIT_COMPUTE_x flags */ @@ -283,6 +285,9 @@ cso_create_context(struct pipe_context *pipe, unsigned flags) ctx->has_streamout = TRUE; } + ctx->max_fs_samplerviews = pipe->screen->get_shader_param(pipe->screen, PIPE_SHADER_FRAGMENT, + PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS); + ctx->max_sampler_seen = -1; return ctx; } @@ -339,7 +344,7 @@ void cso_unbind_context(struct cso_context *ctx) ctx->pipe->bind_sampler_states(ctx->pipe, sh, 0, maxsam, zeros); } if (maxview > 0) { - ctx->pipe->set_sampler_views(ctx->pipe, sh, 0, maxview, 0, views); + ctx->pipe->set_sampler_views(ctx->pipe, sh, 0, maxview, 0, false, views); } if (maxssbo > 0) { ctx->pipe->set_shader_buffers(ctx->pipe, sh, 0, maxssbo, ssbos, 0); @@ -1479,46 +1484,60 @@ cso_save_state(struct cso_context *cso, unsigned state_mask) * Restore the state which was saved by cso_save_state(). */ void -cso_restore_state(struct cso_context *cso) +cso_restore_state(struct cso_context *cso, unsigned unbind) { unsigned state_mask = cso->saved_state; assert(state_mask); - if (state_mask & CSO_BIT_BLEND) - cso_restore_blend(cso); if (state_mask & CSO_BIT_DEPTH_STENCIL_ALPHA) cso_restore_depth_stencil_alpha(cso); - if (state_mask & CSO_BIT_FRAGMENT_SAMPLERS) - cso_restore_fragment_samplers(cso); + if (state_mask & CSO_BIT_STENCIL_REF) + cso_restore_stencil_ref(cso); if (state_mask & CSO_BIT_FRAGMENT_SHADER) cso_restore_fragment_shader(cso); - if (state_mask & CSO_BIT_FRAMEBUFFER) - cso_restore_framebuffer(cso); if (state_mask & CSO_BIT_GEOMETRY_SHADER) cso_restore_geometry_shader(cso); - if (state_mask & CSO_BIT_MIN_SAMPLES) - cso_restore_min_samples(cso); + if (state_mask & CSO_BIT_TESSEVAL_SHADER) + cso_restore_tesseval_shader(cso); + if (state_mask & CSO_BIT_TESSCTRL_SHADER) + cso_restore_tessctrl_shader(cso); + if (state_mask & CSO_BIT_VERTEX_SHADER) + cso_restore_vertex_shader(cso); + if (unbind & CSO_UNBIND_FS_SAMPLERVIEWS) + cso->pipe->set_sampler_views(cso->pipe, PIPE_SHADER_FRAGMENT, 0, 0, + cso->max_fs_samplerviews, false, NULL); + if (unbind & CSO_UNBIND_FS_SAMPLERVIEW0) + cso->pipe->set_sampler_views(cso->pipe, PIPE_SHADER_FRAGMENT, 0, 0, + 1, false, NULL); + if (state_mask & CSO_BIT_FRAGMENT_SAMPLERS) + cso_restore_fragment_samplers(cso); + if (unbind & CSO_UNBIND_FS_IMAGE0) + cso->pipe->set_shader_images(cso->pipe, PIPE_SHADER_FRAGMENT, 0, 0, 1, NULL); + if (state_mask & CSO_BIT_FRAMEBUFFER) + cso_restore_framebuffer(cso); + if (state_mask & CSO_BIT_BLEND) + cso_restore_blend(cso); if (state_mask & CSO_BIT_RASTERIZER) cso_restore_rasterizer(cso); + if (state_mask & CSO_BIT_MIN_SAMPLES) + cso_restore_min_samples(cso); if (state_mask & 
CSO_BIT_RENDER_CONDITION) cso_restore_render_condition(cso); if (state_mask & CSO_BIT_SAMPLE_MASK) cso_restore_sample_mask(cso); - if (state_mask & CSO_BIT_STENCIL_REF) - cso_restore_stencil_ref(cso); - if (state_mask & CSO_BIT_STREAM_OUTPUTS) - cso_restore_stream_outputs(cso); - if (state_mask & CSO_BIT_TESSCTRL_SHADER) - cso_restore_tessctrl_shader(cso); - if (state_mask & CSO_BIT_TESSEVAL_SHADER) - cso_restore_tesseval_shader(cso); - if (state_mask & CSO_BIT_VERTEX_ELEMENTS) - cso_restore_vertex_elements(cso); - if (state_mask & CSO_BIT_VERTEX_SHADER) - cso_restore_vertex_shader(cso); if (state_mask & CSO_BIT_VIEWPORT) cso_restore_viewport(cso); + if (unbind & CSO_UNBIND_VS_CONSTANTS) + cso->pipe->set_constant_buffer(cso->pipe, PIPE_SHADER_VERTEX, 0, false, NULL); + if (unbind & CSO_UNBIND_FS_CONSTANTS) + cso->pipe->set_constant_buffer(cso->pipe, PIPE_SHADER_FRAGMENT, 0, false, NULL); + if (state_mask & CSO_BIT_VERTEX_ELEMENTS) + cso_restore_vertex_elements(cso); + if (unbind & CSO_UNBIND_VERTEX_BUFFER0) + cso->pipe->set_vertex_buffers(cso->pipe, 0, 0, 1, false, NULL); + if (state_mask & CSO_BIT_STREAM_OUTPUTS) + cso_restore_stream_outputs(cso); if (state_mask & CSO_BIT_PAUSE_QUERIES) cso->pipe->set_active_query_state(cso->pipe, true); diff --git a/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.h b/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.h index 32420fe384..6507bd026c 100644 --- a/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.h +++ b/mesa 3D driver/src/gallium/auxiliary/cso_cache/cso_context.h @@ -98,6 +98,15 @@ void cso_set_stream_outputs(struct cso_context *ctx, const unsigned *offsets); +enum cso_unbind_flags { + CSO_UNBIND_FS_SAMPLERVIEWS = (1 << 0), + CSO_UNBIND_FS_SAMPLERVIEW0 = (1 << 1), + CSO_UNBIND_FS_IMAGE0 = (1 << 2), + CSO_UNBIND_VS_CONSTANTS = (1 << 3), + CSO_UNBIND_FS_CONSTANTS = (1 << 4), + CSO_UNBIND_VERTEX_BUFFER0 = (1 << 5), +}; + /* * We don't provide shader caching in CSO. 
Most of the time the api provides * object semantics for shaders anyway, and the cases where it doesn't @@ -165,7 +174,7 @@ void cso_set_render_condition(struct cso_context *cso, #define CSO_BIT_COMPUTE_SAMPLERS (1<<1) void cso_save_state(struct cso_context *cso, unsigned state_mask); -void cso_restore_state(struct cso_context *cso); +void cso_restore_state(struct cso_context *cso, unsigned unbind); void cso_save_compute_state(struct cso_context *cso, unsigned state_mask); void cso_restore_compute_state(struct cso_context *cso); diff --git a/mesa 3D driver/src/gallium/auxiliary/draw/draw_context.h b/mesa 3D driver/src/gallium/auxiliary/draw/draw_context.h index 1069954c9e..ab10c60e1e 100644 --- a/mesa 3D driver/src/gallium/auxiliary/draw/draw_context.h +++ b/mesa 3D driver/src/gallium/auxiliary/draw/draw_context.h @@ -328,7 +328,8 @@ void draw_vbo(struct draw_context *draw, unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *draws, - unsigned num_draws); + unsigned num_draws, + uint8_t patch_vertices); /******************************************************************************* diff --git a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_clip.c b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_clip.c index 2a9c944dc1..d341feef30 100644 --- a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -469,12 +469,18 @@ do_clip_tri(struct draw_stage *stage, new_edge = &outEdges[outcount]; outlist[outcount++] = new_vert; + float denom = dp - dp_prev; if (dp < 0.0f) { /* Going out of bounds. Avoid division by zero as we * know dp != dp_prev from different_sign, above. */ - float t = dp / (dp - dp_prev); - interp( clipper, new_vert, t, vert, vert_prev, viewport_index ); + if (-dp < dp_prev) { + float t = dp / denom; + interp( clipper, new_vert, t, vert, vert_prev, viewport_index ); + } else { + float t = -dp_prev / denom; + interp( clipper, new_vert, t, vert_prev, vert, viewport_index ); + } /* Whether or not to set edge flag for the new vert depends * on whether it's a user-defined clipping plane. We're @@ -494,8 +500,13 @@ do_clip_tri(struct draw_stage *stage, else { /* Coming back in. 
*/ - float t = dp_prev / (dp_prev - dp); - interp( clipper, new_vert, t, vert_prev, vert, viewport_index ); + if (-dp_prev < dp) { + float t = -dp_prev / denom; + interp( clipper, new_vert, t, vert_prev, vert, viewport_index ); + } else { + float t = dp / denom; + interp( clipper, new_vert, t, vert, vert_prev, viewport_index ); + } /* Copy starting vert's edgeflag: */ diff --git a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_offset.c b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_offset.c index 769e7ca372..87db9cddac 100644 --- a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_offset.c +++ b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_offset.c @@ -97,7 +97,7 @@ static void do_offset_tri( struct draw_stage *stage, if (stage->draw->floating_point_depth) { float bias; union fi maxz; - maxz.f = MAX3(v0[2], v1[2], v2[2]); + maxz.f = MAX3(fabs(v0[2]), fabs(v1[2]), fabs(v2[2])); /* just do the math directly on shifted number */ maxz.ui &= 0xff << 23; maxz.i -= 23 << 23; diff --git a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_pstipple.c index eb7ad8bf24..d5f757eb13 100644 --- a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_pstipple.c +++ b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pipe_pstipple.c @@ -109,6 +109,7 @@ struct pstip_stage enum pipe_shader_type shader, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **); void (*driver_set_polygon_stipple)(struct pipe_context *, @@ -224,7 +225,8 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header) num_samplers, pstip->state.samplers); pstip->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, - num_sampler_views, 0, pstip->state.sampler_views); + num_sampler_views, 0, false, + pstip->state.sampler_views); draw->suspend_flushing = FALSE; @@ -253,7 +255,7 @@ pstip_flush(struct draw_stage *stage, unsigned flags) pstip->state.samplers); pstip->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, - pstip->num_sampler_views, 0, + pstip->num_sampler_views, 0, false, pstip->state.sampler_views); draw->suspend_flushing = FALSE; @@ -418,6 +420,7 @@ pstip_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct pstip_stage *pstip = pstip_stage_from_pipe(pipe); @@ -438,7 +441,7 @@ pstip_set_sampler_views(struct pipe_context *pipe, /* pass-through */ pstip->driver_set_sampler_views(pstip->pipe, shader, start, num, - unbind_num_trailing_slots, views); + unbind_num_trailing_slots, take_ownership, views); } diff --git a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pt.c b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pt.c index 80084bc772..f1821878ba 100644 --- a/mesa 3D driver/src/gallium/auxiliary/draw/draw_pt.c +++ b/mesa 3D driver/src/gallium/auxiliary/draw/draw_pt.c @@ -498,7 +498,8 @@ draw_vbo(struct draw_context *draw, unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *draws, - unsigned num_draws) + unsigned num_draws, + uint8_t patch_vertices) { unsigned index_limit; unsigned fpstate = util_fpstate_get(); @@ -532,7 +533,7 @@ draw_vbo(struct draw_context *draw, draw->pt.user.drawid = drawid_offset; draw->pt.user.increment_draw_id = use_info->increment_draw_id; draw->pt.user.viewid = 0; - draw->pt.vertices_per_patch = use_info->vertices_per_patch; + 
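/* Note: pipe_draw_info no longer carries vertices_per_patch; the count now comes from the new pipe_context::set_patch_vertices() state and reaches draw_vbo() as the patch_vertices argument (see the set_patch_vertices hooks added to the ddebug, noop and trace wrappers below). */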
draw->pt.vertices_per_patch = patch_vertices; if (0) { for (unsigned i = 0; i < num_draws; i++) diff --git a/mesa 3D driver/src/gallium/auxiliary/draw/draw_tess.c b/mesa 3D driver/src/gallium/auxiliary/draw/draw_tess.c index ec0cb23a4a..7aa64f076e 100644 --- a/mesa 3D driver/src/gallium/auxiliary/draw/draw_tess.c +++ b/mesa 3D driver/src/gallium/auxiliary/draw/draw_tess.c @@ -33,6 +33,7 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "util/ralloc.h" +#ifdef DRAW_LLVM_AVAILABLE static inline int draw_tes_get_input_index(int semantic, int index, const struct tgsi_shader_info *input_info) @@ -48,7 +49,6 @@ draw_tes_get_input_index(int semantic, int index, return -1; } -#ifdef DRAW_LLVM_AVAILABLE #define DEBUG_INPUTS 0 static void llvm_fetch_tcs_input(struct draw_tess_ctrl_shader *shader, diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_context.c b/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_context.c index d24b2c55cc..53b68a3529 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_context.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_context.c @@ -411,6 +411,15 @@ static void dd_context_set_tess_state(struct pipe_context *_pipe, pipe->set_tess_state(pipe, default_outer_level, default_inner_level); } +static void dd_context_set_patch_vertices(struct pipe_context *_pipe, + uint8_t patch_vertices) +{ + struct dd_context *dctx = dd_context(_pipe); + struct pipe_context *pipe = dctx->pipe; + + pipe->set_patch_vertices(pipe, patch_vertices); +} + static void dd_context_set_window_rectangles(struct pipe_context *_pipe, bool include, unsigned num_rectangles, @@ -511,6 +520,7 @@ dd_context_set_sampler_views(struct pipe_context *_pipe, enum pipe_shader_type shader, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct dd_context *dctx = dd_context(_pipe); @@ -520,7 +530,7 @@ dd_context_set_sampler_views(struct pipe_context *_pipe, sizeof(views[0]) * num); safe_memcpy(&dctx->draw_state.sampler_views[shader][start + num], views, sizeof(views[0]) * unbind_num_trailing_slots); pipe->set_sampler_views(pipe, shader, start, num, - unbind_num_trailing_slots, views); + unbind_num_trailing_slots, take_ownership, views); } @@ -903,6 +913,7 @@ dd_context_create(struct dd_screen *dscreen, struct pipe_context *pipe) CTX_INIT(set_viewport_states); CTX_INIT(set_sampler_views); CTX_INIT(set_tess_state); + CTX_INIT(set_patch_vertices); CTX_INIT(set_shader_buffers); CTX_INIT(set_shader_images); CTX_INIT(set_vertex_buffers); diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_screen.c b/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_screen.c index 7c5deb7dc1..b9a60b1a32 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_screen.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_ddebug/dd_screen.c @@ -414,12 +414,12 @@ dd_screen_memobj_destroy(struct pipe_screen *_screen, * screen */ -static void +static char * dd_screen_finalize_nir(struct pipe_screen *_screen, void *nir) { struct pipe_screen *screen = dd_screen(_screen)->screen; - screen->finalize_nir(screen, nir); + return screen->finalize_nir(screen, nir); } static void diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_noop/noop_pipe.c b/mesa 3D driver/src/gallium/auxiliary/driver_noop/noop_pipe.c index 2413a1ccbc..19421dec59 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_noop/noop_pipe.c +++ b/mesa 3D
driver/src/gallium/auxiliary/driver_noop/noop_pipe.c @@ -29,6 +29,7 @@ #include "util/u_memory.h" #include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_helpers.h" #include "util/u_upload_mgr.h" #include "util/u_threaded_context.h" #include "noop_public.h" @@ -119,7 +120,7 @@ static struct pipe_resource *noop_resource_create(struct pipe_screen *screen, FREE(nresource); return NULL; } - threaded_resource_init(&nresource->b.b); + threaded_resource_init(&nresource->b.b, false, 0); return &nresource->b.b; } @@ -456,9 +457,11 @@ static struct pipe_context *noop_create_context(struct pipe_screen *screen, threaded_context_create(ctx, &((struct noop_pipe_screen*)screen)->pool_transfers, noop_replace_buffer_storage, - noop_create_fence, - noop_is_resource_busy, - false, NULL); + &(struct threaded_context_options) { + .create_fence = noop_create_fence, + .is_resource_busy = noop_is_resource_busy, + }, + NULL); if (tc && tc != ctx) threaded_context_init_bytes_mapped_limit((struct threaded_context *)tc, 4); @@ -599,11 +602,11 @@ static const void *noop_get_compiler_options(struct pipe_screen *pscreen, return screen->get_compiler_options(screen, ir, shader); } -static void noop_finalize_nir(struct pipe_screen *pscreen, void *nir) +static char *noop_finalize_nir(struct pipe_screen *pscreen, void *nir) { struct pipe_screen *screen = ((struct noop_pipe_screen*)pscreen)->oscreen; - screen->finalize_nir(screen, nir); + return screen->finalize_nir(screen, nir); } static bool noop_check_resource_capability(struct pipe_screen *screen, @@ -673,6 +676,32 @@ static void noop_query_dmabuf_modifiers(struct pipe_screen *screen, external_only, count); } +static struct pipe_vertex_state * +noop_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct pipe_vertex_state *state = CALLOC_STRUCT(pipe_vertex_state); + + if (!state) + return NULL; + + util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, + full_velem_mask, state); + return state; +} + +static void noop_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + pipe_vertex_buffer_unreference(&state->input.vbuffer); + pipe_resource_reference(&state->input.indexbuf, NULL); + FREE(state); +} + struct pipe_screen *noop_screen_create(struct pipe_screen *oscreen) { struct noop_pipe_screen *noop_screen; @@ -722,6 +751,8 @@ struct pipe_screen *noop_screen_create(struct pipe_screen *oscreen) screen->get_device_uuid = noop_get_device_uuid; screen->query_dmabuf_modifiers = noop_query_dmabuf_modifiers; screen->resource_create_with_modifiers = noop_resource_create_with_modifiers; + screen->create_vertex_state = noop_create_vertex_state; + screen->vertex_state_destroy = noop_vertex_state_destroy; slab_create_parent(&noop_screen->pool_transfers, sizeof(struct pipe_transfer), 64); diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_noop/noop_state.c b/mesa 3D driver/src/gallium/auxiliary/driver_noop/noop_state.c index 6d4330b268..56036e22ed 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_noop/noop_state.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_noop/noop_state.c @@ -38,6 +38,15 @@ static void noop_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info { } +static void noop_draw_vertex_state(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct 
pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ +} + static void noop_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info) { @@ -116,6 +125,7 @@ static void noop_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { } @@ -396,6 +406,11 @@ static void noop_make_image_handle_resident(struct pipe_context *ctx, uint64_t h { } +static void noop_set_patch_vertices(struct pipe_context *ctx, + uint8_t patch_vertices) +{ +} + void noop_init_state_functions(struct pipe_context *ctx); void noop_init_state_functions(struct pipe_context *ctx) @@ -453,6 +468,7 @@ void noop_init_state_functions(struct pipe_context *ctx) ctx->sampler_view_destroy = noop_sampler_view_destroy; ctx->surface_destroy = noop_surface_destroy; ctx->draw_vbo = noop_draw_vbo; + ctx->draw_vertex_state = noop_draw_vertex_state; ctx->launch_grid = noop_launch_grid; ctx->create_stream_output_target = noop_create_stream_output_target; ctx->stream_output_target_destroy = noop_stream_output_target_destroy; @@ -476,4 +492,5 @@ void noop_init_state_functions(struct pipe_context *ctx) ctx->create_image_handle = noop_create_image_handle; ctx->delete_image_handle = noop_delete_image_handle; ctx->make_image_handle_resident = noop_make_image_handle_resident; + ctx->set_patch_vertices = noop_set_patch_vertices; } diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_context.c b/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_context.c index 40df61be75..b0c283d9b9 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_context.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_context.c @@ -740,6 +740,7 @@ rbug_set_sampler_views(struct pipe_context *_pipe, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **_views) { struct rbug_context *rb_pipe = rbug_context(_pipe); @@ -769,7 +770,7 @@ rbug_set_sampler_views(struct pipe_context *_pipe, } pipe->set_sampler_views(pipe, shader, start, num, - unbind_num_trailing_slots, views); + unbind_num_trailing_slots, take_ownership, views); mtx_unlock(&rb_pipe->call_mutex); } diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_screen.c b/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_screen.c index 6adb5cf8a9..9eb9ba379c 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_screen.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_rbug/rbug_screen.c @@ -410,12 +410,12 @@ rbug_screen_fence_get_fd(struct pipe_screen *_screen, return screen->fence_get_fd(screen, fence); } -static void +static char * rbug_screen_finalize_nir(struct pipe_screen *_screen, void *nir) { struct pipe_screen *screen = rbug_screen(_screen)->screen; - screen->finalize_nir(screen, nir); + return screen->finalize_nir(screen, nir); } bool diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.c b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.c index f0666d9773..b451a34d1e 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.c @@ -141,6 +141,39 @@ trace_context_draw_vbo(struct pipe_context *_pipe, } +static void +trace_context_draw_vertex_state(struct pipe_context *_pipe, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct 
pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + if (!tr_ctx->seen_fb_state && trace_dump_is_triggered()) + dump_fb_state(tr_ctx, "current_framebuffer_state", true); + + trace_dump_call_begin("pipe_context", "draw_vertex_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + trace_dump_arg(uint, partial_velem_mask); + trace_dump_arg(draw_vertex_state_info, info); + trace_dump_arg_begin("draws"); + trace_dump_struct_array(draw_start_count, draws, num_draws); + trace_dump_arg_end(); + trace_dump_arg(uint, num_draws); + + trace_dump_trace_flush(); + + pipe->draw_vertex_state(pipe, state, partial_velem_mask, info, draws, + num_draws); + trace_dump_call_end(); +} + + static struct pipe_query * trace_context_create_query(struct pipe_context *_pipe, unsigned query_type, @@ -1022,6 +1055,8 @@ trace_context_create_sampler_view(struct pipe_context *_pipe, pipe_resource_reference(&tr_view->base.texture, resource); tr_view->base.context = _pipe; tr_view->sampler_view = result; + result->reference.count += 100000000; + tr_view->refcount = 100000000; result = &tr_view->base; return result; @@ -1042,6 +1077,7 @@ trace_context_sampler_view_destroy(struct pipe_context *_pipe, trace_dump_arg(ptr, pipe); trace_dump_arg(ptr, view); + p_atomic_add(&tr_view->sampler_view->reference.count, -tr_view->refcount); pipe_sampler_view_reference(&tr_view->sampler_view, NULL); trace_dump_call_end(); @@ -1112,6 +1148,7 @@ trace_context_set_sampler_views(struct pipe_context *_pipe, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct trace_context *tr_ctx = trace_context(_pipe); @@ -1125,6 +1162,13 @@ trace_context_set_sampler_views(struct pipe_context *_pipe, for (i = 0; i < num; ++i) { tr_view = trace_sampler_view(views[i]); + if (tr_view) { + tr_view->refcount--; + if (!tr_view->refcount) { + tr_view->refcount = 100000000; + p_atomic_add(&tr_view->sampler_view->reference.count, tr_view->refcount); + } + } unwrapped_views[i] = tr_view ? tr_view->sampler_view : NULL; } views = unwrapped_views; @@ -1136,10 +1180,11 @@ trace_context_set_sampler_views(struct pipe_context *_pipe, trace_dump_arg(uint, start); trace_dump_arg(uint, num); trace_dump_arg(uint, unbind_num_trailing_slots); + trace_dump_arg(bool, take_ownership); trace_dump_arg_array(ptr, views, num); pipe->set_sampler_views(pipe, shader, start, num, - unbind_num_trailing_slots, views); + unbind_num_trailing_slots, take_ownership, views); trace_dump_call_end(); } @@ -1949,6 +1994,20 @@ trace_context_set_tess_state(struct pipe_context *_context, context->set_tess_state(context, default_outer_level, default_inner_level); } +static void +trace_context_set_patch_vertices(struct pipe_context *_context, + uint8_t patch_vertices) +{ + struct trace_context *tr_context = trace_context(_context); + struct pipe_context *context = tr_context->pipe; + + trace_dump_call_begin("pipe_context", "set_patch_vertices"); + trace_dump_arg(ptr, context); + trace_dump_arg(uint, patch_vertices); + trace_dump_call_end(); + + context->set_patch_vertices(context, patch_vertices); +} static void trace_context_set_shader_buffers(struct pipe_context *_context, enum pipe_shader_type shader, @@ -2151,6 +2210,7 @@ trace_context_create(struct trace_screen *tr_scr, tr_ctx->base . _member = pipe -> _member ? 
trace_context_ ## _member : NULL TR_CTX_INIT(draw_vbo); + TR_CTX_INIT(draw_vertex_state); TR_CTX_INIT(render_condition); TR_CTX_INIT(create_query); TR_CTX_INIT(destroy_query); @@ -2227,6 +2287,7 @@ trace_context_create(struct trace_screen *tr_scr, TR_CTX_INIT(memory_barrier); TR_CTX_INIT(resource_commit); TR_CTX_INIT(set_tess_state); + TR_CTX_INIT(set_patch_vertices); TR_CTX_INIT(set_shader_buffers); TR_CTX_INIT(launch_grid); TR_CTX_INIT(set_shader_images); diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.h b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.h index 953ccd328c..f687fa2939 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.h +++ b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_context.h @@ -86,8 +86,7 @@ trace_context_create(struct trace_screen *tr_scr, struct pipe_context * trace_context_create_threaded(struct pipe_screen *screen, struct pipe_context *pipe, tc_replace_buffer_storage_func *replace_buffer, - tc_create_fence_func *create_fence, - tc_is_resource_busy *is_resource_busy); + struct threaded_context_options *options); #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.c b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.c index 65d9a0acc1..1f5da80193 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.c @@ -700,6 +700,10 @@ void trace_dump_vertex_element(const struct pipe_vertex_element *state) trace_dump_member(uint, state, vertex_buffer_index); + trace_dump_member(uint, state, instance_divisor); + + trace_dump_member(bool, state, dual_slot); + trace_dump_member(format, state, src_format); trace_dump_struct_end(); @@ -820,8 +824,6 @@ void trace_dump_draw_info(const struct pipe_draw_info *state) trace_dump_member(uint, state, start_instance); trace_dump_member(uint, state, instance_count); - trace_dump_member(uint, state, vertices_per_patch); - trace_dump_member(uint, state, min_index); trace_dump_member(uint, state, max_index); @@ -832,6 +834,17 @@ void trace_dump_draw_info(const struct pipe_draw_info *state) trace_dump_struct_end(); } +void trace_dump_draw_vertex_state_info(struct pipe_draw_vertex_state_info state) +{ + if (!trace_dumping_enabled_locked()) + return; + + trace_dump_struct_begin("pipe_draw_vertex_state_info"); + trace_dump_member(uint, &state, mode); + trace_dump_member(uint, &state, take_vertex_state_ownership); + trace_dump_struct_end(); +} + void trace_dump_draw_start_count(const struct pipe_draw_start_count_bias *state) { if (!trace_dumping_enabled_locked()) diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.h b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.h index f3d89ebd7a..f5633b3be1 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.h +++ b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_dump_state.h @@ -86,6 +86,8 @@ void trace_dump_shader_buffer(const struct pipe_shader_buffer *buffer); void trace_dump_draw_info(const struct pipe_draw_info *state); +void trace_dump_draw_vertex_state_info(struct pipe_draw_vertex_state_info state); + void trace_dump_draw_start_count(const struct pipe_draw_start_count_bias *state); void trace_dump_draw_indirect_info(const struct pipe_draw_indirect_info *state); diff --git a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_screen.c b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_screen.c index 
9856dbca82..81eaff92f3 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_screen.c +++ b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_screen.c @@ -333,8 +333,7 @@ trace_context_is_resource_busy(struct pipe_screen *_screen, struct pipe_context * trace_context_create_threaded(struct pipe_screen *screen, struct pipe_context *pipe, tc_replace_buffer_storage_func *replace_buffer, - tc_create_fence_func *create_fence, - tc_is_resource_busy *is_resource_busy) + struct threaded_context_options *options) { if (!trace_screens) return pipe; @@ -353,14 +352,14 @@ trace_context_create_threaded(struct pipe_screen *screen, struct pipe_context *p struct trace_context *tr_ctx = trace_context(ctx); tr_ctx->replace_buffer_storage = *replace_buffer; - tr_ctx->create_fence = *create_fence; - tr_scr->is_resource_busy = *is_resource_busy; + tr_ctx->create_fence = options->create_fence; + tr_scr->is_resource_busy = options->is_resource_busy; tr_ctx->threaded = true; *replace_buffer = trace_context_replace_buffer_storage; - if (*create_fence) - *create_fence = trace_context_create_fence; - if (*is_resource_busy) - *is_resource_busy = trace_context_is_resource_busy; + if (options->create_fence) + options->create_fence = trace_context_create_fence; + if (options->is_resource_busy) + options->is_resource_busy = trace_context_is_resource_busy; return ctx; } @@ -919,12 +918,12 @@ trace_screen_get_timestamp(struct pipe_screen *_screen) return result; } -static void +static char * trace_screen_finalize_nir(struct pipe_screen *_screen, void *nir) { struct pipe_screen *screen = trace_screen(_screen)->screen; - screen->finalize_nir(screen, nir); + return screen->finalize_nir(screen, nir); } static void @@ -1040,6 +1039,49 @@ trace_screen_get_dmabuf_modifier_planes(struct pipe_screen *_screen, uint64_t mo return ret; } +static struct pipe_vertex_state * +trace_screen_create_vertex_state(struct pipe_screen *_screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + + trace_dump_call_begin("pipe_screen", "create_vertex_state"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(ptr, buffer->buffer.resource); + trace_dump_arg(vertex_buffer, buffer); + trace_dump_struct_array(vertex_element, elements, num_elements); + trace_dump_arg(uint, num_elements); + trace_dump_arg(ptr, indexbuf); + trace_dump_arg(uint, full_velem_mask); + + struct pipe_vertex_state *vstate = + screen->create_vertex_state(screen, buffer, elements, num_elements, + indexbuf, full_velem_mask); + trace_dump_ret(ptr, vstate); + trace_dump_call_end(); + return vstate; +} + +static void trace_screen_vertex_state_destroy(struct pipe_screen *_screen, + struct pipe_vertex_state *state) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + + trace_dump_call_begin("pipe_screen", "vertex_state_destroy"); + trace_dump_arg(ptr, screen); + trace_dump_arg(ptr, state); + trace_dump_call_end(); + + screen->vertex_state_destroy(screen, state); +} + bool trace_enabled(void) { @@ -1134,6 +1176,8 @@ trace_screen_create(struct pipe_screen *screen) SCR_INIT(get_driver_uuid); SCR_INIT(get_device_uuid); SCR_INIT(finalize_nir); + SCR_INIT(create_vertex_state); + SCR_INIT(vertex_state_destroy); tr_scr->base.transfer_helper = screen->transfer_helper; tr_scr->screen = screen; diff --git 
a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_texture.h b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_texture.h index 567842a5b2..b9caf968d1 100644 --- a/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_texture.h +++ b/mesa 3D driver/src/gallium/auxiliary/driver_trace/tr_texture.h @@ -57,6 +57,7 @@ struct trace_surface struct trace_sampler_view { struct pipe_sampler_view base; + unsigned refcount; struct pipe_sampler_view *sampler_view; }; diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 14fda8db0f..1c71c05082 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1357,8 +1357,21 @@ lp_build_lerp_simple(struct lp_build_context *bld, } /* (x * delta) >> n */ - res = lp_build_mul(bld, x, delta); - res = lp_build_shr_imm(bld, res, half_width); + /* + * For this multiply, higher internal precision is required to pass CTS, + * the most efficient path to that is pmulhrsw on ssse3 and above. + * This could be opencoded on other arches if conformance was required. + */ + if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) { + res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7)); + res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff)); + } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) { + res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7)); + res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff)); + } else { + res = lp_build_mul(bld, x, delta); + res = lp_build_shr_imm(bld, res, half_width); + } } else { /* * The rescaling trick above doesn't work for signed numbers, so @@ -2030,6 +2043,12 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); + if (type.width == 16) { + char intrinsic[64]; + lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type); + return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); + } + if (arch_rounding_available(type)) { return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE); } @@ -2083,6 +2102,12 @@ lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); + if (type.width == 16) { + char intrinsic[64]; + lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type); + return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); + } + if (arch_rounding_available(type)) { return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); } @@ -3012,6 +3037,17 @@ LLVMValueRef lp_build_sin(struct lp_build_context *bld, LLVMValueRef a) { + const struct lp_type type = bld->type; + + if (type.width == 16) { + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); + char intrinsic[32]; + lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type); + LLVMValueRef args[] = { a }; + return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); + } + return lp_build_sin_or_cos(bld, a, FALSE); } @@ -3023,6 +3059,17 @@ LLVMValueRef lp_build_cos(struct lp_build_context *bld, LLVMValueRef a) { + const struct lp_type type = bld->type; + + if (type.width == 16) { + LLVMBuilderRef builder = bld->gallivm->builder; + 
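/* As in lp_build_sin() above, the fp16 case defers to LLVM's native intrinsic; lp_format_intrinsic() appends the vector-type suffix, so an 8-wide half vector would resolve to a name like "llvm.cos.v8f16" (the LLVMHalfTypeKind case added to lp_format_intrinsic() below maps half to 'f'/16). */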
LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); + char intrinsic[32]; + lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type); + LLVMValueRef args[] = { a }; + return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); + } + return lp_build_sin_or_cos(bld, a, TRUE); } @@ -3205,6 +3252,13 @@ lp_build_exp2(struct lp_build_context *bld, LLVMValueRef expfpart = NULL; LLVMValueRef res = NULL; + if (type.floating && type.width == 16) { + char intrinsic[32]; + lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type); + LLVMValueRef args[] = { x }; + return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); + } + assert(lp_check_value(bld->type, x)); /* TODO: optimize the constant case */ @@ -3386,6 +3440,15 @@ lp_build_log2_approx(struct lp_build_context *bld, LLVMValueRef p_z = NULL; LLVMValueRef res = NULL; + if (bld->type.width == 16) { + char intrinsic[32]; + lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type); + LLVMValueRef args[] = { x }; + if (p_log2) + *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0); + return; + } + assert(lp_check_value(bld->type, x)); if(p_exp || p_floor_log2 || p_log2) { diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_const.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_const.c index 18ece7324d..4f4bddf44b 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_const.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_const.c @@ -42,7 +42,7 @@ #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_init.h" - +#include "lp_bld_limits.h" unsigned lp_mantissa(struct lp_type type) @@ -256,7 +256,7 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type) elem_type = lp_build_elem_type(gallivm, type); - if(type.floating && type.width == 16) + if(!lp_has_fp16() && type.floating && type.width == 16) elems[0] = LLVMConstInt(elem_type, _mesa_float_to_half(1.0f), 0); else if(type.floating) elems[0] = LLVMConstReal(elem_type, 1.0); @@ -303,7 +303,7 @@ lp_build_const_elem(struct gallivm_state *gallivm, LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type); LLVMValueRef elem; - if(type.floating && type.width == 16) { + if (!lp_has_fp16() && type.floating && type.width == 16) { elem = LLVMConstInt(elem_type, _mesa_float_to_half((float)val), 0); } else if(type.floating) { elem = LLVMConstReal(elem_type, val); diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_conv.c index e211fe66df..1073ff4fa6 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -120,6 +120,8 @@ lp_build_half_to_float(struct gallivm_state *gallivm, else { intrinsic = "llvm.x86.vcvtph2ps.256"; } + src = LLVMBuildBitCast(builder, src, + LLVMVectorType(LLVMInt16TypeInContext(gallivm->context), 8), ""); return lp_build_intrinsic_unary(builder, intrinsic, lp_build_vec_type(gallivm, f32_type), src); } else { @@ -193,6 +195,7 @@ lp_build_float_to_half(struct gallivm_state *gallivm, if (length == 4) { result = lp_build_extract_range(gallivm, result, 0, 4); } + result = LLVMBuildBitCast(builder, result, lp_build_vec_type(gallivm, lp_type_float_vec(16, 16 * length)), ""); } else { diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_coro.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_coro.c index 28f722e93c..d3d5e6dc96 100644 --- a/mesa 3D 
driver/src/gallium/auxiliary/gallivm/lp_bld_coro.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_coro.c @@ -176,9 +176,8 @@ void lp_build_coro_declare_malloc_hooks(struct gallivm_state *gallivm) LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id) { - LLVMValueRef do_alloc = lp_build_coro_alloc(gallivm, coro_id); LLVMTypeRef mem_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); - LLVMValueRef alloc_mem_store = lp_build_alloca(gallivm, mem_ptr_type, "coro mem"); + LLVMValueRef do_alloc = lp_build_coro_alloc(gallivm, coro_id); struct lp_build_if_state if_state_coro; lp_build_if(&if_state_coro, gallivm, do_alloc); LLVMValueRef coro_size = lp_build_coro_size(gallivm); @@ -186,14 +185,40 @@ LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMVa assert(gallivm->coro_malloc_hook); alloc_mem = LLVMBuildCall(gallivm->builder, gallivm->coro_malloc_hook, &coro_size, 1, ""); - - LLVMBuildStore(gallivm->builder, alloc_mem, alloc_mem_store); lp_build_endif(&if_state_coro); - alloc_mem = LLVMBuildLoad(gallivm->builder, alloc_mem_store, ""); - LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloc_mem); + + LLVMValueRef phi = LLVMBuildPhi(gallivm->builder, mem_ptr_type, ""); + LLVMValueRef zero_bool = LLVMConstNull(mem_ptr_type); + LLVMAddIncoming(phi, &alloc_mem, &if_state_coro.true_block, 1); + LLVMAddIncoming(phi, &zero_bool, &if_state_coro.entry_block, 1); + + LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, phi); return coro_hdl; } +LLVMValueRef lp_build_coro_alloc_mem_array(struct gallivm_state *gallivm, + LLVMValueRef coro_hdl_ptr, LLVMValueRef coro_idx, + LLVMValueRef coro_num_hdls) +{ + LLVMTypeRef mem_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + LLVMValueRef alloced_ptr = LLVMBuildLoad(gallivm->builder, coro_hdl_ptr, ""); + + LLVMValueRef not_alloced = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, alloced_ptr, LLVMConstNull(mem_ptr_type), ""); + LLVMValueRef coro_size = lp_build_coro_size(gallivm); + + struct lp_build_if_state if_state_coro; + lp_build_if(&if_state_coro, gallivm, not_alloced); + + LLVMValueRef alloc_mem; + LLVMValueRef alloc_size = LLVMBuildMul(gallivm->builder, coro_num_hdls, coro_size, ""); + assert(gallivm->coro_malloc_hook); + alloc_mem = LLVMBuildCall(gallivm->builder, gallivm->coro_malloc_hook, &alloc_size, 1, ""); + LLVMBuildStore(gallivm->builder, alloc_mem, coro_hdl_ptr); + lp_build_endif(&if_state_coro); + + return LLVMBuildMul(gallivm->builder, coro_size, coro_idx, ""); +} + void lp_build_coro_free_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl) { LLVMValueRef alloc_mem = lp_build_coro_free(gallivm, coro_id, coro_hdl); diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_coro.h b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_coro.h index 2ffc130c9a..1853217ed7 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_coro.h +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_coro.h @@ -55,6 +55,10 @@ LLVMValueRef lp_build_coro_suspend(struct gallivm_state *gallivm, bool last); LLVMValueRef lp_build_coro_alloc(struct gallivm_state *gallivm, LLVMValueRef id); LLVMValueRef lp_build_coro_begin_alloc_mem(struct gallivm_state *gallivm, LLVMValueRef coro_id); + +LLVMValueRef lp_build_coro_alloc_mem_array(struct gallivm_state *gallivm, + LLVMValueRef coro_hdl_ptr, LLVMValueRef coro_idx, + LLVMValueRef coro_num_hdls); void lp_build_coro_free_mem(struct 
gallivm_state *gallivm, LLVMValueRef coro_id, LLVMValueRef coro_hdl); struct lp_build_coro_suspend_info { diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 77e562206e..8e57a5e349 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -919,7 +919,15 @@ lp_build_insert_soa_chan(struct lp_build_context *bld, case UTIL_FORMAT_TYPE_SIGNED: if (chan_desc.pure_integer) { chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); - chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), ""); + /* clamp to SINT range for < 32-bit values */ + if (width < 32) { + struct lp_build_context int_bld; + lp_build_context_init(&int_bld, gallivm, lp_int_type(bld->type)); + chan = lp_build_clamp(&int_bld, chan, + lp_build_const_int_vec(gallivm, type, -(1ULL << (width - 1))), + lp_build_const_int_vec(gallivm, type, (1ULL << (width - 1)) - 1)); + chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), ""); + } } else if (type.floating) { if (chan_desc.normalized) { char intrin[32]; @@ -945,6 +953,8 @@ lp_build_insert_soa_chan(struct lp_build_context *bld, if (type.floating) { if (chan_desc.size == 16) { chan = lp_build_float_to_half(gallivm, rgba); + chan = LLVMBuildBitCast(builder, chan, + lp_build_vec_type(gallivm, lp_type_int_vec(16, 16 * type.length)), ""); chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, ""); if (start) chan = LLVMBuildShl(builder, chan, @@ -977,6 +987,7 @@ lp_build_pack_rgba_soa(struct gallivm_state *gallivm, { unsigned chan; struct lp_build_context bld; + LLVMValueRef rgba_swiz[4]; assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); assert(format_desc->block.width == 1); assert(format_desc->block.height == 1); @@ -985,13 +996,16 @@ lp_build_pack_rgba_soa(struct gallivm_state *gallivm, assert(type.width == 32); lp_build_context_init(&bld, gallivm, type); + + lp_build_format_swizzle_soa(format_desc, &bld, rgba_in, rgba_swiz); + for (chan = 0; chan < format_desc->nr_channels; ++chan) { struct util_format_channel_description chan_desc = format_desc->channel[chan]; lp_build_insert_soa_chan(&bld, format_desc->block.bits, chan_desc, packed, - rgba_in[chan]); + rgba_swiz[chan]); } } diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_intr.c index 5e9cc70ef3..2ce723c7e5 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_intr.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_intr.c @@ -86,6 +86,10 @@ lp_format_intrinsic(char *name, c = 'f'; width = 64; break; + case LLVMHalfTypeKind: + c = 'f'; + width = 16; + break; default: unreachable("unexpected LLVMTypeKind"); } diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_limits.h index ab61f58021..573237e23d 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_limits.h +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_limits.h @@ -34,7 +34,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" - +#include "util/u_cpu_detect.h" /* * TGSI translation limits. @@ -73,11 +73,12 @@ /** * Maximum control flow nesting * + * Vulkan CTS tests seem to have up to 76 levels. Add a few for safety. 
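 * (hence the bump from 66 to 80: the 76 levels observed in CTS plus a little headroom)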
* SM4.0 requires 64 (per subroutine actually, subroutine nesting itself is 32) * SM3.0 requires 24 (most likely per subroutine too) * add 2 more (some translation could add one more) */ -#define LP_MAX_TGSI_NESTING 66 +#define LP_MAX_TGSI_NESTING 80 /** * Maximum iterations before loop termination @@ -85,6 +86,11 @@ */ #define LP_MAX_TGSI_LOOP_ITERATIONS 65535 +static inline bool +lp_has_fp16(void) +{ + return util_get_cpu_caps()->has_f16c; +} /** * Some of these limits are actually infinite (i.e., only limited by available @@ -124,10 +130,11 @@ gallivm_get_shader_param(enum pipe_shader_cap param) return 1; case PIPE_SHADER_CAP_INTEGERS: return 1; - case PIPE_SHADER_CAP_INT64_ATOMICS: case PIPE_SHADER_CAP_FP16: case PIPE_SHADER_CAP_FP16_DERIVATIVES: case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: + return lp_has_fp16(); + case PIPE_SHADER_CAP_INT64_ATOMICS: return 0; case PIPE_SHADER_CAP_INT16: case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS: diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.c index e53e0fd61f..ca56330f9e 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.c @@ -27,10 +27,12 @@ #include "lp_bld_arit.h" #include "lp_bld_bitarit.h" #include "lp_bld_const.h" +#include "lp_bld_conv.h" #include "lp_bld_gather.h" #include "lp_bld_logic.h" #include "lp_bld_quad.h" #include "lp_bld_flow.h" +#include "lp_bld_intr.h" #include "lp_bld_struct.h" #include "lp_bld_debug.h" #include "lp_bld_printf.h" @@ -48,7 +50,7 @@ static LLVMValueRef cast_type(struct lp_build_nir_context *bld_base, LLVMValueRe case nir_type_float: switch (bit_size) { case 16: - return LLVMBuildBitCast(builder, val, LLVMVectorType(LLVMHalfTypeInContext(bld_base->base.gallivm->context), bld_base->base.type.length), ""); + return LLVMBuildBitCast(builder, val, bld_base->half_bld.vec_type, ""); case 32: return LLVMBuildBitCast(builder, val, bld_base->base.vec_type, ""); case 64: @@ -223,6 +225,8 @@ static LLVMValueRef flt_to_bool32(struct lp_build_nir_context *bld_base, LLVMValueRef result = lp_build_cmp(flt_bld, PIPE_FUNC_NOTEQUAL, val, flt_bld->zero); if (src_bit_size == 64) result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, ""); + if (src_bit_size == 16) + result = LLVMBuildSExt(builder, result, bld_base->int_bld.vec_type, ""); return result; } @@ -241,6 +245,8 @@ static LLVMValueRef fcmp32(struct lp_build_nir_context *bld_base, result = lp_build_cmp(flt_bld, compare, src[0], src[1]); if (src_bit_size == 64) result = LLVMBuildTrunc(builder, result, bld_base->int_bld.vec_type, ""); + else if (src_bit_size == 16) + result = LLVMBuildSExt(builder, result, bld_base->int_bld.vec_type, ""); return result; } @@ -307,6 +313,9 @@ static LLVMValueRef emit_b2f(struct lp_build_nir_context *bld_base, ""); result = LLVMBuildBitCast(builder, result, bld_base->base.vec_type, ""); switch (bitsize) { + case 16: + result = LLVMBuildFPTrunc(builder, result, bld_base->half_bld.vec_type, ""); + break; case 32: break; case 64: @@ -545,7 +554,7 @@ do_quantize_to_f16(struct lp_build_nir_context *bld_base, LLVMBuilderRef builder = gallivm->builder; LLVMValueRef result, cond, cond2, temp; - result = LLVMBuildFPTrunc(builder, src, LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), bld_base->base.type.length), ""); + result = LLVMBuildFPTrunc(builder, src, bld_base->half_bld.vec_type, ""); result = LLVMBuildFPExt(builder, result, bld_base->base.vec_type, ""); temp = 
lp_build_abs(get_flt_bld(bld_base, 32), result); @@ -568,6 +577,9 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, LLVMValueRef result; switch (instr->op) { + case nir_op_b2f16: + result = emit_b2f(bld_base, src[0], 16); + break; case nir_op_b2f32: result = emit_b2f(bld_base, src[0], 32); break; @@ -610,7 +622,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, src[0] = LLVMBuildFPTrunc(builder, src[0], bld_base->base.vec_type, ""); result = LLVMBuildFPTrunc(builder, src[0], - LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), bld_base->base.type.length), ""); + bld_base->half_bld.vec_type, ""); break; case nir_op_f2f32: if (src_bit_size[0] < 32) @@ -673,17 +685,17 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, result = lp_build_ceil(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fcos: - result = lp_build_cos(&bld_base->base, src[0]); + result = lp_build_cos(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fddx: case nir_op_fddx_coarse: case nir_op_fddx_fine: - result = lp_build_ddx(&bld_base->base, src[0]); + result = lp_build_ddx(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fddy: case nir_op_fddy_coarse: case nir_op_fddy_fine: - result = lp_build_ddy(&bld_base->base, src[0]); + result = lp_build_ddy(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fdiv: result = lp_build_div(get_flt_bld(bld_base, src_bit_size[0]), @@ -693,7 +705,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, result = fcmp32(bld_base, PIPE_FUNC_EQUAL, src_bit_size[0], src); break; case nir_op_fexp2: - result = lp_build_exp2(&bld_base->base, src[0]); + result = lp_build_exp2(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_ffloor: result = lp_build_floor(get_flt_bld(bld_base, src_bit_size[0]), src[0]); @@ -723,7 +735,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, result = lp_build_isfinite(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_flog2: - result = lp_build_log2_safe(&bld_base->base, src[0]); + result = lp_build_log2_safe(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_flt: case nir_op_flt32: @@ -778,7 +790,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, result = lp_build_negate(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fpow: - result = lp_build_pow(&bld_base->base, src[0], src[1]); + result = lp_build_pow(get_flt_bld(bld_base, src_bit_size[0]), src[0], src[1]); break; case nir_op_fquantize2f16: result = do_quantize_to_f16(bld_base, src[0]); @@ -787,7 +799,13 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, result = lp_build_rcp(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fround_even: - result = lp_build_round(get_flt_bld(bld_base, src_bit_size[0]), src[0]); + if (src_bit_size[0] == 16) { + struct lp_build_context *bld = get_flt_bld(bld_base, 16); + char intrinsic[64]; + lp_format_intrinsic(intrinsic, 64, "llvm.roundeven", bld->vec_type); + result = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, src[0]); + } else + result = lp_build_round(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_frsq: result = lp_build_rsqrt(get_flt_bld(bld_base, src_bit_size[0]), src[0]); @@ -799,7 +817,7 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, result = 
lp_build_sgn(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fsin: - result = lp_build_sin(&bld_base->base, src[0]); + result = lp_build_sin(get_flt_bld(bld_base, src_bit_size[0]), src[0]); break; case nir_op_fsqrt: result = lp_build_sqrt(get_flt_bld(bld_base, src_bit_size[0]), src[0]); @@ -810,6 +828,10 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, case nir_op_i2b32: result = int_to_bool32(bld_base, src_bit_size[0], false, src[0]); break; + case nir_op_i2f16: + result = LLVMBuildSIToFP(builder, src[0], + bld_base->half_bld.vec_type, ""); + break; case nir_op_i2f32: result = lp_build_int_to_float(&bld_base->base, src[0]); break; @@ -950,6 +972,10 @@ static LLVMValueRef do_alu_action(struct lp_build_nir_context *bld_base, result = LLVMBuildBitCast(builder, tmp, bld_base->uint64_bld.vec_type, ""); break; } + case nir_op_u2f16: + result = LLVMBuildUIToFP(builder, src[0], + bld_base->half_bld.vec_type, ""); + break; case nir_op_u2f32: result = LLVMBuildUIToFP(builder, src[0], bld_base->base.vec_type, ""); break; @@ -1317,7 +1343,7 @@ static void visit_load_ssbo(struct lp_build_nir_context *bld_base, nir_intrinsic_instr *instr, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) { - LLVMValueRef idx = get_src(bld_base, instr->src[0]); + LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32); LLVMValueRef offset = get_src(bld_base, instr->src[1]); bld_base->load_mem(bld_base, nir_dest_num_components(instr->dest), nir_dest_bit_size(instr->dest), idx, offset, result); @@ -1327,7 +1353,7 @@ static void visit_store_ssbo(struct lp_build_nir_context *bld_base, nir_intrinsic_instr *instr) { LLVMValueRef val = get_src(bld_base, instr->src[0]); - LLVMValueRef idx = get_src(bld_base, instr->src[1]); + LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[1]), nir_type_uint, 32); LLVMValueRef offset = get_src(bld_base, instr->src[2]); int writemask = instr->const_index[0]; int nc = nir_src_num_components(instr->src[0]); @@ -1339,7 +1365,7 @@ static void visit_get_ssbo_size(struct lp_build_nir_context *bld_base, nir_intrinsic_instr *instr, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) { - LLVMValueRef idx = get_src(bld_base, instr->src[0]); + LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32); result[0] = bld_base->get_ssbo_size(bld_base, idx); } @@ -1347,7 +1373,7 @@ static void visit_ssbo_atomic(struct lp_build_nir_context *bld_base, nir_intrinsic_instr *instr, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]) { - LLVMValueRef idx = get_src(bld_base, instr->src[0]); + LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[0]), nir_type_uint, 32); LLVMValueRef offset = get_src(bld_base, instr->src[1]); LLVMValueRef val = get_src(bld_base, instr->src[2]); LLVMValueRef val2 = NULL; @@ -2154,8 +2180,12 @@ static void visit_tex(struct lp_build_nir_context *bld_base, nir_tex_instr *inst if (nir_dest_bit_size(instr->dest) != 32) { assert(nir_dest_bit_size(instr->dest) == 16); - LLVMTypeRef vec_type; + LLVMTypeRef vec_type = NULL; + bool is_float = false; switch (nir_alu_type_get_base_type(instr->dest_type)) { + case nir_type_float: + is_float = true; + break; case nir_type_int: vec_type = bld_base->int16_bld.vec_type; break; @@ -2166,12 +2196,17 @@ static void visit_tex(struct lp_build_nir_context *bld_base, nir_tex_instr *inst unreachable("unexpected alu type"); } for (int i = 0; i < nir_dest_num_components(instr->dest); ++i) { - texel[i] = LLVMBuildBitCast(builder, 
texel[i], bld_base->int_bld.vec_type, ""); - texel[i] = LLVMBuildTrunc(builder, texel[i], vec_type, ""); + if (is_float) { + texel[i] = lp_build_float_to_half(gallivm, texel[i]); + } else { + texel[i] = LLVMBuildBitCast(builder, texel[i], bld_base->int_bld.vec_type, ""); + texel[i] = LLVMBuildTrunc(builder, texel[i], vec_type, ""); + } } } assign_dest(bld_base, &instr->dest, texel); + } static void visit_ssa_undef(struct lp_build_nir_context *bld_base, diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.h b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.h index 4b42732527..874a5d55e3 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.h +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir.h @@ -49,6 +49,7 @@ struct lp_build_nir_context struct lp_build_context int8_bld; struct lp_build_context uint16_bld; struct lp_build_context int16_bld; + struct lp_build_context half_bld; struct lp_build_context dbl_bld; struct lp_build_context uint64_bld; struct lp_build_context int64_bld; @@ -289,6 +290,8 @@ static inline struct lp_build_context *get_flt_bld(struct lp_build_nir_context * switch (op_bit_size) { case 64: return &bld_base->dbl_bld; + case 16: + return &bld_base->half_bld; default: case 32: return &bld_base->base; diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c index eb4e0f4e0d..b771b7cc74 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c @@ -300,7 +300,8 @@ emit_mask_scatter(struct lp_build_nir_soa_context *bld, if (scalar_pred) { LLVMValueRef real_val, dst_val; dst_val = LLVMBuildLoad(builder, scalar_ptr, ""); - real_val = lp_build_select(&bld->uint_elem_bld, scalar_pred, val, dst_val); + scalar_pred = LLVMBuildTrunc(builder, scalar_pred, LLVMInt1TypeInContext(gallivm->context), ""); + real_val = LLVMBuildSelect(builder, scalar_pred, val, dst_val, ""); LLVMBuildStore(builder, real_val, scalar_ptr); } else { @@ -472,7 +473,7 @@ static void emit_load_var(struct lp_build_nir_context *bld_base, break; case nir_var_shader_out: if (bld->fs_iface && bld->fs_iface->fb_fetch) { - bld->fs_iface->fb_fetch(bld->fs_iface, &bld_base->base, var->data.driver_location, result); + bld->fs_iface->fb_fetch(bld->fs_iface, &bld_base->base, var->data.location, result); return; } for (unsigned i = 0; i < num_components; i++) { @@ -1038,7 +1039,6 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base, struct gallivm_state *gallivm = bld_base->base.gallivm; struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; - LLVMValueRef ssbo_ptr = NULL; struct lp_build_context *uint_bld = &bld_base->uint_bld; LLVMValueRef ssbo_limit = NULL; struct lp_build_context *load_bld; @@ -1046,51 +1046,61 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base, load_bld = get_int_bld(bld_base, true, bit_size); - if (index) { - LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); - ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), ""); - ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); - - ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); - } 
else - ssbo_ptr = bld->shared_ptr; - offset = LLVMBuildAShr(gallivm->builder, offset, lp_build_const_int_vec(gallivm, uint_bld->type, shift_val), ""); - for (unsigned c = 0; c < nc; c++) { - LLVMValueRef loop_index = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c)); - LLVMValueRef exec_mask = mask_vec(bld_base); + /* although the index is dynamically uniform, that doesn't count if the exec mask isn't set, so read the values one by one */ + + LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]; + for (unsigned c = 0; c < nc; c++) + result[c] = lp_build_alloca(gallivm, load_bld->vec_type, ""); + + LLVMValueRef exec_mask = mask_vec(bld_base); + LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + LLVMValueRef loop_cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + LLVMValueRef loop_offset = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, ""); + + struct lp_build_if_state exec_ifthen; + lp_build_if(&exec_ifthen, gallivm, loop_cond); + + LLVMValueRef mem_ptr; + + if (index) { + LLVMValueRef ssbo_idx = LLVMBuildExtractElement(gallivm->builder, index, loop_state.counter, ""); + LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, ssbo_idx); + LLVMValueRef ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, ssbo_idx); + ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), ""); + mem_ptr = ssbo_ptr; + } else + mem_ptr = bld->shared_ptr; + + for (unsigned c = 0; c < nc; c++) { + LLVMValueRef loop_index = LLVMBuildAdd(builder, loop_offset, lp_build_const_int32(gallivm, c), ""); + LLVMValueRef do_fetch = lp_build_const_int32(gallivm, -1); if (ssbo_limit) { - LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); - exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + LLVMValueRef ssbo_oob_cmp = lp_build_compare(gallivm, lp_elem_type(uint_bld->type), PIPE_FUNC_LESS, loop_index, ssbo_limit); + do_fetch = LLVMBuildAnd(builder, do_fetch, ssbo_oob_cmp, ""); } - LLVMValueRef result = lp_build_alloca(gallivm, load_bld->vec_type, ""); - struct lp_build_loop_state loop_state; - lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); - struct lp_build_if_state ifthen; - LLVMValueRef cond, temp_res; + struct lp_build_if_state ifthen; + LLVMValueRef fetch_cond, temp_res; - loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index, - loop_state.counter, ""); + fetch_cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, do_fetch, lp_build_const_int32(gallivm, 0), ""); - cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); - cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); - - lp_build_if(&ifthen, gallivm, cond); + lp_build_if(&ifthen, gallivm, fetch_cond); LLVMValueRef scalar; if (bit_size != 32) { - LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(load_bld->elem_type, 0), ""); - scalar = lp_build_pointer_get(builder, ssbo_ptr2, loop_index); + LLVMValueRef mem_ptr2 = LLVMBuildBitCast(builder, mem_ptr, LLVMPointerType(load_bld->elem_type, 0), ""); + scalar = lp_build_pointer_get(builder, mem_ptr2, loop_index); } else - scalar = lp_build_pointer_get(builder, ssbo_ptr, loop_index); + scalar = lp_build_pointer_get(builder, mem_ptr, loop_index); - temp_res = LLVMBuildLoad(builder, result, "");
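/* (Shape of the rewritten load: lp_build_loop iterates over the SIMD lanes; each active lane extracts its own buffer index and offset, which is what makes non-uniform SSBO indices work, and per-component scalars are accumulated into the result[c] allocas via insertelement, with inactive or out-of-bounds lanes zero-filled.) */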
+ temp_res = LLVMBuildLoad(builder, result[c], ""); temp_res = LLVMBuildInsertElement(builder, temp_res, scalar, loop_state.counter, ""); - LLVMBuildStore(builder, temp_res, result); + LLVMBuildStore(builder, temp_res, result[c]); lp_build_else(&ifthen); - temp_res = LLVMBuildLoad(builder, result, ""); + temp_res = LLVMBuildLoad(builder, result[c], ""); LLVMValueRef zero; if (bit_size == 64) zero = LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 0, 0); @@ -1101,12 +1111,16 @@ static void emit_load_mem(struct lp_build_nir_context *bld_base, else zero = lp_build_const_int32(gallivm, 0); temp_res = LLVMBuildInsertElement(builder, temp_res, zero, loop_state.counter, ""); - LLVMBuildStore(builder, temp_res, result); + LLVMBuildStore(builder, temp_res, result[c]); lp_build_endif(&ifthen); - lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), - NULL, LLVMIntUGE); - outval[c] = LLVMBuildLoad(gallivm->builder, result, ""); } + + lp_build_endif(&exec_ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + for (unsigned c = 0; c < nc; c++) + outval[c] = LLVMBuildLoad(gallivm->builder, result[c], ""); + } static void emit_store_mem(struct lp_build_nir_context *bld_base, @@ -1120,56 +1134,66 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base, struct gallivm_state *gallivm = bld_base->base.gallivm; struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; - LLVMValueRef ssbo_ptr; + LLVMValueRef mem_ptr; struct lp_build_context *uint_bld = &bld_base->uint_bld; LLVMValueRef ssbo_limit = NULL; struct lp_build_context *store_bld; uint32_t shift_val = bit_size_to_shift_size(bit_size); store_bld = get_int_bld(bld_base, true, bit_size); - if (index) { - LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); - ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), ""); - ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); - ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); - } else - ssbo_ptr = bld->shared_ptr; - offset = lp_build_shr_imm(uint_bld, offset, shift_val); + + LLVMValueRef exec_mask = mask_vec(bld_base); + LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); + struct lp_build_loop_state loop_state; + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + LLVMValueRef loop_cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + LLVMValueRef loop_offset = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, ""); + + struct lp_build_if_state exec_ifthen; + lp_build_if(&exec_ifthen, gallivm, loop_cond); + + if (index) { + LLVMValueRef ssbo_idx = LLVMBuildExtractElement(gallivm->builder, index, loop_state.counter, ""); + LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, ssbo_idx); + LLVMValueRef ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, ssbo_idx); + ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), ""); + mem_ptr = ssbo_ptr; + } else + mem_ptr = bld->shared_ptr; + for (unsigned c = 0; c < nc; c++) { if (!(writemask & (1u << c))) continue; - LLVMValueRef 
loop_index = lp_build_add(uint_bld, offset, lp_build_const_int_vec(gallivm, uint_bld->type, c)); + LLVMValueRef loop_index = LLVMBuildAdd(builder, loop_offset, lp_build_const_int32(gallivm, c), ""); LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, ""); + LLVMValueRef do_store = lp_build_const_int32(gallivm, -1); - LLVMValueRef exec_mask = mask_vec(bld_base); if (ssbo_limit) { - LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, loop_index, ssbo_limit); - exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); + LLVMValueRef ssbo_oob_cmp = lp_build_compare(gallivm, lp_elem_type(uint_bld->type), PIPE_FUNC_LESS, loop_index, ssbo_limit); + do_store = LLVMBuildAnd(builder, do_store, ssbo_oob_cmp, ""); } - struct lp_build_loop_state loop_state; - lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val, loop_state.counter, ""); value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, store_bld->elem_type, ""); struct lp_build_if_state ifthen; - LLVMValueRef cond; + LLVMValueRef store_cond; - loop_index = LLVMBuildExtractElement(gallivm->builder, loop_index, - loop_state.counter, ""); - cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); - cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); - lp_build_if(&ifthen, gallivm, cond); + store_cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, do_store, lp_build_const_int32(gallivm, 0), ""); + lp_build_if(&ifthen, gallivm, store_cond); if (bit_size != 32) { - LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(store_bld->elem_type, 0), ""); - lp_build_pointer_set(builder, ssbo_ptr2, loop_index, value_ptr); + LLVMValueRef mem_ptr2 = LLVMBuildBitCast(builder, mem_ptr, LLVMPointerType(store_bld->elem_type, 0), ""); + lp_build_pointer_set(builder, mem_ptr2, loop_index, value_ptr); } else - lp_build_pointer_set(builder, ssbo_ptr, loop_index, value_ptr); + lp_build_pointer_set(builder, mem_ptr, loop_index, value_ptr); lp_build_endif(&ifthen); - lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), - NULL, LLVMIntUGE); } + + lp_build_endif(&exec_ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), + NULL, LLVMIntUGE); + } static void emit_atomic_mem(struct lp_build_nir_context *bld_base, @@ -1182,52 +1206,58 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base, struct gallivm_state *gallivm = bld_base->base.gallivm; struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base; LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; - LLVMValueRef ssbo_ptr; struct lp_build_context *uint_bld = &bld_base->uint_bld; LLVMValueRef ssbo_limit = NULL; uint32_t shift_val = bit_size_to_shift_size(bit_size); struct lp_build_context *atomic_bld = get_int_bld(bld_base, true, bit_size); - if (index) { - LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); - ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, 2), ""); - ssbo_limit = lp_build_broadcast_scalar(uint_bld, ssbo_limit); - ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, LLVMBuildExtractElement(builder, index, lp_build_const_int32(gallivm, 0), "")); - } else - ssbo_ptr = bld->shared_ptr; offset = lp_build_shr_imm(uint_bld, 
offset, shift_val); LLVMValueRef atom_res = lp_build_alloca(gallivm, atomic_bld->vec_type, ""); LLVMValueRef exec_mask = mask_vec(bld_base); - if (ssbo_limit) { - LLVMValueRef ssbo_oob_cmp = lp_build_cmp(uint_bld, PIPE_FUNC_LESS, offset, ssbo_limit); - exec_mask = LLVMBuildAnd(builder, exec_mask, ssbo_oob_cmp, ""); - } - + LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); struct lp_build_loop_state loop_state; lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + LLVMValueRef loop_cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); + LLVMValueRef loop_offset = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, ""); + + struct lp_build_if_state exec_ifthen; + lp_build_if(&exec_ifthen, gallivm, loop_cond); + + LLVMValueRef mem_ptr; + if (index) { + LLVMValueRef ssbo_idx = LLVMBuildExtractElement(gallivm->builder, index, loop_state.counter, ""); + LLVMValueRef ssbo_size_ptr = lp_build_array_get(gallivm, bld->ssbo_sizes_ptr, ssbo_idx); + LLVMValueRef ssbo_ptr = lp_build_array_get(gallivm, bld->ssbo_ptr, ssbo_idx); + ssbo_limit = LLVMBuildAShr(gallivm->builder, ssbo_size_ptr, lp_build_const_int32(gallivm, shift_val), ""); + mem_ptr = ssbo_ptr; + } else + mem_ptr = bld->shared_ptr; + + LLVMValueRef do_fetch = lp_build_const_int32(gallivm, -1); + if (ssbo_limit) { + LLVMValueRef ssbo_oob_cmp = lp_build_compare(gallivm, lp_elem_type(uint_bld->type), PIPE_FUNC_LESS, loop_offset, ssbo_limit); + do_fetch = LLVMBuildAnd(builder, do_fetch, ssbo_oob_cmp, ""); + } LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val, loop_state.counter, ""); value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, atomic_bld->elem_type, ""); - offset = LLVMBuildExtractElement(gallivm->builder, offset, - loop_state.counter, ""); - LLVMValueRef scalar_ptr; if (bit_size != 32) { - LLVMValueRef ssbo_ptr2 = LLVMBuildBitCast(builder, ssbo_ptr, LLVMPointerType(atomic_bld->elem_type, 0), ""); - scalar_ptr = LLVMBuildGEP(builder, ssbo_ptr2, &offset, 1, ""); + LLVMValueRef mem_ptr2 = LLVMBuildBitCast(builder, mem_ptr, LLVMPointerType(atomic_bld->elem_type, 0), ""); + scalar_ptr = LLVMBuildGEP(builder, mem_ptr2, &loop_offset, 1, ""); } else - scalar_ptr = LLVMBuildGEP(builder, ssbo_ptr, &offset, 1, ""); + scalar_ptr = LLVMBuildGEP(builder, mem_ptr, &loop_offset, 1, ""); struct lp_build_if_state ifthen; - LLVMValueRef cond, temp_res; + LLVMValueRef inner_cond, temp_res; LLVMValueRef scalar; - cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, ""); - cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, ""); - lp_build_if(&ifthen, gallivm, cond); + + inner_cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, do_fetch, lp_build_const_int32(gallivm, 0), ""); + lp_build_if(&ifthen, gallivm, inner_cond); if (nir_op == nir_intrinsic_ssbo_atomic_comp_swap || nir_op == nir_intrinsic_shared_atomic_comp_swap) { LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, val2, @@ -1297,6 +1327,7 @@ static void emit_atomic_mem(struct lp_build_nir_context *bld_base, LLVMBuildStore(builder, temp_res, atom_res); lp_build_endif(&ifthen); + lp_build_endif(&exec_ifthen); lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, uint_bld->type.length), NULL, LLVMIntUGE); *result = LLVMBuildLoad(builder, atom_res, ""); @@ -1999,36 +2030,106 @@ static void emit_reduce(struct lp_build_nir_context *bld_base, LLVMValueRef src, switch (reduction_op) { case 
nir_op_fmin: { LLVMValueRef flt_max = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), INFINITY) : - lp_build_const_float(gallivm, INFINITY); + (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), INFINITY) : lp_build_const_float(gallivm, INFINITY)); store_val = LLVMBuildBitCast(builder, flt_max, int_bld->elem_type, ""); break; } case nir_op_fmax: { LLVMValueRef flt_min = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), -INFINITY) : - lp_build_const_float(gallivm, -INFINITY); + (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), -INFINITY) : lp_build_const_float(gallivm, -INFINITY)); store_val = LLVMBuildBitCast(builder, flt_min, int_bld->elem_type, ""); break; } case nir_op_fmul: { LLVMValueRef flt_one = bit_size == 64 ? LLVMConstReal(LLVMDoubleTypeInContext(gallivm->context), 1.0) : - lp_build_const_float(gallivm, 1.0); + (bit_size == 16 ? LLVMConstReal(LLVMHalfTypeInContext(gallivm->context), 1.0) : lp_build_const_float(gallivm, 1.0)); store_val = LLVMBuildBitCast(builder, flt_one, int_bld->elem_type, ""); break; } case nir_op_umin: - store_val = lp_build_const_int32(gallivm, UINT_MAX); + switch (bit_size) { + case 8: + store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), UINT8_MAX, 0); + break; + case 16: + store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), UINT16_MAX, 0); + break; + case 32: + default: + store_val = lp_build_const_int32(gallivm, UINT_MAX); + break; + case 64: + store_val = lp_build_const_int64(gallivm, UINT64_MAX); + break; + } break; case nir_op_imin: - store_val = lp_build_const_int32(gallivm, INT_MAX); + switch (bit_size) { + case 8: + store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), INT8_MAX, 0); + break; + case 16: + store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), INT16_MAX, 0); + break; + case 32: + default: + store_val = lp_build_const_int32(gallivm, INT_MAX); + break; + case 64: + store_val = lp_build_const_int64(gallivm, INT64_MAX); + break; + } break; case nir_op_imax: - store_val = lp_build_const_int32(gallivm, INT_MIN); + switch (bit_size) { + case 8: + store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), INT8_MIN, 0); + break; + case 16: + store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), INT16_MIN, 0); + break; + case 32: + default: + store_val = lp_build_const_int32(gallivm, INT_MIN); + break; + case 64: + store_val = lp_build_const_int64(gallivm, INT64_MIN); + break; + } break; case nir_op_imul: - store_val = lp_build_const_int32(gallivm, 1); + switch (bit_size) { + case 8: + store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 1, 0); + break; + case 16: + store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), 1, 0); + break; + case 32: + default: + store_val = lp_build_const_int32(gallivm, 1); + break; + case 64: + store_val = lp_build_const_int64(gallivm, 1); + break; + } break; case nir_op_iand: - store_val = lp_build_const_int32(gallivm, 0xffffffff); + switch (bit_size) { + case 8: + store_val = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), 0xff, 0); + break; + case 16: + store_val = LLVMConstInt(LLVMInt16TypeInContext(gallivm->context), 0xffff, 0); + break; + case 32: + default: + store_val = lp_build_const_int32(gallivm, 0xffffffff); + break; + case 64: + store_val = lp_build_const_int64(gallivm, 0xffffffffffffffffLL); + break; + } break; default: break; @@ -2119,28 +2220,27 @@ static void emit_read_invocation(struct 
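/* Editor's note: the switch above only materializes the identity element of
 * each reduction op at the source bit size, i.e. the x with op(x, y) == y for
 * all y.  For the integer ops the table could equally be computed; a sketch
 * with hypothetical helpers (bits in {8, 16, 32, 64}):
 *
 *   #include <stdint.h>
 *   static uint64_t umin_ident(unsigned bits)  // all-ones == UINTn_MAX
 *   { return bits == 64 ? UINT64_MAX : (UINT64_C(1) << bits) - 1; }
 *   static uint64_t imin_ident(unsigned bits)  // INTn_MAX
 *   { return (UINT64_C(1) << (bits - 1)) - 1; }
 *   static uint64_t imax_ident(unsigned bits)  // INTn_MIN bit pattern
 *   { return UINT64_C(1) << (bits - 1); }
 *
 * iand uses all-ones, imul/fmul use 1, fmin uses +INF and fmax uses -INF,
 * exactly as the cases above spell out per width.
 */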
lp_build_nir_context *bld_base, LLVMBuilderRef builder = gallivm->builder; LLVMValueRef idx; struct lp_build_context *uint_bld = get_int_bld(bld_base, true, bit_size); - if (invoc) { - idx = invoc; - idx = LLVMBuildExtractElement(gallivm->builder, idx, lp_build_const_int32(gallivm, 0), ""); - } else { - /* have to find the first active invocation */ - LLVMValueRef exec_mask = mask_vec(bld_base); - struct lp_build_loop_state loop_state; - LLVMValueRef res_store = lp_build_alloca(gallivm, bld_base->int_bld.elem_type, ""); - LLVMValueRef outer_cond = LLVMBuildICmp(builder, LLVMIntNE, exec_mask, bld_base->uint_bld.zero, ""); - lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, bld_base->uint_bld.type.length)); - LLVMValueRef if_cond = LLVMBuildExtractElement(gallivm->builder, outer_cond, loop_state.counter, ""); - struct lp_build_if_state ifthen; + /* have to find the first active invocation */ + LLVMValueRef exec_mask = mask_vec(bld_base); + struct lp_build_loop_state loop_state; + LLVMValueRef res_store = lp_build_alloca(gallivm, bld_base->int_bld.elem_type, ""); + LLVMValueRef outer_cond = LLVMBuildICmp(builder, LLVMIntNE, exec_mask, bld_base->uint_bld.zero, ""); + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, bld_base->uint_bld.type.length)); - lp_build_if(&ifthen, gallivm, if_cond); - LLVMBuildStore(builder, loop_state.counter, res_store); - lp_build_endif(&ifthen); + LLVMValueRef if_cond = LLVMBuildExtractElement(gallivm->builder, outer_cond, loop_state.counter, ""); + struct lp_build_if_state ifthen; - lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, -1), - lp_build_const_int32(gallivm, -1), LLVMIntEQ); - idx = LLVMBuildLoad(builder, res_store, ""); - } + lp_build_if(&ifthen, gallivm, if_cond); + LLVMValueRef store_val = loop_state.counter; + if (invoc) + store_val = LLVMBuildExtractElement(gallivm->builder, invoc, loop_state.counter, ""); + LLVMBuildStore(builder, store_val, res_store); + lp_build_endif(&ifthen); + + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, -1), + lp_build_const_int32(gallivm, -1), LLVMIntEQ); + idx = LLVMBuildLoad(builder, res_store, ""); LLVMValueRef value = LLVMBuildExtractElement(gallivm->builder, src, idx, ""); @@ -2325,6 +2425,12 @@ void lp_build_nir_soa(struct gallivm_state *gallivm, dbl_type.width *= 2; lp_build_context_init(&bld.bld_base.dbl_bld, gallivm, dbl_type); } + { + struct lp_type half_type; + half_type = type; + half_type.width /= 2; + lp_build_context_init(&bld.bld_base.half_bld, gallivm, half_type); + } { struct lp_type uint64_type; uint64_type = lp_uint_type(type); diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index d2053c32b1..b3b7baa229 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -258,7 +258,7 @@ struct lp_build_fs_iface { void (*fb_fetch)(const struct lp_build_fs_iface *iface, struct lp_build_context *bld, - unsigned cbuf, + int location, LLVMValueRef result[4]); }; diff --git a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_type.c b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_type.c index da139a838e..a261ae9814 100644 --- a/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_type.c +++ b/mesa 3D driver/src/gallium/auxiliary/gallivm/lp_bld_type.c @@ -31,7 +31,7 @@ #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_init.h" - +#include 
"lp_bld_limits.h" LLVMTypeRef lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type) @@ -39,7 +39,7 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type) if (type.floating) { switch(type.width) { case 16: - return LLVMIntTypeInContext(gallivm->context, 16); + return lp_has_fp16() ? LLVMHalfTypeInContext(gallivm->context) : LLVMInt16TypeInContext(gallivm->context); break; case 32: return LLVMFloatTypeInContext(gallivm->context); @@ -89,7 +89,7 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) if (type.floating) { switch(type.width) { case 16: - if(elem_kind != LLVMIntegerTypeKind) + if(elem_kind != (lp_has_fp16() ? LLVMHalfTypeKind : LLVMIntegerTypeKind)) return FALSE; break; case 32: @@ -259,6 +259,8 @@ lp_sizeof_llvm_type(LLVMTypeRef t) return 8 * sizeof(float); case LLVMDoubleTypeKind: return 8 * sizeof(double); + case LLVMHalfTypeKind: + return 8 * sizeof(uint16_t); case LLVMVectorTypeKind: { LLVMTypeRef elem = LLVMGetElementType(t); @@ -291,6 +293,8 @@ lp_typekind_name(LLVMTypeKind t) return "LLVMVoidTypeKind"; case LLVMFloatTypeKind: return "LLVMFloatTypeKind"; + case LLVMHalfTypeKind: + return "LLVMHalfTypeKind"; case LLVMDoubleTypeKind: return "LLVMDoubleTypeKind"; case LLVMX86_FP80TypeKind: diff --git a/mesa 3D driver/src/gallium/auxiliary/hud/hud_context.c b/mesa 3D driver/src/gallium/auxiliary/hud/hud_context.c index 5d6b5690b7..b1887aa849 100644 --- a/mesa 3D driver/src/gallium/auxiliary/hud/hud_context.c +++ b/mesa 3D driver/src/gallium/auxiliary/hud/hud_context.c @@ -541,7 +541,7 @@ hud_draw_results(struct hud_context *hud, struct pipe_resource *tex) cso_set_vertex_shader_handle(cso, hud->vs_color); cso_set_vertex_elements(cso, &hud->velems); cso_set_render_condition(cso, NULL, FALSE, 0); - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &hud->font_sampler_view); cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, 1, sampler_states); pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, false, &hud->constbuf); @@ -609,12 +609,7 @@ hud_draw_results(struct hud_context *hud, struct pipe_resource *tex) } done: - cso_restore_state(cso); - - /* Unbind resources that we have bound. 
*/ - pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, false, NULL); - pipe->set_vertex_buffers(pipe, 0, 0, 1, false, NULL); - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 0, 1, NULL); + cso_restore_state(cso, CSO_UNBIND_FS_SAMPLERVIEW0 | CSO_UNBIND_VS_CONSTANTS | CSO_UNBIND_VERTEX_BUFFER0); /* restore states not restored by cso */ if (hud->st) { diff --git a/mesa 3D driver/src/gallium/auxiliary/indices/u_indices.c b/mesa 3D driver/src/gallium/auxiliary/indices/u_indices.c index b61674334c..d0a5fb8c12 100644 --- a/mesa 3D driver/src/gallium/auxiliary/indices/u_indices.c +++ b/mesa 3D driver/src/gallium/auxiliary/indices/u_indices.c @@ -85,6 +85,8 @@ u_index_prim_type_convert(unsigned hw_mask, enum pipe_prim_type prim, bool pv_ma case PIPE_PRIM_TRIANGLES_ADJACENCY: case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: return PIPE_PRIM_TRIANGLES_ADJACENCY; + case PIPE_PRIM_PATCHES: + return PIPE_PRIM_PATCHES; default: assert(0); break; @@ -173,6 +175,7 @@ u_index_count_converted_indices(unsigned hw_mask, bool pv_matches, enum pipe_pri switch (prim) { case PIPE_PRIM_POINTS: + case PIPE_PRIM_PATCHES: return nr; case PIPE_PRIM_LINES: return nr; diff --git a/mesa 3D driver/src/gallium/auxiliary/indices/u_primconvert.c b/mesa 3D driver/src/gallium/auxiliary/indices/u_primconvert.c index d8704237e4..62956910aa 100644 --- a/mesa 3D driver/src/gallium/auxiliary/indices/u_primconvert.c +++ b/mesa 3D driver/src/gallium/auxiliary/indices/u_primconvert.c @@ -179,10 +179,12 @@ util_primconvert_draw_vbo(struct primconvert_context *pc, src = (const uint8_t *)src; /* if the resulting primitive type is not supported by the driver for primitive restart, + * or if the original primitive type was not supported by the driver, * the draw needs to be rewritten to not use primitive restart */ if (info->primitive_restart && - !(pc->cfg.restart_primtypes_mask & BITFIELD_BIT(mode))) { + (!(pc->cfg.restart_primtypes_mask & BITFIELD_BIT(mode)) || + !(pc->cfg.primtypes_mask & BITFIELD_BIT(info->mode)))) { /* step 1: rewrite draw to not use primitive primitive restart; * this pre-filters degenerate primitives */ diff --git a/mesa 3D driver/src/gallium/auxiliary/meson.build b/mesa 3D driver/src/gallium/auxiliary/meson.build index ca272fe428..7682357edf 100644 --- a/mesa 3D driver/src/gallium/auxiliary/meson.build +++ b/mesa 3D driver/src/gallium/auxiliary/meson.build @@ -238,14 +238,10 @@ files_libgallium = files( 'util/u_cache.h', 'util/u_compute.c', 'util/u_compute.h', - 'util/u_debug_describe.c', - 'util/u_debug_describe.h', 'util/u_debug_flush.c', 'util/u_debug_flush.h', 'util/u_debug_image.c', 'util/u_debug_image.h', - 'util/u_debug_refcnt.c', - 'util/u_debug_refcnt.h', 'util/u_dirty_flags.h', 'util/u_dirty_surfaces.h', 'util/u_dl.c', @@ -260,15 +256,12 @@ files_libgallium = files( 'util/u_dump_defines.c', 'util/u_dump.h', 'util/u_dump_state.c', - 'util/u_fifo.h', 'util/u_framebuffer.c', 'util/u_framebuffer.h', 'util/u_gen_mipmap.c', 'util/u_gen_mipmap.h', 'util/u_handle_table.c', 'util/u_handle_table.h', - 'util/u_hash_table.c', - 'util/u_hash_table.h', 'util/u_helpers.c', 'util/u_helpers.h', 'util/u_index_modify.c', @@ -315,9 +308,6 @@ files_libgallium = files( 'util/u_texture.h', 'util/u_tile.c', 'util/u_tile.h', - 'util/u_trace.c', - 'util/u_trace.h', - 'util/u_trace_priv.h', 'util/u_transfer.c', 'util/u_transfer.h', 'util/u_transfer_helper.c', @@ -325,10 +315,14 @@ files_libgallium = files( 'util/u_threaded_context.c', 'util/u_threaded_context.h', 'util/u_threaded_context_calls.h', + 'util/u_trace_gallium.c', + 
'util/u_trace_gallium.h', 'util/u_upload_mgr.c', 'util/u_upload_mgr.h', 'util/u_vbuf.c', 'util/u_vbuf.h', + 'util/u_vertex_state_cache.c', + 'util/u_vertex_state_cache.h', 'util/u_video.h', 'util/u_viewport.h', 'nir/tgsi_to_nir.c', @@ -484,15 +478,13 @@ if with_dri2 and with_platform_x11 endif endif -u_trace_py = files('util/u_trace.py') - files_libgallium += custom_target( 'u_tracepoints.c', input: 'util/u_tracepoints.py', output: 'u_tracepoints.c', command: [ prog_python, '@INPUT@', - '-p', join_paths(meson.source_root(), 'src/gallium/auxiliary/util/'), + '-p', join_paths(meson.source_root(), 'src/util/perf/'), '-C', '@OUTPUT@', ], depend_files: u_trace_py, @@ -504,7 +496,7 @@ files_u_tracepoints = custom_target( output: 'u_tracepoints.h', command: [ prog_python, '@INPUT@', - '-p', join_paths(meson.source_root(), 'src/gallium/auxiliary/util/'), + '-p', join_paths(meson.source_root(), 'src/util/perf/'), '-H', '@OUTPUT@', ], depend_files: u_trace_py, diff --git a/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi.c b/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi.c index 135793dbde..3919485c47 100644 --- a/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi.c +++ b/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi.c @@ -54,7 +54,7 @@ struct ntt_compile { /* TGSI temps for our NIR SSA and register values. */ struct ureg_dst *reg_temp; - struct ureg_dst *ssa_temp; + struct ureg_src *ssa_temp; nir_instr_liveness *liveness; @@ -66,6 +66,8 @@ struct ntt_compile { struct ureg_src *input_index_map; uint64_t centroid_inputs; + uint32_t first_ubo; + struct ureg_src images[PIPE_MAX_SHADER_IMAGES]; }; @@ -185,7 +187,7 @@ ntt_tgsi_var_usage_mask(const struct nir_variable *var) } static struct ureg_dst -ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac) +ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac) { nir_io_semantics semantics = nir_intrinsic_io_semantics(instr); int base = nir_intrinsic_base(instr); @@ -194,9 +196,6 @@ ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_ struct ureg_dst out; if (c->s->info.stage == MESA_SHADER_FRAGMENT) { - if (semantics.location == FRAG_RESULT_COLOR) - ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1); - unsigned semantic_name, semantic_index; tgsi_get_gl_frag_result_semantic(semantics.location, &semantic_name, &semantic_index); @@ -247,7 +246,11 @@ ntt_store_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_ invariant); } - unsigned write_mask = nir_intrinsic_write_mask(instr); + unsigned write_mask; + if (nir_intrinsic_has_write_mask(instr)) + write_mask = nir_intrinsic_write_mask(instr); + else + write_mask = ((1 << instr->num_components) - 1) << *frac; if (is_64) { write_mask = ntt_64bit_write_mask(write_mask); @@ -296,7 +299,7 @@ ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst, } uint32_t frac; - *dst = ntt_store_output_decl(c, intr, &frac); + *dst = ntt_output_decl(c, intr, &frac); dst->Index += ntt_src_as_uint(c, intr->src[1]); return frac == 0; @@ -360,15 +363,14 @@ ntt_setup_inputs(struct ntt_compile *c) uint32_t usage_mask = ntt_tgsi_var_usage_mask(var); - decl = ureg_DECL_fs_input_cyl_centroid_layout(c->ureg, - semantic_name, - semantic_index, - interpolation, - 0, - sample_loc, - var->data.driver_location, - usage_mask, - array_id, array_len); + decl = ureg_DECL_fs_input_centroid_layout(c->ureg, + semantic_name, + semantic_index, + interpolation, + sample_loc, + 
var->data.driver_location, + usage_mask, + array_id, array_len); if (semantic_name == TGSI_SEMANTIC_FACE) { struct ureg_dst temp = ureg_DECL_temporary(c->ureg); @@ -384,20 +386,65 @@ ntt_setup_inputs(struct ntt_compile *c) } } +static int +ntt_sort_by_location(const nir_variable *a, const nir_variable *b) +{ + return a->data.location - b->data.location; +} + +/** + * Workaround for virglrenderer requiring that TGSI FS output color variables + * are declared in order. Besides, it's a lot nicer to read the TGSI this way. + */ +static void +ntt_setup_outputs(struct ntt_compile *c) +{ + if (c->s->info.stage != MESA_SHADER_FRAGMENT) + return; + + nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out); + + nir_foreach_shader_out_variable(var, c->s) { + if (var->data.location == FRAG_RESULT_COLOR) + ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1); + + unsigned semantic_name, semantic_index; + tgsi_get_gl_frag_result_semantic(var->data.location, + &semantic_name, &semantic_index); + + (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index); + } +} + static enum tgsi_texture_type -tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array) +tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow) { switch (dim) { case GLSL_SAMPLER_DIM_1D: - return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D; + if (is_shadow) + return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D; + else + return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D; case GLSL_SAMPLER_DIM_2D: - return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D; + case GLSL_SAMPLER_DIM_EXTERNAL: + if (is_shadow) + return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D; + else + return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D; case GLSL_SAMPLER_DIM_3D: return TGSI_TEXTURE_3D; case GLSL_SAMPLER_DIM_CUBE: - return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE; + if (is_shadow) + return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE; + else + return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE; case GLSL_SAMPLER_DIM_RECT: - return TGSI_TEXTURE_RECT; + if (is_shadow) + return TGSI_TEXTURE_SHADOWRECT; + else + return TGSI_TEXTURE_RECT; + case GLSL_SAMPLER_DIM_MS: + return is_array ? 
TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA; case GLSL_SAMPLER_DIM_BUF: return TGSI_TEXTURE_BUFFER; default: @@ -405,44 +452,102 @@ tgsi_target_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array) } } +static enum tgsi_return_type +tgsi_return_type_from_base_type(enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + return TGSI_RETURN_TYPE_SINT; + case GLSL_TYPE_UINT: + return TGSI_RETURN_TYPE_UINT; + case GLSL_TYPE_FLOAT: + return TGSI_RETURN_TYPE_FLOAT; + default: + unreachable("unexpected texture type"); + } +} + static void ntt_setup_uniforms(struct ntt_compile *c) { - struct pipe_screen *screen = c->screen; - bool packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS); - nir_foreach_uniform_variable(var, c->s) { - if (glsl_type_is_image(var->type)) { - enum tgsi_texture_type tex_type = - tgsi_target_from_sampler_dim(glsl_get_sampler_dim(var->type), - glsl_sampler_type_is_array(var->type)); + if (glsl_type_is_sampler(glsl_without_array(var->type)) || + glsl_type_is_texture(glsl_without_array(var->type))) { + /* Don't use this size for the check for samplers -- arrays of structs + * containing samplers should be ignored, and just the separate lowered + * sampler uniform decl used. + */ + int size = glsl_type_get_sampler_count(var->type) + + glsl_type_get_texture_count(var->type); - c->images[var->data.binding] = ureg_DECL_image(c->ureg, - var->data.binding, - tex_type, - var->data.image.format, - !(var->data.access & ACCESS_NON_WRITEABLE), - false); + const struct glsl_type *stype = glsl_without_array(var->type); + enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype), + glsl_sampler_type_is_array(stype), + glsl_sampler_type_is_shadow(stype)); + enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype)); + for (int i = 0; i < size; i++) { + ureg_DECL_sampler_view(c->ureg, var->data.binding + i, + target, ret_type, ret_type, ret_type, ret_type); + ureg_DECL_sampler(c->ureg, var->data.binding + i); + } } else if (glsl_contains_atomic(var->type)) { uint32_t offset = var->data.offset / 4; uint32_t size = glsl_atomic_size(var->type) / 4; ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0); - } else { - unsigned size; - if (packed) { - size = DIV_ROUND_UP(glsl_count_dword_slots(var->type, - var->data.bindless), 4); - } else { - size = glsl_count_vec4_slots(var->type, false, var->data.bindless); - } + } - for (unsigned i = 0; i < size; i++) - ureg_DECL_constant(c->ureg, var->data.driver_location + i); + /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0 + * size declaration happens with other UBOs below. 
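 * Editor's note (worked example): with the explicit block size in bytes, the
 * declaration loop below emits DIV_ROUND_UP(size, 16) vec4 slots per UBO, so
 * a hypothetical 36-byte block declares constant slots 0..2, since
 * DIV_ROUND_UP(36, 16) - 1 == 3 - 1 == 2.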
+ */ + } + + nir_foreach_image_variable(var, c->s) { + int image_count = glsl_type_get_image_count(var->type); + const struct glsl_type *itype = glsl_without_array(var->type); + enum tgsi_texture_type tex_type = + tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype), + glsl_sampler_type_is_array(itype), false); + + for (int i = 0; i < image_count; i++) { + c->images[var->data.binding] = ureg_DECL_image(c->ureg, + var->data.binding + i, + tex_type, + var->data.image.format, + !(var->data.access & ACCESS_NON_WRITEABLE), + false); } } + c->first_ubo = ~0; + + unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0}; nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) { - ureg_DECL_constant2D(c->ureg, 0, 0, var->data.driver_location); + int ubo = var->data.driver_location; + if (ubo == -1) + continue; + + if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo)) + c->first_ubo = MIN2(c->first_ubo, ubo); + + unsigned size = glsl_get_explicit_size(var->interface_type, false); + + int array_size = 1; + if (glsl_type_is_interface(glsl_without_array(var->type))) + array_size = MAX2(1, glsl_array_size(var->type)); + for (int i = 0; i < array_size; i++) { + /* Even if multiple NIR variables are in the same uniform block, their + * explicit size is the size of the block. + */ + if (ubo_sizes[ubo + i]) + assert(ubo_sizes[ubo + i] == size); + + ubo_sizes[ubo + i] = size; + } + } + + for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) { + if (ubo_sizes[i]) + ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i); } for (int i = 0; i < c->s->info.num_ssbos; i++) { @@ -452,11 +557,6 @@ ntt_setup_uniforms(struct ntt_compile *c) bool atomic = false; ureg_DECL_buffer(c->ureg, i, atomic); } - - for (int i = 0; i < PIPE_MAX_SAMPLERS; i++) { - if (BITSET_TEST(c->s->info.textures_used, i)) - ureg_DECL_sampler(c->ureg, i); - } } static void @@ -570,7 +670,7 @@ ntt_get_src(struct ntt_compile *c, nir_src src) if (src.ssa->parent_instr->type == nir_instr_type_load_const) return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr)); - return ureg_src(c->ssa_temp[src.ssa->index]); + return c->ssa_temp[src.ssa->index]; } else { nir_register *reg = src.reg.reg; struct ureg_dst reg_temp = c->reg_temp[reg->index]; @@ -636,7 +736,7 @@ ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask) (write_mask & TGSI_WRITEMASK_W) ? 
TGSI_SWIZZLE_W : first_chan); } -static struct ureg_dst * +static struct ureg_dst ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa) { uint32_t writemask = BITSET_MASK(ssa->num_components); @@ -647,24 +747,24 @@ ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa) if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses)) dst = ureg_DECL_temporary(c->ureg); - c->ssa_temp[ssa->index] = ureg_writemask(dst, writemask); + c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask); - return &c->ssa_temp[ssa->index]; + return ureg_writemask(dst, writemask); } -static struct ureg_dst * +static struct ureg_dst ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest) { if (dest->is_ssa) return ntt_get_ssa_def_decl(c, &dest->ssa); else - return &c->reg_temp[dest->reg.reg->index]; + return c->reg_temp[dest->reg.reg->index]; } static struct ureg_dst ntt_get_dest(struct ntt_compile *c, nir_dest *dest) { - struct ureg_dst dst = *ntt_get_dest_decl(c, dest); + struct ureg_dst dst = ntt_get_dest_decl(c, dest); if (!dest->is_ssa) { dst.Index += dest->reg.base_offset; @@ -684,22 +784,18 @@ ntt_get_dest(struct ntt_compile *c, nir_dest *dest) static void ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src) { - if (!src.Negate && !src.Absolute && !src.Indirect && !src.DimIndirect && - src.SwizzleX == TGSI_SWIZZLE_X && - (src.SwizzleY == TGSI_SWIZZLE_Y || def->num_components < 2) && - (src.SwizzleZ == TGSI_SWIZZLE_Z || def->num_components < 3) && - (src.SwizzleW == TGSI_SWIZZLE_W || def->num_components < 4)) { + if (!src.Indirect && !src.DimIndirect) { switch (src.File) { case TGSI_FILE_IMMEDIATE: case TGSI_FILE_INPUT: case TGSI_FILE_CONSTANT: case TGSI_FILE_SYSTEM_VALUE: - c->ssa_temp[def->index] = ureg_dst(src); + c->ssa_temp[def->index] = src; return; } } - ureg_MOV(c->ureg, *ntt_get_ssa_def_decl(c, def), src); + ureg_MOV(c->ureg, ntt_get_ssa_def_decl(c, def), src); } static void @@ -1073,10 +1169,12 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr) /* NIR is src0 != 0 ? src1 : src2. * TGSI is src0 < 0 ? src1 : src2. * - * However, fcsel so far as I can find only appears on - * bools-as-floats (1.0 or 0.0), so we can negate it for the TGSI op. + * However, fcsel so far as I can find only appears on bools-as-floats + * (1.0 or 0.0), so we can just negate it for the TGSI op. It's + * important to not have an abs here, as i915g has to make extra + * instructions to do the abs. */ - ureg_CMP(c->ureg, dst, ureg_negate(ureg_abs(src[0])), src[1], src[2]); + ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[1], src[2]); break; /* It would be nice if we could get this left as scalar in NIR, since @@ -1226,7 +1324,25 @@ ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr) struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0); - src = ntt_ureg_src_dimension_indirect(c, src, instr->src[0]); + struct ureg_dst addr_temp = ureg_dst_undef(); + + if (nir_src_is_const(instr->src[0])) { + src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0])); + } else { + /* virglrenderer requires that indirect UBO references have the UBO + * array's base index in the Index field, not added to the indirect + * address. + * + * Many nir intrinsics have a base address const value for the start of + * their array indirection, but load_ubo doesn't. We fake it by + * subtracting it off here.
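 * Editor's note (worked example): for a hypothetical first_ubo == 3 and a
 * NIR index value u, the UADD below computes reladdr = u + (-3), and the
 * dimension declaration re-adds the base, so the consumer sees
 *
 *   CONST[3 + reladdr]  ==  CONST[u]
 *
 * with the UBO array's base index kept in the Index field as virglrenderer
 * expects.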
+ */ + addr_temp = ureg_DECL_temporary(c->ureg); + ureg_UADD(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo)); + src = ureg_src_dimension_indirect(src, + ntt_reladdr(c, ureg_src(addr_temp)), + c->first_ubo); + } if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) { /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const @@ -1264,6 +1380,8 @@ ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr) 0 /* format: unused */ ); } + + ureg_release_temporary(c->ureg, addr_temp); } static unsigned @@ -1295,6 +1413,7 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr, struct ureg_src src[4]; int num_src = 0; int nir_src; + struct ureg_dst addr_temp = ureg_dst_undef(); struct ureg_src memory; switch (mode) { @@ -1308,9 +1427,16 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr, nir_src = 0; break; case nir_var_uniform: { /* HW atomic buffers */ - uint32_t offset = nir_src_as_uint(instr->src[0]); - memory = ureg_src_dimension(ureg_src_register(TGSI_FILE_HW_ATOMIC, offset / 4), - nir_intrinsic_base(instr)); + memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, 0); + /* ntt_ureg_src_indirect, except dividing by 4 */ + if (nir_src_is_const(instr->src[0])) { + memory.Index += nir_src_as_uint(instr->src[0]) / 4; + } else { + addr_temp = ureg_DECL_temporary(c->ureg); + ureg_USHR(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, 2)); + memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp))); + } + memory = ureg_src_dimension(memory, nir_intrinsic_base(instr)); nir_src = 0; break; } @@ -1438,6 +1564,8 @@ ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr, qualifier, TGSI_TEXTURE_BUFFER, 0 /* format: unused */); + + ureg_release_temporary(c->ureg, addr_temp); } static void @@ -1451,7 +1579,7 @@ ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr) struct ureg_dst temp = ureg_dst_undef(); - enum tgsi_texture_type target = tgsi_target_from_sampler_dim(dim, is_array); + enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false); struct ureg_src resource = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0), @@ -1600,6 +1728,10 @@ ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr) switch (bary_instr->intrinsic) { case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_sample: + /* For these, we know that the barycentric load matches the + * interpolation on the input declaration, so we can use it directly. + */ ntt_store(c, &instr->dest, input); break; @@ -1617,9 +1749,9 @@ ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_barycentric_at_sample: + /* We stored the sample in the fake "bary" dest. 
*/ ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input, - ureg_imm1u(c->ureg, - ntt_src_as_uint(c, bary_instr->src[0]))); + ntt_get_src(c, instr->src[0])); break; case nir_intrinsic_load_barycentric_at_offset: @@ -1653,7 +1785,7 @@ ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr) } uint32_t frac; - struct ureg_dst out = ntt_store_output_decl(c, instr, &frac); + struct ureg_dst out = ntt_output_decl(c, instr, &frac); if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) { out = ntt_ureg_dst_indirect(c, out, instr->src[2]); @@ -1674,6 +1806,29 @@ ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr) ntt_reladdr_dst_put(c, out); } +static void +ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr) +{ + /* ntt_try_store_in_tgsi_output() optimization is not valid if load_output + * is present. + */ + assert(c->s->info.stage != MESA_SHADER_VERTEX && + c->s->info.stage != MESA_SHADER_FRAGMENT); + + uint32_t frac; + struct ureg_dst out = ntt_output_decl(c, instr, &frac); + + if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) { + out = ntt_ureg_dst_indirect(c, out, instr->src[1]); + out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]); + } else { + out = ntt_ureg_dst_indirect(c, out, instr->src[0]); + } + + ureg_MOV(c->ureg, ntt_get_dest(c, &instr->dest), ureg_src(out)); + ntt_reladdr_dst_put(c, out); +} + static void ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr) { @@ -1761,6 +1916,11 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr) ntt_emit_store_output(c, instr); break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: + ntt_emit_load_output(c, instr); + break; + case nir_intrinsic_discard: ureg_KILL(c->ureg); break; @@ -1848,6 +2008,7 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_control_barrier: + case nir_intrinsic_memory_barrier_tcs_patch: ureg_BARRIER(c->ureg); break; @@ -1893,14 +2054,14 @@ ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr) break; /* In TGSI we don't actually generate the barycentric coords, and emit - * interp intrinsics later. However, we do need to store the _at_offset - * argument so that we can use it at that point. + * interp intrinsics later. However, we do need to store the + * load_barycentric_at_* argument so that we can use it at that point. 
*/ case nir_intrinsic_load_barycentric_pixel: case nir_intrinsic_load_barycentric_centroid: - case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_sample: break; - + case nir_intrinsic_load_barycentric_at_sample: case nir_intrinsic_load_barycentric_at_offset: ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0])); break; @@ -1935,7 +2096,7 @@ static void ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr) { struct ureg_dst dst = ntt_get_dest(c, &instr->dest); - unsigned target; + enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow); unsigned tex_opcode; struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index); @@ -1999,79 +2160,12 @@ ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr) ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s); /* non-coord arg for TXQ */ - ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s); - - switch (instr->sampler_dim) { - case GLSL_SAMPLER_DIM_1D: - if (instr->is_array) { - if (instr->is_shadow) { - target = TGSI_TEXTURE_SHADOW1D_ARRAY; - } else { - target = TGSI_TEXTURE_1D_ARRAY; - } - } else { - if (instr->is_shadow) { - target = TGSI_TEXTURE_SHADOW1D; - } else { - target = TGSI_TEXTURE_1D; - } - } - break; - case GLSL_SAMPLER_DIM_2D: - case GLSL_SAMPLER_DIM_EXTERNAL: - if (instr->is_array) { - if (instr->is_shadow) { - target = TGSI_TEXTURE_SHADOW2D_ARRAY; - } else { - target = TGSI_TEXTURE_2D_ARRAY; - } - } else { - if (instr->is_shadow) { - target = TGSI_TEXTURE_SHADOW2D; - } else { - target = TGSI_TEXTURE_2D; - } - } - break; - case GLSL_SAMPLER_DIM_MS: - if (instr->is_array) { - target = TGSI_TEXTURE_2D_ARRAY_MSAA; - } else { - target = TGSI_TEXTURE_2D_ARRAY; - } - break; - case GLSL_SAMPLER_DIM_3D: - assert(!instr->is_shadow); - target = TGSI_TEXTURE_3D; - break; - case GLSL_SAMPLER_DIM_RECT: - if (instr->is_shadow) { - target = TGSI_TEXTURE_SHADOWRECT; - } else { - target = TGSI_TEXTURE_RECT; - } - break; - case GLSL_SAMPLER_DIM_CUBE: - if (instr->is_array) { - if (instr->is_shadow) { - target = TGSI_TEXTURE_SHADOWCUBE_ARRAY; - } else { - target = TGSI_TEXTURE_CUBE_ARRAY; - } - } else { - if (instr->is_shadow) { - target = TGSI_TEXTURE_SHADOWCUBE; - } else { - target = TGSI_TEXTURE_CUBE; - } - } - break; - case GLSL_SAMPLER_DIM_BUF: - target = TGSI_TEXTURE_BUFFER; - break; - default: - fprintf(stderr, "Unknown sampler dimensions: %d\n", instr->sampler_dim); - abort(); + if (tex_opcode == TGSI_OPCODE_TXQ) { + ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s); + /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's + * scalar + */ + s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0); } if (s.i > 1) { @@ -2265,7 +2359,7 @@ ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index) if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY) return; - ureg_release_temporary(c->ureg, c->ssa_temp[index]); + ureg_release_temporary(c->ureg, ureg_dst(c->ssa_temp[index])); memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index])); } @@ -2350,7 +2444,7 @@ ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl) c->impl = impl; c->liveness = nir_live_ssa_defs_per_instr(impl); - c->ssa_temp = rzalloc_array(c, struct ureg_dst, impl->ssa_alloc); + c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc); c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc); ntt_setup_registers(c, &impl->registers); @@ -2745,6 +2839,20 @@ nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data) 
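/* Editor's note: the hunk below demotes txl with a constant LOD of 0 back to
 * plain tex when the stage lacks implicit derivatives; a model over a
 * hypothetical instruction struct (not the NIR API):
 *
 *   enum tex_op { OP_TEX, OP_TXL };
 *   struct tex_instr { enum tex_op op; _Bool lod_is_const_zero; };
 *   static void demote_txl0(struct tex_instr *t, _Bool has_implicit_lod)
 *   {
 *      // TGSI expects the backend to apply implicit LOD 0 in such stages,
 *      // so drop the explicit LOD source and switch TXL back to TEX.
 *      if (!has_implicit_lod && t->op == OP_TXL && t->lod_is_const_zero)
 *         t->op = OP_TEX;
 *   }
 */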
if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0) return false; + /* NIR after lower_tex will have LOD set to 0 for tex ops that wanted + * implicit lod in shader stages that don't have quad-based derivatives. + * TGSI doesn't want that; it requires that the backend do implicit LOD 0 for + * those stages. + */ + if (!nir_shader_supports_implicit_lod(b->shader) && tex->op == nir_texop_txl) { + int lod_index = nir_tex_instr_src_index(tex, nir_tex_src_lod); + nir_src *lod_src = &tex->src[lod_index].src; + if (nir_src_is_const(*lod_src) && nir_src_as_uint(*lod_src) == 0) { + nir_tex_instr_remove_src(tex, lod_index); + tex->op = nir_texop_tex; + } + } + b->cursor = nir_before_instr(instr); struct ntt_lower_tex_state s = {0}; @@ -2761,6 +2869,7 @@ nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data) /* XXX: LZ */ nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s); nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s); + nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s); /* No need to pack undefs in unused channels of the tex instr */ while (!s.channels[s.i - 1]) @@ -2900,6 +3009,45 @@ nir_to_tgsi_lower_txp(nir_shader *s) NIR_PASS_V(s, nir_lower_tex, &lower_tex_options); } +static bool +nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data) +{ + return (instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id); +} + +static nir_ssa_def * +nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data) +{ + nir_variable *var = *(nir_variable **)data; + if (!var) { + var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID"); + var->data.location = VARYING_SLOT_PRIMITIVE_ID; + b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID; + var->data.driver_location = b->shader->num_outputs++; + + *(nir_variable **)data = var; + } + + nir_io_semantics semantics = { + .location = var->data.location, + .num_slots = 1 + }; + return nir_load_input(b, 1, 32, nir_imm_int(b, 0), + .base = var->data.driver_location, + .io_semantics = semantics); +} + +static bool +nir_lower_primid_sysval_to_input(nir_shader *s) +{ + nir_variable *input = NULL; + + return nir_shader_lower_instructions(s, + nir_lower_primid_sysval_to_input_filter, + nir_lower_primid_sysval_to_input_lower, &input); +} + /** * Translates the NIR shader to TGSI. * @@ -2929,6 +3077,13 @@ nir_to_tgsi(struct nir_shader *s, nir_to_tgsi_lower_txp(s); NIR_PASS_V(s, nir_to_tgsi_lower_tex); + /* While TGSI can represent PRIMID as either an input or a system value, + * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers + * depend on that.
+ */ + if (s->info.stage == MESA_SHADER_GEOMETRY) + NIR_PASS_V(s, nir_lower_primid_sysval_to_input); + if (s->info.num_abos) NIR_PASS_V(s, ntt_lower_atomic_pre_dec); @@ -3008,6 +3163,7 @@ nir_to_tgsi(struct nir_shader *s, ureg_setup_shader_info(c->ureg, &s->info); ntt_setup_inputs(c); + ntt_setup_outputs(c); ntt_setup_uniforms(c); if (s->info.stage == MESA_SHADER_FRAGMENT) { diff --git a/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi_info.c b/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi_info.c index 3bb5f1f8ba..ce1e3412f4 100644 --- a/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi_info.c +++ b/mesa 3D driver/src/gallium/auxiliary/nir/nir_to_tgsi_info.c @@ -784,11 +784,15 @@ void nir_tgsi_scan_shader(const struct nir_shader *nir, info->indirect_files |= 1 << TGSI_FILE_OUTPUT; } - uint32_t sampler_mask = 0, image_mask = 0; + uint32_t sampler_mask = 0; nir_foreach_uniform_variable(var, nir) { - uint32_t sampler_count = glsl_type_get_sampler_count(var->type); - uint32_t image_count = glsl_type_get_image_count(var->type); + uint32_t sampler_count = glsl_type_get_sampler_count(var->type) + + glsl_type_get_texture_count(var->type); sampler_mask |= ((1ull << sampler_count) - 1) << var->data.binding; + } + uint32_t image_mask = 0; + nir_foreach_image_variable(var, nir) { + uint32_t image_count = glsl_type_get_image_count(var->type); image_mask |= ((1ull << image_count) - 1) << var->data.binding; } info->num_outputs = num_outputs; diff --git a/mesa 3D driver/src/gallium/auxiliary/nir/tgsi_to_nir.c b/mesa 3D driver/src/gallium/auxiliary/nir/tgsi_to_nir.c index 7454c6d21d..1714985595 100644 --- a/mesa 3D driver/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/mesa 3D driver/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -431,6 +431,8 @@ ttn_emit_declaration(struct ttn_compile *c) if (var->data.location == VARYING_SLOT_FOGC || var->data.location == VARYING_SLOT_PSIZ) { var->type = glsl_float_type(); + } else if (var->data.location == VARYING_SLOT_LAYER) { + var->type = glsl_int_type(); } } @@ -821,7 +823,7 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst) dest.saturate = false; if (tgsi_dst->Indirect && (tgsi_dst->File != TGSI_FILE_TEMPORARY)) { - nir_src *indirect = ralloc(c->build.shader, nir_src); + nir_src *indirect = malloc(sizeof(nir_src)); *indirect = nir_src_for_ssa(ttn_src_for_indirect(c, &tgsi_fdst->Indirect)); dest.dest.reg.indirect = indirect; } @@ -1278,7 +1280,7 @@ get_image_var(struct ttn_compile *c, int binding, if (!var) { const struct glsl_type *type = glsl_image_type(dim, is_array, base_type); - var = nir_variable_create(c->build.shader, nir_var_uniform, type, "image"); + var = nir_variable_create(c->build.shader, nir_var_image, type, "image"); var->data.binding = binding; var->data.explicit_binding = true; var->data.access = access; @@ -2219,8 +2221,9 @@ ttn_add_output_stores(struct ttn_compile *c) else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) store_value = nir_channel(b, store_value, 0); } else { - /* FOGC and PSIZ are scalar values */ + /* FOGC, LAYER, and PSIZ are scalar values */ if (var->data.location == VARYING_SLOT_FOGC || + var->data.location == VARYING_SLOT_LAYER || var->data.location == VARYING_SLOT_PSIZ) { store_value = nir_channel(b, store_value, 0); } @@ -2483,7 +2486,8 @@ ttn_finalize_nir(struct ttn_compile *c, struct pipe_screen *screen) NIR_PASS_V(nir, nir_lower_samplers); if (screen->finalize_nir) { - screen->finalize_nir(screen, nir); + char *msg = screen->finalize_nir(screen, nir); + free(msg); } else { 
ttn_optimize_nir(nir); nir_shader_gather_info(nir, c->build.impl); diff --git a/mesa 3D driver/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h b/mesa 3D driver/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h index 2ec793e125..3b630f7769 100644 --- a/mesa 3D driver/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h +++ b/mesa 3D driver/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h @@ -31,7 +31,6 @@ DRI_CONF_SECTION_DEBUG DRI_CONF_GLSL_CORRECT_DERIVATIVES_AFTER_DISCARD(false) DRI_CONF_GLSL_IGNORE_WRITE_TO_READONLY_VAR(false) DRI_CONF_ALLOW_DRAW_OUT_OF_ORDER(false) - DRI_CONF_ALLOW_INCORRECT_PRIMITIVE_ID(false) DRI_CONF_FORCE_COMPAT_PROFILE(false) DRI_CONF_FORCE_GL_NAMES_REUSE(false) DRI_CONF_TRANSCODE_ETC(false) @@ -49,7 +48,7 @@ DRI_CONF_SECTION_MISCELLANEOUS DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER(false) DRI_CONF_GLSL_ZERO_INIT(false) DRI_CONF_VS_POSITION_ALWAYS_INVARIANT(false) + DRI_CONF_VS_POSITION_ALWAYS_PRECISE(false) DRI_CONF_ALLOW_RGB10_CONFIGS(true) - DRI_CONF_ALLOW_FP16_CONFIGS(false) DRI_CONF_FORCE_INTEGER_TEX_NEAREST(false) DRI_CONF_SECTION_END diff --git a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.c b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.c index 18c856f875..87b50cc942 100644 --- a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.c +++ b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.c @@ -87,7 +87,7 @@ pb_cache_add_buffer(struct pb_cache_entry *entry) struct pb_buffer *buf = entry->buffer; unsigned i; - mtx_lock(&mgr->mutex); + simple_mtx_lock(&mgr->mutex); assert(!pipe_is_referenced(&buf->reference)); int64_t current_time = os_time_get(); @@ -98,7 +98,7 @@ pb_cache_add_buffer(struct pb_cache_entry *entry) /* Directly release any buffer that exceeds the limit. */ if (mgr->cache_size + buf->size > mgr->max_cache_size) { mgr->destroy_buffer(mgr->winsys, buf); - mtx_unlock(&mgr->mutex); + simple_mtx_unlock(&mgr->mutex); return; } @@ -107,7 +107,7 @@ pb_cache_add_buffer(struct pb_cache_entry *entry) list_addtail(&entry->head, cache); ++mgr->num_buffers; mgr->cache_size += buf->size; - mtx_unlock(&mgr->mutex); + simple_mtx_unlock(&mgr->mutex); } /** @@ -157,7 +157,7 @@ pb_cache_reclaim_buffer(struct pb_cache *mgr, pb_size size, assert(bucket_index < mgr->num_heaps); struct list_head *cache = &mgr->buckets[bucket_index]; - mtx_lock(&mgr->mutex); + simple_mtx_lock(&mgr->mutex); entry = NULL; cur = cache->next; @@ -210,13 +210,13 @@ pb_cache_reclaim_buffer(struct pb_cache *mgr, pb_size size, mgr->cache_size -= buf->size; list_del(&entry->head); --mgr->num_buffers; - mtx_unlock(&mgr->mutex); + simple_mtx_unlock(&mgr->mutex); /* Increase refcount */ pipe_reference_init(&buf->reference, 1); return buf; } - mtx_unlock(&mgr->mutex); + simple_mtx_unlock(&mgr->mutex); return NULL; } @@ -230,7 +230,7 @@ pb_cache_release_all_buffers(struct pb_cache *mgr) struct pb_cache_entry *buf; unsigned i; - mtx_lock(&mgr->mutex); + simple_mtx_lock(&mgr->mutex); for (i = 0; i < mgr->num_heaps; i++) { struct list_head *cache = &mgr->buckets[i]; @@ -243,7 +243,7 @@ pb_cache_release_all_buffers(struct pb_cache *mgr) next = curr->next; } } - mtx_unlock(&mgr->mutex); + simple_mtx_unlock(&mgr->mutex); } void @@ -293,7 +293,7 @@ pb_cache_init(struct pb_cache *mgr, uint num_heaps, for (i = 0; i < num_heaps; i++) list_inithead(&mgr->buckets[i]); - (void) mtx_init(&mgr->mutex, mtx_plain); + (void) simple_mtx_init(&mgr->mutex, mtx_plain); mgr->winsys = winsys; mgr->cache_size = 0; mgr->max_cache_size = maximum_cache_size; @@ -313,7 +313,7 @@ void 
pb_cache_deinit(struct pb_cache *mgr) { pb_cache_release_all_buffers(mgr); - mtx_destroy(&mgr->mutex); + simple_mtx_destroy(&mgr->mutex); FREE(mgr->buckets); mgr->buckets = NULL; } diff --git a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.h b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.h index 4afa0c804e..cda0f99844 100644 --- a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.h +++ b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_cache.h @@ -30,6 +30,7 @@ #define PB_CACHE_H #include "pb_buffer.h" +#include "util/simple_mtx.h" #include "util/list.h" #include "os/os_thread.h" @@ -52,7 +53,7 @@ struct pb_cache */ struct list_head *buckets; - mtx_t mutex; + simple_mtx_t mutex; void *winsys; uint64_t cache_size; uint64_t max_cache_size; diff --git a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.c b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.c index 9918e854b1..83c1a597fe 100644 --- a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.c +++ b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.c @@ -71,17 +71,27 @@ pb_slab_reclaim(struct pb_slabs *slabs, struct pb_slab_entry *entry) } } +#define MAX_FAILED_RECLAIMS 2 + static void pb_slabs_reclaim_locked(struct pb_slabs *slabs) { - while (!list_is_empty(&slabs->reclaim)) { - struct pb_slab_entry *entry = - LIST_ENTRY(struct pb_slab_entry, slabs->reclaim.next, head); - - if (!slabs->can_reclaim(slabs->priv, entry)) + struct pb_slab_entry *entry, *next; + unsigned num_failed_reclaims = 0; + LIST_FOR_EACH_ENTRY_SAFE(entry, next, &slabs->reclaim, head) { + if (slabs->can_reclaim(slabs->priv, entry)) { + pb_slab_reclaim(slabs, entry); + /* there are typically three possible scenarios when reclaiming: + * - all entries reclaimed + * - no entries reclaimed + * - all but one entry reclaimed + * in the scenario where a slab contains many (10+) unused entries, + * the driver should not walk the entire list, as this is likely to + * result in zero reclaims if the first few entries fail to reclaim + */ + } else if (++num_failed_reclaims >= MAX_FAILED_RECLAIMS) { break; - - pb_slab_reclaim(slabs, entry); + } } } @@ -120,7 +130,7 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap) (1 + slabs->allow_three_fourths_allocations) + three_fourths; group = &slabs->groups[group_index]; - mtx_lock(&slabs->mutex); + simple_mtx_lock(&slabs->mutex); /* If there is no candidate slab at all, or the first slab has no free * entries, try reclaiming entries. @@ -146,11 +156,11 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap) * There's a chance that racing threads will end up allocating multiple * slabs for the same group, but that doesn't hurt correctness. 
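 * Editor's note: the bounded reclaim walk added above can be modelled as the
 * following plain-C loop, where entries/can_reclaim stand in for the reclaim
 * list and callback (illustrative names, not the pb_slab API):
 *
 *   unsigned failed = 0;
 *   for (unsigned i = 0; i < num_entries; i++) {
 *      if (can_reclaim(entries[i]))
 *         reclaim(entries[i]);                      // pb_slab_reclaim()
 *      else if (++failed >= MAX_FAILED_RECLAIMS)    // == 2
 *         break;                                    // stop on busy entries
 *   }
 *
 * so a long run of still-busy entries no longer stalls every allocation.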
*/ - mtx_unlock(&slabs->mutex); + simple_mtx_unlock(&slabs->mutex); slab = slabs->slab_alloc(slabs->priv, heap, entry_size, group_index); if (!slab) return NULL; - mtx_lock(&slabs->mutex); + simple_mtx_lock(&slabs->mutex); list_add(&slab->head, &group->slabs); } @@ -159,7 +169,7 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap) list_del(&entry->head); slab->num_free--; - mtx_unlock(&slabs->mutex); + simple_mtx_unlock(&slabs->mutex); return entry; } @@ -173,9 +183,9 @@ pb_slab_alloc(struct pb_slabs *slabs, unsigned size, unsigned heap) void pb_slab_free(struct pb_slabs* slabs, struct pb_slab_entry *entry) { - mtx_lock(&slabs->mutex); + simple_mtx_lock(&slabs->mutex); list_addtail(&entry->head, &slabs->reclaim); - mtx_unlock(&slabs->mutex); + simple_mtx_unlock(&slabs->mutex); } /* Check if any of the entries handed to pb_slab_free are ready to be re-used. @@ -187,9 +197,9 @@ pb_slab_free(struct pb_slabs* slabs, struct pb_slab_entry *entry) void pb_slabs_reclaim(struct pb_slabs *slabs) { - mtx_lock(&slabs->mutex); + simple_mtx_lock(&slabs->mutex); pb_slabs_reclaim_locked(slabs); - mtx_unlock(&slabs->mutex); + simple_mtx_unlock(&slabs->mutex); } /* Initialize the slabs manager. @@ -237,7 +247,7 @@ pb_slabs_init(struct pb_slabs *slabs, list_inithead(&group->slabs); } - (void) mtx_init(&slabs->mutex, mtx_plain); + (void) simple_mtx_init(&slabs->mutex, mtx_plain); return true; } @@ -261,5 +271,5 @@ pb_slabs_deinit(struct pb_slabs *slabs) } FREE(slabs->groups); - mtx_destroy(&slabs->mutex); + simple_mtx_destroy(&slabs->mutex); } diff --git a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.h b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.h index a7940b6b51..c6b115eca1 100644 --- a/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.h +++ b/mesa 3D driver/src/gallium/auxiliary/pipebuffer/pb_slab.h @@ -45,6 +45,7 @@ #define PB_SLAB_H #include "pb_buffer.h" +#include "util/simple_mtx.h" #include "util/list.h" #include "os/os_thread.h" @@ -111,7 +112,7 @@ typedef bool (slab_can_reclaim_fn)(void *priv, struct pb_slab_entry *); */ struct pb_slabs { - mtx_t mutex; + simple_mtx_t mutex; unsigned min_order; unsigned num_orders; diff --git a/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_colors.c b/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_colors.c index f319ebb221..e7ce77758f 100644 --- a/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_colors.c +++ b/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_colors.c @@ -47,7 +47,7 @@ pp_nocolor(struct pp_queue_t *ppq, struct pipe_resource *in, pp_filter_misc_state(p); cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers); - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &p->view); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &p->view); cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][0]); cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][1]); diff --git a/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_mlaa.c b/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_mlaa.c index 2bc2ac873d..102e71f48b 100644 --- a/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_mlaa.c +++ b/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_mlaa.c @@ -134,7 +134,7 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in, const struct pipe_sampler_state *samplers[] = {&p->sampler_point}; cso_set_samplers(p->cso, PIPE_SHADER_FRAGMENT, 1, samplers); } - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &p->view); + 
pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &p->view); cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][1]); /* offsetvs */ cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][2]); @@ -166,7 +166,7 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in, } arr[0] = p->view; - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 3, 0, arr); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 3, 0, false, arr); cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][0]); /* passvs */ cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][3]); @@ -198,7 +198,7 @@ pp_jimenezmlaa_run(struct pp_queue_t *ppq, struct pipe_resource *in, } arr[1] = p->view; - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0, arr); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0, false, arr); cso_set_vertex_shader_handle(p->cso, ppq->shaders[n][1]); /* offsetvs */ cso_set_fragment_shader_handle(p->cso, ppq->shaders[n][4]); diff --git a/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_run.c b/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_run.c index 3615f348cf..93e0fa7b71 100644 --- a/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_run.c +++ b/mesa 3D driver/src/gallium/auxiliary/postprocess/pp_run.c @@ -184,14 +184,11 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in, } /* restore state we changed */ - cso_restore_state(cso); - - /* Unbind resources that we have bound. */ - struct pipe_context *pipe = ppq->p->pipe; - pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, false, NULL); - pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, false, NULL); - pipe->set_vertex_buffers(pipe, 0, 0, 1, false, NULL); - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 0, 3, NULL); + cso_restore_state(cso, CSO_UNBIND_FS_SAMPLERVIEWS | + CSO_UNBIND_FS_IMAGE0 | + CSO_UNBIND_VS_CONSTANTS | + CSO_UNBIND_FS_CONSTANTS | + CSO_UNBIND_VERTEX_BUFFER0); /* restore states not restored by cso */ if (ppq->p->st) { diff --git a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_build.c b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_build.c index 9e1a553428..73d1eb26af 100644 --- a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -218,7 +218,6 @@ tgsi_default_declaration_interp( void ) di.Interpolate = TGSI_INTERPOLATE_CONSTANT; di.Location = TGSI_INTERPOLATE_LOC_CENTER; - di.CylindricalWrap = 0; di.Padding = 0; return di; @@ -227,7 +226,6 @@ tgsi_default_declaration_interp( void ) static struct tgsi_declaration_interp tgsi_build_declaration_interp(unsigned interpolate, unsigned interpolate_location, - unsigned cylindrical_wrap, struct tgsi_declaration *declaration, struct tgsi_header *header) { @@ -235,7 +233,6 @@ tgsi_build_declaration_interp(unsigned interpolate, di.Interpolate = interpolate; di.Location = interpolate_location; - di.CylindricalWrap = cylindrical_wrap; di.Padding = 0; declaration_grow(declaration, header); @@ -467,7 +464,6 @@ tgsi_build_full_declaration( *di = tgsi_build_declaration_interp(full_decl->Interp.Interpolate, full_decl->Interp.Location, - full_decl->Interp.CylindricalWrap, declaration, header); } diff --git a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_dump.c b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_dump.c index 110aeb6084..68e30b6b8c 100644 --- a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -436,22 +436,6 @@ iter_declaration( TXT( ", " ); ENM( 
decl->Interp.Location, tgsi_interpolate_locations ); } - - if (decl->Interp.CylindricalWrap) { - TXT(", CYLWRAP_"); - if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_X) { - CHR('X'); - } - if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Y) { - CHR('Y'); - } - if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Z) { - CHR('Z'); - } - if (decl->Interp.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_W) { - CHR('W'); - } - } } if (decl->Declaration.Invariant) { diff --git a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_lowering.c b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_lowering.c index 4f8f301756..db23026691 100644 --- a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_lowering.c +++ b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_lowering.c @@ -1188,7 +1188,6 @@ emit_twoside(struct tgsi_transform_context *tctx) decl.Declaration.Interpolate = true; decl.Interp.Interpolate = info->input_interpolate[in_idx]; decl.Interp.Location = info->input_interpolate_loc[in_idx]; - decl.Interp.CylindricalWrap = info->input_cylindrical_wrap[in_idx]; tctx->emit_declaration(tctx, &decl); } diff --git a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.c b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.c index 0cd4a8883a..ecb3706edd 100644 --- a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -668,7 +668,6 @@ scan_declaration(struct tgsi_shader_info *info, info->input_semantic_index[reg] = (ubyte) semIndex; info->input_interpolate[reg] = (ubyte)fulldecl->Interp.Interpolate; info->input_interpolate_loc[reg] = (ubyte)fulldecl->Interp.Location; - info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Interp.CylindricalWrap; /* Vertex shaders can have inputs with holes between them. 
*/ info->num_inputs = MAX2(info->num_inputs, reg + 1); diff --git a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.h b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.h index ca8d90a801..ace5b08872 100644 --- a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -51,7 +51,6 @@ struct tgsi_shader_info ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; ubyte input_interpolate_loc[PIPE_MAX_SHADER_INPUTS]; ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; diff --git a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.c index 0bf10fd039..eedba34cbc 100644 --- a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -114,11 +114,10 @@ struct ureg_program bool supports_any_inout_decl_range; int next_shader_processor; - struct { + struct ureg_input_decl { enum tgsi_semantic semantic_name; unsigned semantic_index; enum tgsi_interpolate_mode interp; - unsigned char cylindrical_wrap; unsigned char usage_mask; enum tgsi_interpolate_loc interp_location; unsigned first; @@ -284,11 +283,10 @@ ureg_property(struct ureg_program *ureg, unsigned name, unsigned value) } struct ureg_src -ureg_DECL_fs_input_cyl_centroid_layout(struct ureg_program *ureg, +ureg_DECL_fs_input_centroid_layout(struct ureg_program *ureg, enum tgsi_semantic semantic_name, unsigned semantic_index, enum tgsi_interpolate_mode interp_mode, - unsigned cylindrical_wrap, enum tgsi_interpolate_loc interp_location, unsigned index, unsigned usage_mask, @@ -304,7 +302,6 @@ ureg_DECL_fs_input_cyl_centroid_layout(struct ureg_program *ureg, if (ureg->input[i].semantic_name == semantic_name && ureg->input[i].semantic_index == semantic_index) { assert(ureg->input[i].interp == interp_mode); - assert(ureg->input[i].cylindrical_wrap == cylindrical_wrap); assert(ureg->input[i].interp_location == interp_location); if (ureg->input[i].array_id == array_id) { ureg->input[i].usage_mask |= usage_mask; @@ -319,7 +316,6 @@ ureg_DECL_fs_input_cyl_centroid_layout(struct ureg_program *ureg, ureg->input[i].semantic_name = semantic_name; ureg->input[i].semantic_index = semantic_index; ureg->input[i].interp = interp_mode; - ureg->input[i].cylindrical_wrap = cylindrical_wrap; ureg->input[i].interp_location = interp_location; ureg->input[i].first = index; ureg->input[i].last = index + array_size - 1; @@ -337,18 +333,17 @@ ureg_DECL_fs_input_cyl_centroid_layout(struct ureg_program *ureg, } struct ureg_src -ureg_DECL_fs_input_cyl_centroid(struct ureg_program *ureg, +ureg_DECL_fs_input_centroid(struct ureg_program *ureg, enum tgsi_semantic semantic_name, unsigned semantic_index, enum tgsi_interpolate_mode interp_mode, - unsigned cylindrical_wrap, enum tgsi_interpolate_loc interp_location, unsigned array_id, unsigned array_size) { - return ureg_DECL_fs_input_cyl_centroid_layout(ureg, + return ureg_DECL_fs_input_centroid_layout(ureg, semantic_name, semantic_index, interp_mode, - cylindrical_wrap, interp_location, + interp_location, ureg->nr_input_regs, TGSI_WRITEMASK_XYZW, array_id, array_size); } @@ -374,9 +369,9 @@ ureg_DECL_input_layout(struct ureg_program *ureg, unsigned array_id, unsigned array_size) { - return 
ureg_DECL_fs_input_cyl_centroid_layout(ureg, + return ureg_DECL_fs_input_centroid_layout(ureg, semantic_name, semantic_index, - TGSI_INTERPOLATE_CONSTANT, 0, TGSI_INTERPOLATE_LOC_CENTER, + TGSI_INTERPOLATE_CONSTANT, TGSI_INTERPOLATE_LOC_CENTER, index, usage_mask, array_id, array_size); } @@ -388,8 +383,8 @@ ureg_DECL_input(struct ureg_program *ureg, unsigned array_id, unsigned array_size) { - return ureg_DECL_fs_input_cyl_centroid(ureg, semantic_name, semantic_index, - TGSI_INTERPOLATE_CONSTANT, 0, + return ureg_DECL_fs_input_centroid(ureg, semantic_name, semantic_index, + TGSI_INTERPOLATE_CONSTANT, TGSI_INTERPOLATE_LOC_CENTER, array_id, array_size); } @@ -1587,7 +1582,6 @@ emit_decl_fs(struct ureg_program *ureg, enum tgsi_semantic semantic_name, unsigned semantic_index, enum tgsi_interpolate_mode interpolate, - unsigned cylindrical_wrap, enum tgsi_interpolate_loc interpolate_location, unsigned array_id, unsigned usage_mask) @@ -1610,7 +1604,6 @@ emit_decl_fs(struct ureg_program *ureg, out[2].value = 0; out[2].decl_interp.Interpolate = interpolate; - out[2].decl_interp.CylindricalWrap = cylindrical_wrap; out[2].decl_interp.Location = interpolate_location; out[3].value = 0; @@ -1819,6 +1812,14 @@ emit_property(struct ureg_program *ureg, out[1].prop_data.Data = data; } +static int +input_sort(const void *in_a, const void *in_b) +{ + const struct ureg_input_decl *a = in_a, *b = in_b; + + return a->first - b->first; +} + static int output_sort(const void *in_a, const void *in_b) { @@ -1835,6 +1836,11 @@ static void emit_decls( struct ureg_program *ureg ) if (ureg->properties[i] != ~0u) emit_property(ureg, i, ureg->properties[i]); + /* While not required by TGSI spec, virglrenderer has a dependency on the + * inputs being sorted. + */ + qsort(ureg->input, ureg->nr_inputs, sizeof(ureg->input[0]), input_sort); + if (ureg->processor == PIPE_SHADER_VERTEX) { for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { if (ureg->vs_inputs[i/32] & (1u << (i%32))) { @@ -1851,7 +1857,6 @@ static void emit_decls( struct ureg_program *ureg ) ureg->input[i].semantic_name, ureg->input[i].semantic_index, ureg->input[i].interp, - ureg->input[i].cylindrical_wrap, ureg->input[i].interp_location, ureg->input[i].array_id, ureg->input[i].usage_mask); @@ -1867,7 +1872,6 @@ static void emit_decls( struct ureg_program *ureg ) ureg->input[i].semantic_index + (j - ureg->input[i].first), ureg->input[i].interp, - ureg->input[i].cylindrical_wrap, ureg->input[i].interp_location, 0, ureg->input[i].usage_mask); } @@ -2359,6 +2363,11 @@ ureg_setup_fragment_shader(struct ureg_program *ureg, assert(0); } } + + if (info->fs.advanced_blend_modes) { + ureg_property(ureg, TGSI_PROPERTY_FS_BLEND_EQUATION_ADVANCED, + info->fs.advanced_blend_modes); + } } static void diff --git a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.h index 1b69c87309..343708b6c0 100644 --- a/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.h +++ b/mesa 3D driver/src/gallium/auxiliary/tgsi/tgsi_ureg.h @@ -172,11 +172,10 @@ ureg_property(struct ureg_program *ureg, unsigned name, unsigned value); */ struct ureg_src -ureg_DECL_fs_input_cyl_centroid_layout(struct ureg_program *, +ureg_DECL_fs_input_centroid_layout(struct ureg_program *, enum tgsi_semantic semantic_name, unsigned semantic_index, enum tgsi_interpolate_mode interp_mode, - unsigned cylindrical_wrap, enum tgsi_interpolate_loc interp_location, unsigned index, unsigned usage_mask, @@ -184,41 +183,25 @@ ureg_DECL_fs_input_cyl_centroid_layout(struct 
ureg_program *, unsigned array_size); struct ureg_src -ureg_DECL_fs_input_cyl_centroid(struct ureg_program *, +ureg_DECL_fs_input_centroid(struct ureg_program *, enum tgsi_semantic semantic_name, unsigned semantic_index, enum tgsi_interpolate_mode interp_mode, - unsigned cylindrical_wrap, enum tgsi_interpolate_loc interp_location, unsigned array_id, unsigned array_size); -static inline struct ureg_src -ureg_DECL_fs_input_cyl(struct ureg_program *ureg, - enum tgsi_semantic semantic_name, - unsigned semantic_index, - enum tgsi_interpolate_mode interp_mode, - unsigned cylindrical_wrap) -{ - return ureg_DECL_fs_input_cyl_centroid(ureg, - semantic_name, - semantic_index, - interp_mode, - cylindrical_wrap, - TGSI_INTERPOLATE_LOC_CENTER, 0, 1); -} - static inline struct ureg_src ureg_DECL_fs_input(struct ureg_program *ureg, enum tgsi_semantic semantic_name, unsigned semantic_index, enum tgsi_interpolate_mode interp_mode) { - return ureg_DECL_fs_input_cyl_centroid(ureg, + return ureg_DECL_fs_input_centroid(ureg, semantic_name, semantic_index, interp_mode, - 0, TGSI_INTERPOLATE_LOC_CENTER, 0, 1); + TGSI_INTERPOLATE_LOC_CENTER, 0, 1); } struct ureg_src diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_async_debug.h b/mesa 3D driver/src/gallium/auxiliary/util/u_async_debug.h index b192a01f99..5f27d21d29 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_async_debug.h +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_async_debug.h @@ -37,6 +37,10 @@ #include "util/u_debug.h" #include "util/simple_mtx.h" +#ifdef __cplusplus +extern "C" { +#endif + struct util_debug_message { unsigned *id; enum pipe_debug_type type; @@ -71,4 +75,8 @@ u_async_debug_drain(struct util_async_debug_callback *adbg, _u_async_debug_drain(adbg, dst); } +#ifdef __cplusplus +} +#endif + #endif /* UTIL_ASYNC_DEBUG_H */ diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.c b/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.c index bb76c63acc..4623eb7b08 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.c @@ -784,11 +784,12 @@ void util_blitter_restore_textures(struct blitter_context *blitter) /* Fragment sampler views. */ pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, - ctx->base.saved_num_sampler_views, 0, + ctx->base.saved_num_sampler_views, 0, true, ctx->base.saved_sampler_views); + /* Just clear them to NULL because set_sampler_views(take_ownership = true). */ for (i = 0; i < ctx->base.saved_num_sampler_views; i++) - pipe_sampler_view_reference(&ctx->base.saved_sampler_views[i], NULL); + ctx->base.saved_sampler_views[i] = NULL; ctx->base.saved_num_sampler_views = ~0; } @@ -1733,7 +1734,7 @@ void util_blitter_copy_texture(struct blitter_context *blitter, util_blitter_blit_generic(blitter, dst_view, &dstbox, src_view, srcbox, src->width0, src->height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, - false); + false, false); pipe_surface_reference(&dst_view, NULL); pipe_sampler_view_reference(&src_view, NULL); @@ -1786,7 +1787,7 @@ static void do_blits(struct blitter_context_priv *ctx, unsigned src_height0, const struct pipe_box *srcbox, bool is_zsbuf, - bool uses_txf) + bool uses_txf, bool sample0_only) { struct pipe_context *pipe = ctx->base.pipe; unsigned src_samples = src->texture->nr_samples; @@ -1866,9 +1867,9 @@ static void do_blits(struct blitter_context_priv *ctx, pipe->set_framebuffer_state(pipe, &fb_state); /* See if we need to blit a multisample or singlesample buffer. 
*/ - if (src_samples == dst_samples && dst_samples > 1) { + if (sample0_only || (src_samples == dst_samples && dst_samples > 1)) { /* MSAA copy. */ - unsigned i, max_sample = dst_samples - 1; + unsigned i, max_sample = sample0_only ? 0 : dst_samples - 1; for (i = 0; i <= max_sample; i++) { pipe->set_sample_mask(pipe, 1 << i); @@ -1917,7 +1918,7 @@ void util_blitter_blit_generic(struct blitter_context *blitter, unsigned src_width0, unsigned src_height0, unsigned mask, unsigned filter, const struct pipe_scissor_state *scissor, - bool alpha_blend) + bool alpha_blend, bool sample0_only) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; struct pipe_context *pipe = ctx->base.pipe; @@ -2089,7 +2090,7 @@ void util_blitter_blit_generic(struct blitter_context *blitter, views[0] = src; views[1] = pipe->create_sampler_view(pipe, src->texture, &templ); - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0, views); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0, false, views); pipe->bind_sampler_states(pipe, PIPE_SHADER_FRAGMENT, 0, 2, samplers); pipe_sampler_view_reference(&views[1], NULL); @@ -2104,13 +2105,13 @@ void util_blitter_blit_generic(struct blitter_context *blitter, view = pipe->create_sampler_view(pipe, src->texture, &templ); - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &view); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &view); pipe->bind_sampler_states(pipe, PIPE_SHADER_FRAGMENT, 0, 1, &sampler_state); pipe_sampler_view_reference(&view, NULL); } else { - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &src); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &src); pipe->bind_sampler_states(pipe, PIPE_SHADER_FRAGMENT, 0, 1, &sampler_state); } @@ -2122,7 +2123,7 @@ void util_blitter_blit_generic(struct blitter_context *blitter, blitter_set_common_draw_rect_state(ctx, scissor != NULL, dst_samples > 1); do_blits(ctx, dst, dstbox, src, src_width0, src_height0, - srcbox, dst_has_depth || dst_has_stencil, use_txf); + srcbox, dst_has_depth || dst_has_stencil, use_txf, sample0_only); util_blitter_restore_vertex_states(blitter); util_blitter_restore_fragment_states(blitter); @@ -2162,7 +2163,7 @@ util_blitter_blit(struct blitter_context *blitter, src_view, &info->src.box, src->width0, src->height0, info->mask, info->filter, info->scissor_enable ? 
&info->scissor : NULL, - info->alpha_blend); + info->alpha_blend, info->sample0_only); pipe_surface_reference(&dst_view, NULL); pipe_sampler_view_reference(&src_view, NULL); @@ -2257,10 +2258,10 @@ void util_blitter_generate_mipmap(struct blitter_context *blitter, src_templ.format = format; src_view = pipe->create_sampler_view(pipe, tex, &src_templ); - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &src_view); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &src_view); do_blits(ctx, dst_view, &dstbox, src_view, tex->width0, tex->height0, - &srcbox, is_depth, false); + &srcbox, is_depth, false, false); pipe_surface_reference(&dst_view, NULL); pipe_sampler_view_reference(&src_view, NULL); @@ -2889,7 +2890,7 @@ util_blitter_stencil_fallback(struct blitter_context *blitter, true); } - pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, &src_view); + pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &src_view); pipe->bind_sampler_states(pipe, PIPE_SHADER_FRAGMENT, 0, 1, &ctx->sampler_state); unsigned stencil_bits = diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.h b/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.h index 79605d3d1a..48b9344430 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.h +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_blitter.h @@ -270,7 +270,7 @@ void util_blitter_blit_generic(struct blitter_context *blitter, unsigned src_width0, unsigned src_height0, unsigned mask, unsigned filter, const struct pipe_scissor_state *scissor, - bool alpha_blend); + bool alpha_blend, bool sample0_only); void util_blitter_blit(struct blitter_context *blitter, const struct pipe_blit_info *info); diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_compute.c b/mesa 3D driver/src/gallium/auxiliary/util/u_compute.c index 1eab36ab6b..8d4d871b2f 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_compute.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_compute.c @@ -139,7 +139,7 @@ void util_compute_blit(struct pipe_context *ctx, struct pipe_blit_info *blit_inf u_sampler_view_default_template(&src_templ, src, src->format); src_templ.format = util_format_linear(blit_info->src.format); src_view = ctx->create_sampler_view(ctx, src, &src_templ); - ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, &src_view); + ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 1, 0, false, &src_view); if (!*compute_state) *compute_state = blit_compute_shader(ctx); @@ -160,7 +160,7 @@ void util_compute_blit(struct pipe_context *ctx, struct pipe_blit_info *blit_inf ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 0, 1, NULL); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, false, NULL); - ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 0, 1, NULL); + ctx->set_sampler_views(ctx, PIPE_SHADER_COMPUTE, 0, 0, 1, false, NULL); pipe_sampler_view_reference(&src_view, NULL); ctx->delete_sampler_state(ctx, sampler_state_p); ctx->bind_compute_state(ctx, NULL); diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_driconf.c b/mesa 3D driver/src/gallium/auxiliary/util/u_driconf.c index 0cd84e9a11..8ace847476 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_driconf.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_driconf.c @@ -54,10 +54,10 @@ u_driconf_fill_st_options(struct st_config_options *options, query_bool_option(glsl_zero_init); query_bool_option(force_integer_tex_nearest); query_bool_option(vs_position_always_invariant); + query_bool_option(vs_position_always_precise); 
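The take_ownership flag threaded through set_sampler_views in this patch lets callers such as util_blitter_restore_textures above hand their references to the driver instead of having them re-counted. A hedged sketch of how a Gallium driver might honor it (my_context and its views array are hypothetical; pipe_sampler_view_reference is the existing helper):

   static void
   my_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader,
                        unsigned start, unsigned count,
                        unsigned unbind_num_trailing_slots, bool take_ownership,
                        struct pipe_sampler_view **views)
   {
      struct my_context *mctx = (struct my_context *)ctx;

      for (unsigned i = 0; i < count; i++) {
         struct pipe_sampler_view **slot = &mctx->views[shader][start + i];

         if (take_ownership) {
            /* Drop our old reference, then adopt the caller's. */
            pipe_sampler_view_reference(slot, NULL);
            *slot = views ? views[i] : NULL;
         } else {
            pipe_sampler_view_reference(slot, views ? views[i] : NULL);
         }
      }
      for (unsigned i = 0; i < unbind_num_trailing_slots; i++)
         pipe_sampler_view_reference(&mctx->views[shader][start + count + i], NULL);
   }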
query_bool_option(force_glsl_abs_sqrt); query_bool_option(allow_glsl_cross_stage_interpolation_mismatch); query_bool_option(allow_draw_out_of_order); - query_bool_option(allow_incorrect_primitive_id); query_bool_option(ignore_map_unsynchronized); query_bool_option(force_gl_names_reuse); query_bool_option(transcode_etc); diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_dump_state.c b/mesa 3D driver/src/gallium/auxiliary/util/u_dump_state.c index 85a4d01027..f0bca16805 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_dump_state.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_dump_state.c @@ -913,8 +913,6 @@ util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state) util_dump_member(stream, uint, state, start_instance); util_dump_member(stream, uint, state, instance_count); - util_dump_member(stream, uint, state, vertices_per_patch); - util_dump_member(stream, uint, state, min_index); util_dump_member(stream, uint, state, max_index); diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.c b/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.c index 0c358a06b2..dd415b9909 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.c @@ -496,3 +496,25 @@ util_lower_clearsize_to_dword(const void *clearValue, int *clearValueSize, uint3 } return false; } + +void +util_init_pipe_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask, + struct pipe_vertex_state *state) +{ + assert(num_elements == util_bitcount(full_velem_mask)); + + pipe_reference_init(&state->reference, 1); + state->screen = screen; + + pipe_vertex_buffer_reference(&state->input.vbuffer, buffer); + pipe_resource_reference(&state->input.indexbuf, indexbuf); + state->input.num_elements = num_elements; + for (unsigned i = 0; i < num_elements; i++) + state->input.elements[i] = elements[i]; + state->input.full_velem_mask = full_velem_mask; +} diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.h b/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.h index f08f44dad9..9246d306ed 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.h +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_helpers.h @@ -121,6 +121,15 @@ void util_throttle_memory_usage(struct pipe_context *pipe, bool util_lower_clearsize_to_dword(const void *clearValue, int *clearValueSize, uint32_t *clamped); +void +util_init_pipe_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask, + struct pipe_vertex_state *state); + #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_inlines.h b/mesa 3D driver/src/gallium/auxiliary/util/u_inlines.h index cd6e935135..1f1215c7ff 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_inlines.h +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_inlines.h @@ -230,6 +230,18 @@ pipe_so_target_reference(struct pipe_stream_output_target **dst, *dst = src; } +static inline void +pipe_vertex_state_reference(struct pipe_vertex_state **dst, + struct pipe_vertex_state *src) +{ + struct pipe_vertex_state *old_dst = *dst; + + if (pipe_reference(old_dst ? &old_dst->reference : NULL, + src ? 
&src->reference : NULL)) + old_dst->screen->vertex_state_destroy(old_dst->screen, old_dst); + *dst = src; +} + static inline void pipe_vertex_buffer_unreference(struct pipe_vertex_buffer *dst) { @@ -252,9 +264,17 @@ pipe_vertex_buffer_reference(struct pipe_vertex_buffer *dst, } pipe_vertex_buffer_unreference(dst); - if (!src->is_user_buffer) + /* Don't use memcpy because there is a hole between variables. + * dst can be used as a hash key. + */ + dst->stride = src->stride; + dst->is_user_buffer = src->is_user_buffer; + dst->buffer_offset = src->buffer_offset; + + if (src->is_user_buffer) + dst->buffer.user = src->buffer.user; + else pipe_resource_reference(&dst->buffer.resource, src->buffer.resource); - memcpy(dst, src, sizeof(*src)); } static inline void diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_live_shader_cache.h b/mesa 3D driver/src/gallium/auxiliary/util/u_live_shader_cache.h index b6e6e32c76..99ed5ec978 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_live_shader_cache.h +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_live_shader_cache.h @@ -52,6 +52,10 @@ #include "util/simple_mtx.h" #include "pipe/p_state.h" +#ifdef __cplusplus +extern "C" { +#endif + struct util_live_shader_cache { simple_mtx_t lock; struct hash_table *hashtable; @@ -88,4 +92,8 @@ util_shader_reference(struct pipe_context *ctx, struct util_live_shader_cache *cache, void **dst, void *src); +#ifdef __cplusplus +} +#endif + #endif diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_prim.c b/mesa 3D driver/src/gallium/auxiliary/util/u_prim.c index cbd48e26ab..a84d0e71e7 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_prim.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_prim.c @@ -21,12 +21,25 @@ */ #include "u_prim.h" +#include "pipe/p_state.h" /** Return string name of given primitive type */ const char * u_prim_name(enum pipe_prim_type prim) { +#if defined(__GNUC__) + /* Check that the enum is packed: */ + STATIC_ASSERT(sizeof(enum pipe_prim_type) == 1); +#endif + + /* Draw merging in u_threaded_context requires that sizeof(mode) == 1. 
*/ + struct pipe_draw_info info; + STATIC_ASSERT(sizeof(info.mode) == 1); + + struct pipe_draw_vertex_state_info dvs_info; + STATIC_ASSERT(sizeof(dvs_info.mode) == 1); + static const struct debug_named_value names[] = { DEBUG_NAMED_VALUE(PIPE_PRIM_POINTS), DEBUG_NAMED_VALUE(PIPE_PRIM_LINES), diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_screen.c b/mesa 3D driver/src/gallium/auxiliary/util/u_screen.c index f682eede34..eba554600f 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_screen.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_screen.c @@ -77,6 +77,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen, case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: case PIPE_CAP_DEPTH_CLIP_DISABLE: case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: + case PIPE_CAP_DEPTH_CLAMP_ENABLE: case PIPE_CAP_SHADER_STENCIL_EXPORT: case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: @@ -290,6 +291,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen, return 4; /* GLES 2.0 minimum value */ case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: + case PIPE_CAP_PREFER_BACK_BUFFER_REUSE: return 1; case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: @@ -470,6 +472,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen *pscreen, return 1; case PIPE_CAP_EMULATE_NONFIXED_PRIMITIVE_RESTART: + case PIPE_CAP_DRAW_VERTEX_STATE: return 0; default: diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_simple_shaders.c b/mesa 3D driver/src/gallium/auxiliary/util/u_simple_shaders.c index aaaa043480..7a9c5cf435 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_simple_shaders.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_simple_shaders.c @@ -623,7 +623,8 @@ util_make_fs_blit_msaa_depth(struct pipe_context *pipe, enum tgsi_texture_type tgsi_tex) { return util_make_fs_blit_msaa_gen(pipe, tgsi_tex, "FLOAT", - "POSITION", ".z", "", ""); + "POSITION", ".z", "", + "MOV TEMP[0].z, TEMP[0].xxxx\n"); } @@ -637,7 +638,8 @@ util_make_fs_blit_msaa_stencil(struct pipe_context *pipe, enum tgsi_texture_type tgsi_tex) { return util_make_fs_blit_msaa_gen(pipe, tgsi_tex, "UINT", - "STENCIL", ".y", "", ""); + "STENCIL", ".y", "", + "MOV TEMP[0].y, TEMP[0].xxxx\n"); } diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_tests.c b/mesa 3D driver/src/gallium/auxiliary/util/u_tests.c index aa381448b0..71e954ff94 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_tests.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_tests.c @@ -48,7 +48,7 @@ util_create_texture2d(struct pipe_screen *screen, unsigned width, unsigned height, enum pipe_format format, unsigned num_samples) { - struct pipe_resource templ = {{0}}; + struct pipe_resource templ = {0}; templ.target = PIPE_TEXTURE_2D; templ.width0 = width; @@ -392,7 +392,7 @@ null_sampler_view(struct pipe_context *ctx, unsigned tgsi_tex_target) PIPE_FORMAT_R8G8B8A8_UNORM, 0); util_set_common_states_and_clear(cso, ctx, cb); - ctx->set_sampler_views(ctx, PIPE_SHADER_FRAGMENT, 0, 0, 1, NULL); + ctx->set_sampler_views(ctx, PIPE_SHADER_FRAGMENT, 0, 0, 1, false, NULL); /* Fragment shader. 
*/ fs = util_make_fragment_tex_shader(ctx, tgsi_tex_target, @@ -698,7 +698,7 @@ test_texture_barrier(struct pipe_context *ctx, bool use_fbfetch, "ADD OUT[0], TEMP[0], IMM[0]\n" "END\n"; } else { - struct pipe_sampler_view templ = {{0}}; + struct pipe_sampler_view templ = {0}; templ.format = cb->format; templ.target = cb->target; templ.swizzle_r = PIPE_SWIZZLE_X; @@ -706,7 +706,7 @@ test_texture_barrier(struct pipe_context *ctx, bool use_fbfetch, templ.swizzle_b = PIPE_SWIZZLE_Z; templ.swizzle_a = PIPE_SWIZZLE_W; view = ctx->create_sampler_view(ctx, cb, &templ); - ctx->set_sampler_views(ctx, PIPE_SHADER_FRAGMENT, 0, 1, 0, &view); + ctx->set_sampler_views(ctx, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &view); /* Fragment shader. */ if (num_samples > 1) { diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.c b/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.c index 5994a6a309..dd45bbf58e 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.c +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.c @@ -71,6 +71,12 @@ typedef uint16_t (*tc_execute)(struct pipe_context *pipe, void *call, uint64_t * static const tc_execute execute_func[TC_NUM_CALLS]; +static void +tc_buffer_subdata(struct pipe_context *_pipe, + struct pipe_resource *resource, + unsigned usage, unsigned offset, + unsigned size, const void *data); + static void tc_batch_check(UNUSED struct tc_batch *batch) { @@ -128,6 +134,15 @@ tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src) pipe_reference(NULL, &src->reference); /* only increment refcount */ } +/* Assign src to dst while dst is uninitialized. */ +static inline void +tc_set_vertex_state_reference(struct pipe_vertex_state **dst, + struct pipe_vertex_state *src) +{ + *dst = src; + pipe_reference(NULL, &src->reference); /* only increment refcount */ +} + /* Unreference dst but don't touch the dst pointer. */ static inline void tc_drop_resource_reference(struct pipe_resource *dst) @@ -160,6 +175,20 @@ tc_drop_so_target_reference(struct pipe_stream_output_target *dst) dst->context->stream_output_target_destroy(dst->context, dst); } +/** + * Subtract the given number of references. + */ +static inline void +tc_drop_vertex_state_references(struct pipe_vertex_state *dst, int num_refs) +{ + int count = p_atomic_add_return(&dst->reference.count, -num_refs); + + assert(count >= 0); + /* Underflows shouldn't happen, but let's be safe. */ + if (count <= 0) + dst->screen->vertex_state_destroy(dst->screen, dst); +} + /* We don't want to read or write min_index and max_index, because * it shouldn't be needed by drivers at this point. 
*/ @@ -198,7 +227,7 @@ tc_batch_execute(void *job, UNUSED void *gdata, int thread_index) struct util_queue_fence *fence = &tc->buffer_lists[batch->buffer_list_index].driver_flushed_fence; - if (tc->driver_calls_flush_notify) { + if (tc->options.driver_calls_flush_notify) { tc->signal_fences_next_flush[tc->num_signal_fences_next_flush++] = fence; /* Since our buffer lists are chained as a ring, we need to flush @@ -626,7 +655,7 @@ static bool tc_is_buffer_busy(struct threaded_context *tc, struct threaded_resource *tbuf, unsigned map_usage) { - if (!tc->is_resource_busy) + if (!tc->options.is_resource_busy) return true; uint32_t id_hash = tbuf->buffer_id_unique & TC_BUFFER_ID_MASK; @@ -643,21 +672,35 @@ tc_is_buffer_busy(struct threaded_context *tc, struct threaded_resource *tbuf, /* The buffer isn't referenced by any unflushed batch: we can safely ask to the driver whether * this buffer is busy or not. */ - return tc->is_resource_busy(tc->pipe->screen, tbuf->latest, map_usage); + return tc->options.is_resource_busy(tc->pipe->screen, tbuf->latest, map_usage); } +/** + * allow_cpu_storage should be false for user memory and imported buffers. + */ void -threaded_resource_init(struct pipe_resource *res) +threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage, + unsigned map_buffer_alignment) { struct threaded_resource *tres = threaded_resource(res); tres->latest = &tres->b; + tres->cpu_storage = NULL; util_range_init(&tres->valid_buffer_range); tres->is_shared = false; tres->is_user_ptr = false; tres->buffer_id_unique = 0; tres->pending_staging_uploads = 0; util_range_init(&tres->pending_staging_uploads_range); + + if (allow_cpu_storage && + !(res->flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT | + PIPE_RESOURCE_FLAG_SPARSE | + PIPE_RESOURCE_FLAG_ENCRYPTED)) && + /* We need buffer invalidation and buffer busyness tracking for the CPU + * storage, which aren't supported with pipe_vertex_state. 
*/ + !(res->bind & PIPE_BIND_VERTEX_STATE)) + tres->cpu_storage = align_malloc(res->width0, map_buffer_alignment); } void @@ -669,6 +712,7 @@ threaded_resource_deinit(struct pipe_resource *res) pipe_resource_reference(&tres->latest, NULL); util_range_destroy(&tres->valid_buffer_range); util_range_destroy(&tres->pending_staging_uploads_range); + align_free(tres->cpu_storage); } struct pipe_context * @@ -884,10 +928,12 @@ tc_get_query_result_resource(struct pipe_context *_pipe, struct pipe_resource *resource, unsigned offset) { struct threaded_context *tc = threaded_context(_pipe); + + tc_buffer_disable_cpu_storage(resource); + struct tc_query_result_resource *p = tc_add_call(tc, TC_CALL_get_query_result_resource, tc_query_result_resource); - p->query = query; p->wait = wait; p->result_type = result_type; @@ -1081,6 +1127,29 @@ tc_set_tess_state(struct pipe_context *_pipe, memcpy(p + 4, default_inner_level, 2 * sizeof(float)); } +struct tc_patch_vertices { + struct tc_call_base base; + ubyte patch_vertices; +}; + +static uint16_t +tc_call_set_patch_vertices(struct pipe_context *pipe, void *call, uint64_t *last) +{ + uint8_t patch_vertices = to_call(call, tc_patch_vertices)->patch_vertices; + + pipe->set_patch_vertices(pipe, patch_vertices); + return call_size(tc_patch_vertices); +} + +static void +tc_set_patch_vertices(struct pipe_context *_pipe, uint8_t patch_vertices) +{ + struct threaded_context *tc = threaded_context(_pipe); + + tc_add_call(tc, TC_CALL_set_patch_vertices, + tc_patch_vertices)->patch_vertices = patch_vertices; +} + struct tc_constant_buffer_base { struct tc_call_base base; ubyte shader, index; @@ -1322,13 +1391,9 @@ static uint16_t tc_call_set_sampler_views(struct pipe_context *pipe, void *call, uint64_t *last) { struct tc_sampler_views *p = (struct tc_sampler_views *)call; - unsigned count = p->count; pipe->set_sampler_views(pipe, p->shader, p->start, p->count, - p->unbind_num_trailing_slots, p->slot); - for (unsigned i = 0; i < count; i++) - tc_drop_sampler_view_reference(p->slot[i]); - + p->unbind_num_trailing_slots, true, p->slot); return p->base.num_slots; } @@ -1336,7 +1401,7 @@ static void tc_set_sampler_views(struct pipe_context *_pipe, enum pipe_shader_type shader, unsigned start, unsigned count, - unsigned unbind_num_trailing_slots, + unsigned unbind_num_trailing_slots, bool take_ownership, struct pipe_sampler_view **views) { if (!count && !unbind_num_trailing_slots) @@ -1356,15 +1421,28 @@ tc_set_sampler_views(struct pipe_context *_pipe, p->count = count; p->unbind_num_trailing_slots = unbind_num_trailing_slots; - for (unsigned i = 0; i < count; i++) { - p->slot[i] = NULL; - pipe_sampler_view_reference(&p->slot[i], views[i]); + if (take_ownership) { + memcpy(p->slot, views, sizeof(*views) * count); - if (views[i] && views[i]->target == PIPE_BUFFER) { - tc_bind_buffer(&tc->sampler_buffers[shader][start + i], next, - views[i]->texture); - } else { - tc_unbind_buffer(&tc->sampler_buffers[shader][start + i]); + for (unsigned i = 0; i < count; i++) { + if (views[i] && views[i]->target == PIPE_BUFFER) { + tc_bind_buffer(&tc->sampler_buffers[shader][start + i], next, + views[i]->texture); + } else { + tc_unbind_buffer(&tc->sampler_buffers[shader][start + i]); + } + } + } else { + for (unsigned i = 0; i < count; i++) { + p->slot[i] = NULL; + pipe_sampler_view_reference(&p->slot[i], views[i]); + + if (views[i] && views[i]->target == PIPE_BUFFER) { + tc_bind_buffer(&tc->sampler_buffers[shader][start + i], next, + views[i]->texture); + } else { + 
tc_unbind_buffer(&tc->sampler_buffers[shader][start + i]); + } } } @@ -1444,6 +1522,7 @@ tc_set_shader_images(struct pipe_context *_pipe, if (images[i].access & PIPE_IMAGE_ACCESS_WRITE) { struct threaded_resource *tres = threaded_resource(resource); + tc_buffer_disable_cpu_storage(resource); util_range_add(&tres->b, &tres->valid_buffer_range, images[i].u.buf.offset, images[i].u.buf.offset + images[i].u.buf.size); @@ -1536,6 +1615,7 @@ tc_set_shader_buffers(struct pipe_context *_pipe, tc_bind_buffer(&tc->shader_buffers[shader][start + i], next, &tres->b); if (writable_bitmask & BITFIELD_BIT(i)) { + tc_buffer_disable_cpu_storage(src->buffer); util_range_add(&tres->b, &tres->valid_buffer_range, src->buffer_offset, src->buffer_offset + src->buffer_size); @@ -1682,6 +1762,7 @@ tc_set_stream_output_targets(struct pipe_context *_pipe, p->targets[i] = NULL; pipe_so_target_reference(&p->targets[i], tgs[i]); if (tgs[i]) { + tc_buffer_disable_cpu_storage(tgs[i]->buffer); tc_bind_buffer(&tc->streamout_buffers[i], next, tgs[i]->buffer); } else { tc_unbind_buffer(&tc->streamout_buffers[i]); @@ -1851,6 +1932,9 @@ tc_create_image_handle(struct pipe_context *_pipe, struct threaded_context *tc = threaded_context(_pipe); struct pipe_context *pipe = tc->pipe; + if (image->resource->target == PIPE_BUFFER) + tc_buffer_disable_cpu_storage(image->resource); + tc_sync(tc); return pipe->create_image_handle(pipe, image); } @@ -2087,8 +2171,35 @@ tc_buffer_map(struct pipe_context *_pipe, struct threaded_resource *tres = threaded_resource(resource); struct pipe_context *pipe = tc->pipe; + /* PIPE_MAP_THREAD_SAFE is for glthread, which shouldn't use the CPU storage and + * this shouldn't normally be necessary because glthread only uses large buffers. + */ + if (usage & PIPE_MAP_THREAD_SAFE) + tc_buffer_disable_cpu_storage(resource); + usage = tc_improve_map_buffer_flags(tc, tres, usage, box->x, box->width); + /* If the CPU storage is enabled, return it directly. */ + if (tres->cpu_storage && !(usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) { + /* We can't let resource_copy_region disable the CPU storage. */ + assert(!(tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY)); + + struct threaded_transfer *ttrans = slab_alloc(&tc->pool_transfers); + ttrans->b.resource = resource; + ttrans->b.level = 0; + ttrans->b.usage = usage; + ttrans->b.box = *box; + ttrans->b.stride = 0; + ttrans->b.layer_stride = 0; + ttrans->b.offset = 0; + ttrans->staging = NULL; + ttrans->valid_buffer_range = &tres->valid_buffer_range; + ttrans->cpu_storage_mapped = true; + *transfer = &ttrans->b; + + return (uint8_t*)tres->cpu_storage + box->x; + } + /* Do a staging transfer within the threaded context. The driver should * only get resource_copy_region. */ @@ -2114,6 +2225,7 @@ tc_buffer_map(struct pipe_context *_pipe, ttrans->b.stride = 0; ttrans->b.layer_stride = 0; ttrans->valid_buffer_range = &tres->valid_buffer_range; + ttrans->cpu_storage_mapped = false; *transfer = &ttrans->b; p_atomic_inc(&tres->pending_staging_uploads); @@ -2148,6 +2260,7 @@ tc_buffer_map(struct pipe_context *_pipe, void *ret = pipe->buffer_map(pipe, tres->latest ? 
tres->latest : resource, level, usage, box, transfer); threaded_transfer(*transfer)->valid_buffer_range = &tres->valid_buffer_range; + threaded_transfer(*transfer)->cpu_storage_mapped = false; if (!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC)) tc_clear_driver_thread(tc); @@ -2230,8 +2343,13 @@ tc_buffer_do_flush_region(struct threaded_context *tc, ttrans->staging, 0, &src_box); } - util_range_add(&tres->b, ttrans->valid_buffer_range, - box->x, box->x + box->width); + /* Don't update the valid range when we're uploading the CPU storage + * because it includes the uninitialized range too. + */ + if (!(ttrans->b.usage & TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE)) { + util_range_add(&tres->b, ttrans->valid_buffer_range, + box->x, box->x + box->width); + } } static void @@ -2318,12 +2436,37 @@ tc_buffer_unmap(struct pipe_context *_pipe, struct pipe_transfer *transfer) return; } - bool was_staging_transfer = false; - if (transfer->usage & PIPE_MAP_WRITE && !(transfer->usage & PIPE_MAP_FLUSH_EXPLICIT)) tc_buffer_do_flush_region(tc, ttrans, &transfer->box); + if (ttrans->cpu_storage_mapped) { + /* GL allows simultaneous GPU stores with mapped buffers as long as GPU stores don't + * touch the mapped range. That's a problem because GPU stores free the CPU storage. + * If that happens, we just ignore the unmap call and don't upload anything to prevent + * a crash. + * + * Disallow the CPU storage in the driver to work around this. + */ + assert(tres->cpu_storage); + + if (tres->cpu_storage) { + tc_invalidate_buffer(tc, tres); + tc_buffer_subdata(&tc->base, &tres->b, + PIPE_MAP_UNSYNCHRONIZED | + TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE, + 0, tres->b.width0, tres->cpu_storage); + /* This shouldn't have been freed by buffer_subdata. */ + assert(tres->cpu_storage); + } + + tc_drop_resource_reference(ttrans->staging); + slab_free(&tc->pool_transfers, ttrans); + return; + } + + bool was_staging_transfer = false; + if (ttrans->staging) { was_staging_transfer = true; @@ -2428,7 +2571,8 @@ tc_buffer_subdata(struct pipe_context *_pipe, */ if (usage & (PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_DISCARD_WHOLE_RESOURCE) || - size > TC_MAX_SUBDATA_BYTES) { + size > TC_MAX_SUBDATA_BYTES || + tres->cpu_storage) { struct pipe_transfer *transfer; struct pipe_box box; uint8_t *map = NULL; @@ -2537,7 +2681,6 @@ tc_texture_subdata(struct pipe_context *_pipe, return pipe->func(pipe); \ } -TC_FUNC_SYNC_RET0(enum pipe_reset_status, get_device_reset_status) TC_FUNC_SYNC_RET0(uint64_t, get_timestamp) static void @@ -2553,6 +2696,18 @@ tc_get_sample_position(struct pipe_context *_pipe, out_value); } +static enum pipe_reset_status +tc_get_device_reset_status(struct pipe_context *_pipe) +{ + struct threaded_context *tc = threaded_context(_pipe); + struct pipe_context *pipe = tc->pipe; + + if (!tc->options.unsynchronized_get_device_reset_status) + tc_sync(tc); + + return pipe->get_device_reset_status(pipe); +} + static void tc_set_device_reset_callback(struct pipe_context *_pipe, const struct pipe_device_reset_callback *cb) @@ -2817,7 +2972,7 @@ tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence, struct pipe_screen *screen = pipe->screen; bool async = flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC); - if (async && tc->create_fence) { + if (async && tc->options.create_fence) { if (fence) { struct tc_batch *next = &tc->batch_slots[tc->next]; @@ -2830,7 +2985,8 @@ tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence, next->token->tc = tc; } - screen->fence_reference(screen, fence, tc->create_fence(pipe, next->token)); 
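Taken together, the map/unmap hunks above give small, frequently respecified buffers the following round trip (hedged recap using the patch's own names; buf, box and data are placeholders):

   struct pipe_transfer *xfer;
   uint8_t *map = pipe->buffer_map(pipe, buf, 0, PIPE_MAP_WRITE, &box, &xfer);
   /* -> returns tres->cpu_storage + box.x without any sync;
    *    ttrans->cpu_storage_mapped is set to true */

   memcpy(map, data, box.width);
   pipe->buffer_unmap(pipe, xfer);
   /* -> tc_invalidate_buffer() gives the buffer fresh GPU storage, then the
    *    whole shadow copy is uploaded via tc_buffer_subdata() with
    *    TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE, so the draw never stalls on the
    *    old storage */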
+ screen->fence_reference(screen, fence, + tc->options.create_fence(pipe, next->token)); if (!*fence) goto out_of_memory; } @@ -2902,13 +3058,11 @@ simplify_draw_info(struct pipe_draw_info *info) info->index_bounds_valid = false; info->take_index_buffer_ownership = false; info->index_bias_varies = false; + info->_pad = 0; /* This shouldn't be set when merging single draws. */ info->increment_draw_id = false; - if (info->mode != PIPE_PRIM_PATCHES) - info->vertices_per_patch = 0; - if (info->index_size) { if (!info->primitive_restart) info->restart_index = 0; @@ -3276,6 +3430,166 @@ tc_draw_vbo(struct pipe_context *_pipe, const struct pipe_draw_info *info, } } +struct tc_draw_vstate_single { + struct tc_call_base base; + struct pipe_draw_start_count_bias draw; + + /* The following states must be together without holes because they are + * compared by draw merging. + */ + struct pipe_vertex_state *state; + uint32_t partial_velem_mask; + struct pipe_draw_vertex_state_info info; +}; + +static bool +is_next_call_a_mergeable_draw_vstate(struct tc_draw_vstate_single *first, + struct tc_draw_vstate_single *next) +{ + if (next->base.call_id != TC_CALL_draw_vstate_single) + return false; + + return !memcmp(&first->state, &next->state, + offsetof(struct tc_draw_vstate_single, info) + + sizeof(struct pipe_draw_vertex_state_info) - + offsetof(struct tc_draw_vstate_single, state)); +} + +static uint16_t +tc_call_draw_vstate_single(struct pipe_context *pipe, void *call, uint64_t *last_ptr) +{ + /* Draw call merging. */ + struct tc_draw_vstate_single *first = to_call(call, tc_draw_vstate_single); + struct tc_draw_vstate_single *last = (struct tc_draw_vstate_single *)last_ptr; + struct tc_draw_vstate_single *next = get_next_call(first, tc_draw_vstate_single); + + /* If at least 2 consecutive draw calls can be merged... */ + if (next != last && + is_next_call_a_mergeable_draw_vstate(first, next)) { + /* The maximum number of merged draws is given by the batch size. */ + struct pipe_draw_start_count_bias draws[TC_SLOTS_PER_BATCH / + call_size(tc_draw_vstate_single)]; + unsigned num_draws = 2; + + draws[0] = first->draw; + draws[1] = next->draw; + + /* Find how many other draws can be merged. */ + next = get_next_call(next, tc_draw_vstate_single); + for (; next != last && + is_next_call_a_mergeable_draw_vstate(first, next); + next = get_next_call(next, tc_draw_vstate_single), + num_draws++) + draws[num_draws] = next->draw; + + pipe->draw_vertex_state(pipe, first->state, first->partial_velem_mask, + first->info, draws, num_draws); + /* Since all draws use the same state, drop all references at once. 
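Since the merge test above memcmps the [state, partial_velem_mask, info] range in one go, the "no holes" requirement could be pinned down at compile time with something like this (hypothetical guard, not part of the patch; STATIC_ASSERT is Mesa's existing util macro):

   STATIC_ASSERT(offsetof(struct tc_draw_vstate_single, partial_velem_mask) ==
                 offsetof(struct tc_draw_vstate_single, state) +
                 sizeof(struct pipe_vertex_state *));
   STATIC_ASSERT(offsetof(struct tc_draw_vstate_single, info) ==
                 offsetof(struct tc_draw_vstate_single, partial_velem_mask) +
                 sizeof(uint32_t));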
*/ + tc_drop_vertex_state_references(first->state, num_draws); + + return call_size(tc_draw_vstate_single) * num_draws; + } + + pipe->draw_vertex_state(pipe, first->state, first->partial_velem_mask, + first->info, &first->draw, 1); + tc_drop_vertex_state_references(first->state, 1); + return call_size(tc_draw_vstate_single); +} + +struct tc_draw_vstate_multi { + struct tc_call_base base; + uint32_t partial_velem_mask; + struct pipe_draw_vertex_state_info info; + unsigned num_draws; + struct pipe_vertex_state *state; + struct pipe_draw_start_count_bias slot[0]; +}; + +static uint16_t +tc_call_draw_vstate_multi(struct pipe_context *pipe, void *call, uint64_t *last) +{ + struct tc_draw_vstate_multi *info = (struct tc_draw_vstate_multi*)call; + + pipe->draw_vertex_state(pipe, info->state, info->partial_velem_mask, + info->info, info->slot, info->num_draws); + tc_drop_vertex_state_references(info->state, 1); + return info->base.num_slots; +} + +static void +tc_draw_vertex_state(struct pipe_context *_pipe, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + struct threaded_context *tc = threaded_context(_pipe); + + if (unlikely(tc->add_all_gfx_bindings_to_buffer_list)) + tc_add_all_gfx_bindings_to_buffer_list(tc); + + if (num_draws == 1) { + /* Single draw. */ + struct tc_draw_vstate_single *p = + tc_add_call(tc, TC_CALL_draw_vstate_single, tc_draw_vstate_single); + p->partial_velem_mask = partial_velem_mask; + p->draw = draws[0]; + p->info.mode = info.mode; + p->info.take_vertex_state_ownership = false; + + /* This should be always 0 for simplicity because we assume that + * index_bias doesn't vary. + */ + assert(draws[0].index_bias == 0); + + if (!info.take_vertex_state_ownership) + tc_set_vertex_state_reference(&p->state, state); + else + p->state = state; + return; + } + + const int draw_overhead_bytes = sizeof(struct tc_draw_vstate_multi); + const int one_draw_slot_bytes = sizeof(((struct tc_draw_vstate_multi*)NULL)->slot[0]); + const int slots_for_one_draw = DIV_ROUND_UP(draw_overhead_bytes + one_draw_slot_bytes, + sizeof(struct tc_call_base)); + /* Multi draw. */ + int total_offset = 0; + bool take_vertex_state_ownership = info.take_vertex_state_ownership; + while (num_draws) { + struct tc_batch *next = &tc->batch_slots[tc->next]; + + int nb_slots_left = TC_SLOTS_PER_BATCH - next->num_total_slots; + /* If there isn't enough place for one draw, try to fill the next one */ + if (nb_slots_left < slots_for_one_draw) + nb_slots_left = TC_SLOTS_PER_BATCH; + const int size_left_bytes = nb_slots_left * sizeof(struct tc_call_base); + + /* How many draws can we fit in the current batch */ + const int dr = MIN2(num_draws, (size_left_bytes - draw_overhead_bytes) / one_draw_slot_bytes); + + /* Non-indexed call or indexed with a real index buffer. 
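A worked instance of the multi-draw slot arithmetic above, under hypothetical sizes (sizeof(struct tc_call_base) == 8, sizeof(struct tc_draw_vstate_multi) == 32, sizeof(struct pipe_draw_start_count_bias) == 12):

   /* slots_for_one_draw = DIV_ROUND_UP(32 + 12, 8) = 6 call slots      */
   /* with nb_slots_left = 100 slots, size_left_bytes = 800 bytes:      */
   /*    dr = MIN2(num_draws, (800 - 32) / 12) = MIN2(num_draws, 64)    */
   /* so this batch tail absorbs up to 64 draws before rolling over to  */
   /* the next tc_draw_vstate_multi call                                */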
*/ + struct tc_draw_vstate_multi *p = + tc_add_slot_based_call(tc, TC_CALL_draw_vstate_multi, tc_draw_vstate_multi, dr); + + if (!take_vertex_state_ownership) + tc_set_vertex_state_reference(&p->state, state); + else + p->state = state; + + take_vertex_state_ownership = false; + p->partial_velem_mask = partial_velem_mask; + p->info.mode = info.mode; + p->info.take_vertex_state_ownership = false; + p->num_draws = dr; + memcpy(p->slot, &draws[total_offset], sizeof(draws[0]) * dr); + num_draws -= dr; + + total_offset += dr; + } +} + struct tc_launch_grid_call { struct tc_call_base base; struct pipe_grid_info info; @@ -3335,6 +3649,9 @@ tc_resource_copy_region(struct pipe_context *_pipe, tc_add_call(tc, TC_CALL_resource_copy_region, tc_resource_copy_region); + if (dst->target == PIPE_BUFFER) + tc_buffer_disable_cpu_storage(dst); + tc_set_resource_reference(&p->dst, dst); p->dst_level = dst_level; p->dstx = dstx; @@ -3649,6 +3966,8 @@ tc_clear_buffer(struct pipe_context *_pipe, struct pipe_resource *res, struct tc_clear_buffer *p = tc_add_call(tc, TC_CALL_clear_buffer, tc_clear_buffer); + tc_buffer_disable_cpu_storage(res); + tc_set_resource_reference(&p->res, res); tc_add_to_buffer_list(&tc->buffer_lists[tc->next_buf_list], res); p->offset = offset; @@ -3963,12 +4282,7 @@ void tc_driver_internal_flush_notify(struct threaded_context *tc) * in pipe_screen. * \param replace_buffer callback for replacing a pipe_resource's storage * with another pipe_resource's storage. - * \param create_fence optional callback to create a fence for async flush - * \param is_resource_busy optional callback to tell TC if transfer_map()/etc - * with the given usage would stall - * \param driver_calls_flush_notify whether the driver calls - * tc_driver_internal_flush_notify after every - * driver flush + * \param options optional TC options/callbacks * \param out if successful, the threaded_context will be returned here in * addition to the return value if "out" != NULL */ @@ -3976,9 +4290,7 @@ struct pipe_context * threaded_context_create(struct pipe_context *pipe, struct slab_parent_pool *parent_transfer_pool, tc_replace_buffer_storage_func replace_buffer, - tc_create_fence_func create_fence, - tc_is_resource_busy is_resource_busy, - bool driver_calls_flush_notify, + const struct threaded_context_options *options, struct threaded_context **out) { struct threaded_context *tc; @@ -3997,16 +4309,16 @@ threaded_context_create(struct pipe_context *pipe, return NULL; } - pipe = trace_context_create_threaded(pipe->screen, pipe, &replace_buffer, &create_fence, &is_resource_busy); + if (options) + tc->options = *options; + + pipe = trace_context_create_threaded(pipe->screen, pipe, &replace_buffer, &tc->options); /* The driver context isn't wrapped, so set its "priv" to NULL. 
 */
    pipe->priv = NULL;
 
    tc->pipe = pipe;
    tc->replace_buffer_storage = replace_buffer;
-   tc->create_fence = create_fence;
-   tc->is_resource_busy = is_resource_busy;
-   tc->driver_calls_flush_notify = driver_calls_flush_notify;
    tc->map_buffer_alignment =
       pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
    tc->ubo_alignment =
@@ -4072,6 +4384,7 @@ threaded_context_create(struct pipe_context *pipe,
    CTX_INIT(flush);
    CTX_INIT(draw_vbo);
+   CTX_INIT(draw_vertex_state);
    CTX_INIT(launch_grid);
    CTX_INIT(resource_copy_region);
    CTX_INIT(blit);
@@ -4139,6 +4452,7 @@ threaded_context_create(struct pipe_context *pipe,
    CTX_INIT(set_window_rectangles);
    CTX_INIT(set_sampler_views);
    CTX_INIT(set_tess_state);
+   CTX_INIT(set_patch_vertices);
    CTX_INIT(set_shader_buffers);
    CTX_INIT(set_shader_images);
    CTX_INIT(set_vertex_buffers);
diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.h b/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.h
index 0e1189d7f7..ec7438c8e2 100644
--- a/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context.h
@@ -145,6 +145,10 @@
  * the threaded context wants to replace a resource's backing storage with
  * another resource's backing storage. The threaded context uses it to
  * implement buffer invalidation. This call is always queued.
+ * Note that 'minimum_num_rebinds' specifies only the minimum number of rebinds
+ * that the driver must handle; if a buffer is bound multiple times within the
+ * same binding type (e.g., vertex buffer slots 0, 1, 2), that counts as a
+ * single rebind.
  *
  *
  * Optional resource busy callbacks for better performance
@@ -204,6 +208,8 @@ struct tc_unflushed_batch_token;
 /* 0 = disabled, 1 = assertions, 2 = printfs, 3 = logging */
 #define TC_DEBUG 0
 
+/* This is an internal flag not sent to the driver. */
+#define TC_TRANSFER_MAP_UPLOAD_CPU_STORAGE (1u << 28)
 /* These are map flags sent to drivers. */
 /* Never infer whether it's safe to use unsynchronized mappings: */
 #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
@@ -290,7 +296,7 @@ enum tc_binding_type {
 typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                                struct pipe_resource *dst,
                                                struct pipe_resource *src,
-                                               unsigned num_rebinds,
+                                               unsigned minimum_num_rebinds,
                                                uint32_t rebind_mask,
                                                uint32_t delete_buffer_id);
 typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
@@ -309,6 +315,13 @@ struct threaded_resource {
     */
    struct pipe_resource *latest;
 
+   /* Optional CPU storage of the buffer. When we get a partial glBufferSubData
+    * (implemented by copy_buffer) + glDrawElements, we don't want to drain the
+    * gfx pipeline before executing the copy. For ideal pipelining, we upload to
+    * this CPU storage and then reallocate the GPU storage completely and
+    * reupload everything without copy_buffer.
+    */
+   void *cpu_storage;
+
    /* The buffer range which is initialized (with a write transfer, streamout,
     * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
@@ -356,6 +369,8 @@ struct threaded_transfer {
    * the base instance. Initially it's set to &b.resource->valid_buffer_range.
    */
   struct util_range *valid_buffer_range;
+
+   bool cpu_storage_mapped;
 };
 
 struct threaded_query {
@@ -404,13 +419,29 @@ struct tc_buffer_list {
   BITSET_DECLARE(buffer_list, TC_BUFFER_ID_MASK + 1);
 };
 
+/**
+ * Optional TC parameters/callbacks.
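+ *
+ * The callbacks are optional; drivers zero-initialize the struct and set
+ * only what they support. A minimal hypothetical caller (my_create_fence
+ * and my_replace_buffer_storage are placeholder driver hooks, not part of
+ * this patch):
+ *
+ *    struct threaded_context_options opts = {
+ *       .create_fence = my_create_fence,
+ *       .driver_calls_flush_notify = true,
+ *    };
+ *    threaded_context_create(pipe, &pool, my_replace_buffer_storage,
+ *                            &opts, &tc);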
+ */
+struct threaded_context_options {
+   tc_create_fence_func create_fence;
+   tc_is_resource_busy is_resource_busy;
+   bool driver_calls_flush_notify;
+
+   /**
+    * If true, ctx->get_device_reset_status() will be called without
+    * synchronizing with the driver thread. Drivers can enable this to avoid
+    * TC syncs if their implementation of get_device_reset_status() is safe
+    * to call without synchronizing with the driver thread.
+    */
+   bool unsynchronized_get_device_reset_status;
+};
+
 struct threaded_context {
    struct pipe_context base;
    struct pipe_context *pipe;
    struct slab_child_pool pool_transfers;
    tc_replace_buffer_storage_func replace_buffer_storage;
-   tc_create_fence_func create_fence;
-   tc_is_resource_busy is_resource_busy;
+   struct threaded_context_options options;
    unsigned map_buffer_alignment;
    unsigned ubo_alignment;
@@ -421,7 +452,6 @@ struct threaded_context {
    unsigned num_direct_slots;
    unsigned num_syncs;
 
-   bool driver_calls_flush_notify;
    bool use_forced_staging_uploads;
    bool add_all_gfx_bindings_to_buffer_list;
    bool add_all_compute_bindings_to_buffer_list;
@@ -484,7 +514,8 @@ struct threaded_context {
    struct tc_buffer_list buffer_lists[TC_MAX_BUFFER_LISTS];
 };
 
-void threaded_resource_init(struct pipe_resource *res);
+void threaded_resource_init(struct pipe_resource *res, bool allow_cpu_storage,
+                            unsigned map_buffer_alignment);
 void threaded_resource_deinit(struct pipe_resource *res);
 struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
 void tc_driver_internal_flush_notify(struct threaded_context *tc);
@@ -493,9 +524,7 @@ struct pipe_context *
 threaded_context_create(struct pipe_context *pipe,
                         struct slab_parent_pool *parent_transfer_pool,
                         tc_replace_buffer_storage_func replace_buffer,
-                        tc_create_fence_func create_fence,
-                        tc_is_resource_busy is_resource_busy,
-                        bool driver_calls_flush_notify,
+                        const struct threaded_context_options *options,
                         struct threaded_context **out);
 
 void
@@ -562,4 +591,21 @@ tc_assert_driver_thread(struct threaded_context *tc)
 #endif
 }
 
+/**
+ * Called before a GPU store to disable the CPU storage, because after a GPU
+ * write the CPU copy no longer mirrors the GPU storage.
+ *
+ * Drivers should also call it before exporting a DMABUF of a buffer.
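+ *
+ * A hedged sketch of the DMABUF case (my_resource_get_handle is a
+ * hypothetical driver hook, not part of this patch):
+ *
+ *    static bool my_resource_get_handle(..., struct pipe_resource *res, ...)
+ *    {
+ *       if (res->target == PIPE_BUFFER)
+ *          tc_buffer_disable_cpu_storage(res);
+ *       // ... then export the handle as usual ...
+ *    }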
+ */ +static inline void +tc_buffer_disable_cpu_storage(struct pipe_resource *buf) +{ + struct threaded_resource *tres = threaded_resource(buf); + + if (tres->cpu_storage) { + free(tres->cpu_storage); + tres->cpu_storage = NULL; + } +} + #endif diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context_calls.h b/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context_calls.h index 7174d3ab94..ab78d3de3a 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context_calls.h +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_threaded_context_calls.h @@ -10,6 +10,7 @@ CALL(render_condition) CALL(bind_sampler_states) CALL(set_framebuffer_state) CALL(set_tess_state) +CALL(set_patch_vertices) CALL(set_constant_buffer) CALL(set_inlinable_constants) CALL(set_sample_locations) @@ -32,6 +33,8 @@ CALL(draw_single) CALL(draw_single_drawid) CALL(draw_multi) CALL(draw_indirect) +CALL(draw_vstate_single) +CALL(draw_vstate_multi) CALL(launch_grid) CALL(resource_copy_region) CALL(blit) diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.c b/mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.c new file mode 100644 index 0000000000..3e9a254a46 --- /dev/null +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.c @@ -0,0 +1,96 @@ +/* + * Copyright © 2020 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "u_trace_gallium.h" +#include "u_inlines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" + +#include "u_tracepoints.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static void * +u_trace_pipe_create_ts_buffer(struct u_trace_context *utctx, uint32_t size) +{ + struct pipe_context *ctx = utctx->pctx; + + struct pipe_resource tmpl = { + .target = PIPE_BUFFER, + .format = PIPE_FORMAT_R8_UNORM, + .bind = PIPE_BIND_QUERY_BUFFER | PIPE_BIND_LINEAR, + .width0 = size, + .height0 = 1, + .depth0 = 1, + .array_size = 1, + }; + + return ctx->screen->resource_create(ctx->screen, &tmpl); +} + +static void +u_trace_pipe_delete_ts_buffer(struct u_trace_context *utctx, void *timestamps) +{ + struct pipe_resource *buffer = timestamps; + pipe_resource_reference(&buffer, NULL); +} + +void +u_trace_pipe_context_init(struct u_trace_context *utctx, + struct pipe_context *pctx, + u_trace_record_ts record_timestamp, + u_trace_read_ts read_timestamp, + u_trace_delete_flush_data delete_flush_data) +{ + u_trace_context_init(utctx, pctx, + u_trace_pipe_create_ts_buffer, + u_trace_pipe_delete_ts_buffer, + record_timestamp, + read_timestamp, + delete_flush_data); +} + +inline void +trace_framebuffer_state(struct u_trace *ut, void *cs, const struct pipe_framebuffer_state *pfb) +{ + if (likely(!ut->enabled)) + return; + + trace_framebuffer(ut, cs, pfb); + + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + if (pfb->cbufs[i]) { + trace_surface(ut, cs, pfb->cbufs[i]); + } + } + if (pfb->zsbuf) { + trace_surface(ut, cs, pfb->zsbuf); + } +} + +#ifdef __cplusplus +} +#endif diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.h b/mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.h new file mode 100644 index 0000000000..e37e3e6636 --- /dev/null +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_trace_gallium.h @@ -0,0 +1,57 @@ +/* + * Copyright © 2020 Google, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _U_TRACE_GALLIUM_H +#define _U_TRACE_GALLIUM_H + +#include "util/perf/u_trace.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Gallium specific u_trace helpers */ + +struct pipe_context; +struct pipe_framebuffer_state; + +void +u_trace_pipe_context_init(struct u_trace_context *utctx, + struct pipe_context *pctx, + u_trace_record_ts record_timestamp, + u_trace_read_ts read_timestamp, + u_trace_delete_flush_data delete_flush_data); + +/* + * In some cases it is useful to have composite tracepoints like this, + * to log more complex data structures. + */ + +void +trace_framebuffer_state(struct u_trace *ut, void *cs, const struct pipe_framebuffer_state *pfb); + +#ifdef __cplusplus +} +#endif + +#endif /* _U_TRACE_GALLIUM_H */ diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_tracepoints.py b/mesa 3D driver/src/gallium/auxiliary/util/u_tracepoints.py index f8a70d05c0..30aaab9df1 100644 --- a/mesa 3D driver/src/gallium/auxiliary/util/u_tracepoints.py +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_tracepoints.py @@ -37,6 +37,8 @@ sys.path.insert(0, args.import_path) from u_trace import Header from u_trace import Tracepoint +from u_trace import TracepointArg as Arg +from u_trace import TracepointArgStruct as ArgStruct from u_trace import utrace_generate # @@ -47,11 +49,11 @@ Header('pipe/p_state.h') Header('util/format/u_format.h') Tracepoint('surface', - args=[['const struct pipe_surface *', 'psurf']], - tp_struct=[['uint16_t', 'width', 'psurf->width'], - ['uint16_t', 'height', 'psurf->height'], - ['uint8_t', 'nr_samples', 'psurf->nr_samples'], - ['const char *', 'format', 'util_format_short_name(psurf->format)']], + args=[ArgStruct(type='const struct pipe_surface *', var='psurf')], + tp_struct=[Arg(type='uint16_t', name='width', var='psurf->width', c_format='%u'), + Arg(type='uint16_t', name='height', var='psurf->height', c_format='%u'), + Arg(type='uint8_t', name='nr_samples', var='psurf->nr_samples', c_format='%u'), + Arg(type='const char *', name='format', var='util_format_short_name(psurf->format)', c_format='%s')], tp_print=['%ux%u@%u, fmt=%s', '__entry->width', '__entry->height', @@ -61,12 +63,12 @@ Tracepoint('surface', # Note: called internally from trace_framebuffer_state() Tracepoint('framebuffer', - args=[['const struct pipe_framebuffer_state *', 'pfb']], - tp_struct=[['uint16_t', 'width', 'pfb->width'], - ['uint16_t', 'height', 'pfb->height'], - ['uint8_t', 'layers', 'pfb->layers'], - ['uint8_t', 'samples', 'pfb->samples'], - ['uint8_t', 'nr_cbufs', 'pfb->nr_cbufs']], + args=[ArgStruct(type='const struct pipe_framebuffer_state *', var='pfb')], + tp_struct=[Arg(type='uint16_t', name='width', var='pfb->width', c_format='%u'), + Arg(type='uint16_t', name='height', var='pfb->height', c_format='%u'), + Arg(type='uint8_t', name='layers', var='pfb->layers', c_format='%u'), + Arg(type='uint8_t', name='samples', var='pfb->samples', c_format='%u'), + Arg(type='uint8_t', name='nr_cbufs', var='pfb->nr_cbufs', c_format='%u')], tp_print=['%ux%ux%u@%u, nr_cbufs: %u', '__entry->width', '__entry->height', @@ -76,17 +78,17 @@ Tracepoint('framebuffer', ) Tracepoint('grid_info', - args=[['const struct pipe_grid_info *', 'pgrid']], - tp_struct=[['uint8_t', 'work_dim', 'pgrid->work_dim'], - ['uint16_t', 'block_x', 'pgrid->block[0]'], - ['uint16_t', 'block_y', 'pgrid->block[1]'], - ['uint16_t', 'block_z', 'pgrid->block[2]'], - ['uint16_t', 'grid_x', 'pgrid->grid[0]'], - ['uint16_t', 'grid_y', 'pgrid->grid[1]'], - ['uint16_t', 'grid_z', 'pgrid->grid[2]']], + 
args=[ArgStruct(type='const struct pipe_grid_info *', var='pgrid')], + tp_struct=[Arg(type='uint8_t', name='work_dim', var='pgrid->work_dim', c_format='%u'), + Arg(type='uint16_t', name='block_x', var='pgrid->block[0]', c_format='%u'), + Arg(type='uint16_t', name='block_y', var='pgrid->block[1]', c_format='%u'), + Arg(type='uint16_t', name='block_z', var='pgrid->block[2]', c_format='%u'), + Arg(type='uint16_t', name='grid_x', var='pgrid->grid[0]', c_format='%u'), + Arg(type='uint16_t', name='grid_y', var='pgrid->grid[1]', c_format='%u'), + Arg(type='uint16_t', name='grid_z', var='pgrid->grid[2]', c_format='%u')], tp_print=['work_dim=%u, block=%ux%ux%u, grid=%ux%ux%u', '__entry->work_dim', '__entry->block_x', '__entry->block_y', '__entry->block_z', '__entry->grid_x', '__entry->grid_y', '__entry->grid_z'], ) -utrace_generate(cpath=args.src, hpath=args.hdr) +utrace_generate(cpath=args.src, hpath=args.hdr, ctx_param='struct pipe_context *pctx') diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.c b/mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.c new file mode 100644 index 0000000000..f98a1071af --- /dev/null +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.c @@ -0,0 +1,134 @@ +/* + * Copyright 2021 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "util/u_vertex_state_cache.h" +#include "util/u_inlines.h" +#include "util/hash_table.h" +#include "util/set.h" + +static uint32_t key_hash(const void *key) +{ + const struct pipe_vertex_state *state = key; + + return _mesa_hash_data(&state->input, sizeof(state->input)); +} + +static bool key_equals(const void *a, const void *b) +{ + const struct pipe_vertex_state *sa = a; + const struct pipe_vertex_state *sb = b; + + return !memcmp(&sa->input, &sb->input, sizeof(sa->input)); +} + +void +util_vertex_state_cache_init(struct util_vertex_state_cache *cache, + pipe_create_vertex_state_func create, + pipe_vertex_state_destroy_func destroy) +{ + simple_mtx_init(&cache->lock, mtx_plain); + cache->set = _mesa_set_create(NULL, key_hash, key_equals); + cache->create = create; + cache->destroy = destroy; +} + +void +util_vertex_state_cache_deinit(struct util_vertex_state_cache *cache) +{ + if (cache->set) { + set_foreach(cache->set, entry) { + fprintf(stderr, "mesa: vertex state cache should be empty\n"); + assert(!"vertex state cache should be empty"); + } + + _mesa_set_destroy(cache->set, NULL); + simple_mtx_destroy(&cache->lock); + } +} + +struct pipe_vertex_state * +util_vertex_state_cache_get(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask, + struct util_vertex_state_cache *cache) +{ + struct pipe_vertex_state key; + + memset(&key, 0, sizeof(key)); + key.input.indexbuf = indexbuf; + key.input.vbuffer.stride = buffer->stride; + assert(!buffer->is_user_buffer); + key.input.vbuffer.buffer_offset = buffer->buffer_offset; + key.input.vbuffer.buffer = buffer->buffer; + key.input.num_elements = num_elements; + for (unsigned i = 0; i < num_elements; i++) + key.input.elements[i] = elements[i]; + key.input.full_velem_mask = full_velem_mask; + + uint32_t hash = key_hash(&key); + + /* Find the state in the live cache. */ + simple_mtx_lock(&cache->lock); + struct set_entry *entry = _mesa_set_search_pre_hashed(cache->set, hash, &key); + struct pipe_vertex_state *state = entry ? (void*)entry->key : NULL; + + /* Return if the state already exists. */ + if (state) { + /* Increase the refcount. */ + p_atomic_inc(&state->reference.count); + assert(state->reference.count >= 1); + simple_mtx_unlock(&cache->lock); + return state; + } + + state = cache->create(screen, buffer, elements, num_elements, indexbuf, + full_velem_mask); + if (state) { + assert(key_hash(state) == hash); + _mesa_set_add_pre_hashed(cache->set, hash, state); + } + + simple_mtx_unlock(&cache->lock); + return state; +} + +void +util_vertex_state_destroy(struct pipe_screen *screen, + struct util_vertex_state_cache *cache, + struct pipe_vertex_state *state) +{ + simple_mtx_lock(&cache->lock); + /* There could have been a thread race and the cache might have returned + * the vertex state being destroyed. Check the reference count and do + * nothing if it's positive. + */ + if (p_atomic_read(&state->reference.count) <= 0) { + _mesa_set_remove_key(cache->set, state); + cache->destroy(screen, state); + } + simple_mtx_unlock(&cache->lock); +} diff --git a/mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.h b/mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.h new file mode 100644 index 0000000000..902e91e43a --- /dev/null +++ b/mesa 3D driver/src/gallium/auxiliary/util/u_vertex_state_cache.h @@ -0,0 +1,67 @@ +/* + * Copyright 2021 Advanced Micro Devices, Inc. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* This deduplicates pipe_vertex_state CSOs to enable draw merging in + * u_threaded_context because the draw merging is possible only if different + * display lists use the same pipe_vertex_state CSO. + */ + +#ifndef U_VERTEX_STATE_CACHE_H +#define U_VERTEX_STATE_CACHE_H + +#include "util/simple_mtx.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" + +struct util_vertex_state_cache { + simple_mtx_t lock; + struct set *set; + + pipe_create_vertex_state_func create; + pipe_vertex_state_destroy_func destroy; +}; + +void +util_vertex_state_cache_init(struct util_vertex_state_cache *cache, + pipe_create_vertex_state_func create, + pipe_vertex_state_destroy_func destroy); + +void +util_vertex_state_cache_deinit(struct util_vertex_state_cache *cache); + +struct pipe_vertex_state * +util_vertex_state_cache_get(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask, + struct util_vertex_state_cache *cache); + +void +util_vertex_state_destroy(struct pipe_screen *screen, + struct util_vertex_state_cache *cache, + struct pipe_vertex_state *state); + +#endif diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_bicubic_filter.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_bicubic_filter.c index 674aa587d6..457f2cf68b 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_bicubic_filter.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_bicubic_filter.c @@ -456,7 +456,7 @@ vl_bicubic_filter_render(struct vl_bicubic_filter *filter, filter->pipe->bind_sampler_states(filter->pipe, PIPE_SHADER_FRAGMENT, 0, 1, &filter->sampler); filter->pipe->set_sampler_views(filter->pipe, PIPE_SHADER_FRAGMENT, - 0, 1, 0, &src); + 0, 1, 0, false, &src); filter->pipe->bind_vs_state(filter->pipe, filter->vs); filter->pipe->bind_fs_state(filter->pipe, filter->fs); filter->pipe->set_framebuffer_state(filter->pipe, &fb_state); diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_cs.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_cs.c index 813aa14891..ad2175213e 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_cs.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_cs.c @@ -727,7 +727,7 @@ draw_layers(struct vl_compositor *c, c->pipe->bind_sampler_states(c->pipe, 
PIPE_SHADER_COMPUTE, 0, num_sampler_views, layer->samplers); c->pipe->set_sampler_views(c->pipe, PIPE_SHADER_COMPUTE, 0, - num_sampler_views, 0, samplers); + num_sampler_views, 0, false, samplers); cs_launch(c, layer->cs, &(drawn.area)); @@ -735,7 +735,7 @@ draw_layers(struct vl_compositor *c, c->pipe->set_shader_images(c->pipe, PIPE_SHADER_COMPUTE, 0, 0, 1, NULL); c->pipe->set_constant_buffer(c->pipe, PIPE_SHADER_COMPUTE, 0, false, NULL); c->pipe->set_sampler_views(c->pipe, PIPE_SHADER_FRAGMENT, 0, 0, - num_sampler_views, NULL); + num_sampler_views, false, NULL); c->pipe->bind_compute_state(c->pipe, NULL); c->pipe->bind_sampler_states(c->pipe, PIPE_SHADER_COMPUTE, 0, num_sampler_views, NULL); diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_gfx.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_gfx.c index c4eba2293c..24f5625b47 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_gfx.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_compositor_gfx.c @@ -665,7 +665,7 @@ draw_layers(struct vl_compositor *c, struct vl_compositor_state *s, struct u_rec c->pipe->bind_sampler_states(c->pipe, PIPE_SHADER_FRAGMENT, 0, num_sampler_views, layer->samplers); c->pipe->set_sampler_views(c->pipe, PIPE_SHADER_FRAGMENT, 0, - num_sampler_views, 0, samplers); + num_sampler_views, 0, false, samplers); util_draw_arrays(c->pipe, PIPE_PRIM_QUADS, vb_index * 4, 4); vb_index++; diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_deint_filter.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_deint_filter.c index 950739b6ca..fa6b5046e9 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_deint_filter.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_deint_filter.c @@ -501,7 +501,8 @@ vl_deint_filter_render(struct vl_deint_filter *filter, sampler_views[1] = prev_sv[k]; sampler_views[2] = cur_sv[k]; sampler_views[3] = next_sv[k]; - filter->pipe->set_sampler_views(filter->pipe, PIPE_SHADER_FRAGMENT, 0, 4, 0, sampler_views); + filter->pipe->set_sampler_views(filter->pipe, PIPE_SHADER_FRAGMENT, + 0, 4, 0, false, sampler_views); /* blit current field */ fb_state.cbufs[0] = blit_surf; diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_idct.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_idct.c index 9368030919..58fd5329d0 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_idct.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_idct.c @@ -836,7 +836,7 @@ vl_idct_flush(struct vl_idct *idct, struct vl_idct_buffer *buffer, unsigned num_ 0, 2, idct->samplers); idct->pipe->set_sampler_views(idct->pipe, PIPE_SHADER_FRAGMENT, 0, 2, 0, - buffer->sampler_views.stage[0]); + false, buffer->sampler_views.stage[0]); /* mismatch control */ idct->pipe->set_framebuffer_state(idct->pipe, &buffer->fb_state_mismatch); @@ -863,6 +863,6 @@ vl_idct_prepare_stage2(struct vl_idct *idct, struct vl_idct_buffer *buffer) idct->pipe->bind_sampler_states(idct->pipe, PIPE_SHADER_FRAGMENT, 0, 2, idct->samplers); idct->pipe->set_sampler_views(idct->pipe, PIPE_SHADER_FRAGMENT, - 0, 2, 0, buffer->sampler_views.stage[1]); + 0, 2, 0, false, buffer->sampler_views.stage[1]); } diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_matrix_filter.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_matrix_filter.c index 6c912ea864..f3e3bd5c63 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_matrix_filter.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_matrix_filter.c @@ -295,7 +295,7 @@ vl_matrix_filter_render(struct vl_matrix_filter *filter, filter->pipe->bind_sampler_states(filter->pipe, 
PIPE_SHADER_FRAGMENT, 0, 1, &filter->sampler); filter->pipe->set_sampler_views(filter->pipe, PIPE_SHADER_FRAGMENT, - 0, 1, 0, &src); + 0, 1, 0, false, &src); filter->pipe->bind_vs_state(filter->pipe, filter->vs); filter->pipe->bind_fs_state(filter->pipe, filter->fs); filter->pipe->set_framebuffer_state(filter->pipe, &fb_state); diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_mc.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_mc.c index 0b2a210cb1..d331da1d5f 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_mc.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_mc.c @@ -622,7 +622,7 @@ vl_mc_render_ref(struct vl_mc *renderer, struct vl_mc_buffer *buffer, struct pip renderer->pipe->bind_fs_state(renderer->pipe, renderer->fs_ref); renderer->pipe->set_sampler_views(renderer->pipe, PIPE_SHADER_FRAGMENT, - 0, 1, 0, &ref); + 0, 1, 0, false, &ref); renderer->pipe->bind_sampler_states(renderer->pipe, PIPE_SHADER_FRAGMENT, 0, 1, &renderer->sampler_ref); diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_median_filter.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_median_filter.c index 90ae4fcf92..ca935237b8 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_median_filter.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_median_filter.c @@ -399,7 +399,7 @@ vl_median_filter_render(struct vl_median_filter *filter, filter->pipe->bind_sampler_states(filter->pipe, PIPE_SHADER_FRAGMENT, 0, 1, &filter->sampler); filter->pipe->set_sampler_views(filter->pipe, PIPE_SHADER_FRAGMENT, - 0, 1, 0, &src); + 0, 1, 0, false, &src); filter->pipe->bind_vs_state(filter->pipe, filter->vs); filter->pipe->bind_fs_state(filter->pipe, filter->fs); filter->pipe->set_framebuffer_state(filter->pipe, &fb_state); diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c index 149a017f8a..8ce89351a1 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c @@ -830,7 +830,7 @@ vl_mpeg12_end_frame(struct pipe_video_codec *decoder, vl_idct_prepare_stage2(i ? 
&dec->idct_c : &dec->idct_y, &buf->idct[plane]); else { dec->context->set_sampler_views(dec->context, - PIPE_SHADER_FRAGMENT, 0, 1, 0, + PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &mc_source_sv[plane]); dec->context->bind_sampler_states(dec->context, PIPE_SHADER_FRAGMENT, diff --git a/mesa 3D driver/src/gallium/auxiliary/vl/vl_zscan.c b/mesa 3D driver/src/gallium/auxiliary/vl/vl_zscan.c index 42ba13bd9a..543c9340fd 100644 --- a/mesa 3D driver/src/gallium/auxiliary/vl/vl_zscan.c +++ b/mesa 3D driver/src/gallium/auxiliary/vl/vl_zscan.c @@ -608,7 +608,7 @@ vl_zscan_render(struct vl_zscan *zscan, struct vl_zscan_buffer *buffer, unsigned zscan->pipe->set_framebuffer_state(zscan->pipe, &buffer->fb_state); zscan->pipe->set_viewport_states(zscan->pipe, 0, 1, &buffer->viewport); zscan->pipe->set_sampler_views(zscan->pipe, PIPE_SHADER_FRAGMENT, - 0, 3, 0, &buffer->src); + 0, 3, 0, false, &buffer->src); zscan->pipe->bind_vs_state(zscan->pipe, zscan->vs); zscan->pipe->bind_fs_state(zscan->pipe, zscan->fs); util_draw_arrays_instanced(zscan->pipe, PIPE_PRIM_QUADS, 0, 4, 0, num_instances); diff --git a/mesa 3D driver/src/gallium/drivers/asahi/agx_state.c b/mesa 3D driver/src/gallium/drivers/asahi/agx_state.c index 6ac4214096..6a9027a0d6 100644 --- a/mesa 3D driver/src/gallium/drivers/asahi/agx_state.c +++ b/mesa 3D driver/src/gallium/drivers/asahi/agx_state.c @@ -496,6 +496,7 @@ agx_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct agx_context *ctx = agx_context(pctx); @@ -511,8 +512,14 @@ agx_set_sampler_views(struct pipe_context *pctx, if (views[i]) new_nr = i + 1; - pipe_sampler_view_reference((struct pipe_sampler_view **) - &ctx->stage[shader].textures[i], views[i]); + if (take_ownership) { + pipe_sampler_view_reference((struct pipe_sampler_view **) + &ctx->stage[shader].textures[i], NULL); + ctx->stage[shader].textures[i] = (struct agx_sampler_view *)views[i]; + } else { + pipe_sampler_view_reference((struct pipe_sampler_view **) + &ctx->stage[shader].textures[i], views[i]); + } } for (; i < ctx->stage[shader].texture_count; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_batch.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_batch.c index 0cf32c69e5..d029bd0e32 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_batch.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_batch.c @@ -216,7 +216,7 @@ crocus_init_batch(struct crocus_context *ice, if (devinfo->ver == 6) batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT; - if (INTEL_DEBUG & DEBUG_BATCH) { + if (INTEL_DEBUG(DEBUG_BATCH)) { /* The shadow doesn't get relocs written so state decode fails. */ batch->use_shadow_copy = false; } else @@ -247,12 +247,12 @@ crocus_init_batch(struct crocus_context *ice, batch->other_batches[j++] = &ice->batches[i]; } - if (INTEL_DEBUG & DEBUG_BATCH) { + if (INTEL_DEBUG(DEBUG_BATCH)) { batch->state_sizes = _mesa_hash_table_u64_create(NULL); const unsigned decode_flags = INTEL_BATCH_DECODE_FULL | - ((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) | + (INTEL_DEBUG(DEBUG_COLOR) ? 
INTEL_BATCH_DECODE_IN_COLOR : 0) | INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS; intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo, stderr, @@ -881,7 +881,7 @@ submit_batch(struct crocus_batch *batch) } int ret = 0; - if (!batch->screen->no_hw && + if (!batch->screen->devinfo.no_hw && intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf)) ret = -errno; @@ -940,8 +940,7 @@ _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line) finish_growing_bos(&batch->state); int ret = submit_batch(batch); - if (unlikely(INTEL_DEBUG & - (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL))) { + if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) { int bytes_for_commands = crocus_batch_bytes_used(batch); int second_bytes = 0; if (batch->command.bo != batch->exec_bos[0]) { @@ -959,12 +958,12 @@ _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line) batch->command.relocs.reloc_count, batch->state.relocs.reloc_count); - if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) { + if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT)) { dump_fence_list(batch); dump_validation_list(batch); } - if (INTEL_DEBUG & DEBUG_BATCH) { + if (INTEL_DEBUG(DEBUG_BATCH)) { decode_batch(batch); } } @@ -985,7 +984,7 @@ _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line) util_dynarray_clear(&batch->exec_fences); - if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) { + if (INTEL_DEBUG(DEBUG_SYNC)) { dbg_printf("waiting for idle\n"); crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed; this is a nop */ } @@ -1009,7 +1008,7 @@ _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line) if (ret < 0) { #ifdef DEBUG - const bool color = INTEL_DEBUG & DEBUG_COLOR; + const bool color = INTEL_DEBUG(DEBUG_COLOR); fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n", color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : ""); #endif diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_blit.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_blit.c index 7620efc932..2fad9a30aa 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_blit.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_blit.c @@ -754,20 +754,6 @@ crocus_copy_region(struct blorp_context *blorp, tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); } -static struct crocus_batch * -get_preferred_batch(struct crocus_context *ice, struct crocus_bo *bo) -{ - /* If the compute batch is already using this buffer, we'd prefer to - * continue queueing in the compute batch. - */ - if (crocus_batch_references(&ice->batches[CROCUS_BATCH_COMPUTE], bo)) - return &ice->batches[CROCUS_BATCH_COMPUTE]; - - /* Otherwise default to the render batch. */ - return &ice->batches[CROCUS_BATCH_RENDER]; -} - - /** * The pipe->resource_copy_region() driver hook. * @@ -795,21 +781,6 @@ crocus_resource_copy_region(struct pipe_context *ctx, if (crocus_resource_unfinished_aux_import(dst)) crocus_resource_finish_aux_import(ctx->screen, dst); - /* Use MI_COPY_MEM_MEM for tiny (<= 16 byte, % 4) buffer copies. 
*/ - if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER && - (src_box->width % 4 == 0) && src_box->width <= 16 && - screen->vtbl.copy_mem_mem) { - struct crocus_bo *dst_bo = crocus_resource_bo(p_dst); - batch = get_preferred_batch(ice, dst_bo); - crocus_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4)); - crocus_emit_pipe_control_flush(batch, - "stall for MI_COPY_MEM_MEM copy_region", - PIPE_CONTROL_CS_STALL); - screen->vtbl.copy_mem_mem(batch, dst_bo, dstx, crocus_resource_bo(p_src), - src_box->x, src_box->width); - return; - } - if (devinfo->ver < 6 && util_format_is_depth_or_stencil(p_dst->format)) { util_resource_copy_region(ctx, p_dst, dst_level, dstx, dsty, dstz, p_src, src_level, src_box); diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_blorp.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_blorp.c index c6f4c06fc9..4bf7523da5 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_blorp.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_blorp.c @@ -159,6 +159,16 @@ blorp_alloc_dynamic_state(struct blorp_batch *blorp_batch, return stream_state(batch, size, alignment, offset, NULL); } +UNUSED static void * +blorp_alloc_general_state(struct blorp_batch *blorp_batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + /* Use dynamic state range for general state on crocus. */ + return blorp_alloc_dynamic_state(blorp_batch, size, alignment, offset); +} + static void blorp_alloc_binding_table(struct blorp_batch *blorp_batch, unsigned num_entries, diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_blt.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_blt.c index d6eaa1513f..a63f62bd2f 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_blt.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_blt.c @@ -69,8 +69,10 @@ blt_set_alpha_to_one(struct crocus_batch *batch, uint32_t tile_x, tile_y; uint64_t offset_B; ASSERTED uint32_t z_offset_el, array_offset; - isl_tiling_get_intratile_offset_el(dst->surf.tiling, - cpp * 8, dst->surf.row_pitch_B, + isl_tiling_get_intratile_offset_el(dst->surf.tiling, dst->surf.dim, + dst->surf.msaa_layout, + cpp * 8, dst->surf.samples, + dst->surf.row_pitch_B, dst->surf.array_pitch_el_rows, chunk_x, chunk_y, 0, 0, &offset_B, @@ -324,8 +326,10 @@ static bool crocus_emit_blt(struct crocus_batch *batch, uint64_t src_offset; uint32_t src_tile_x, src_tile_y; ASSERTED uint32_t z_offset_el, array_offset; - isl_tiling_get_intratile_offset_el(src->surf.tiling, - src_cpp * 8, src->surf.row_pitch_B, + isl_tiling_get_intratile_offset_el(src->surf.tiling, src->surf.dim, + src->surf.msaa_layout, + src_cpp * 8, src->surf.samples, + src->surf.row_pitch_B, src->surf.array_pitch_el_rows, src_x + chunk_x, src_y + chunk_y, 0, 0, &src_offset, @@ -336,8 +340,10 @@ static bool crocus_emit_blt(struct crocus_batch *batch, uint64_t dst_offset; uint32_t dst_tile_x, dst_tile_y; - isl_tiling_get_intratile_offset_el(dst->surf.tiling, - dst_cpp * 8, dst->surf.row_pitch_B, + isl_tiling_get_intratile_offset_el(dst->surf.tiling, dst->surf.dim, + dst->surf.msaa_layout, + dst_cpp * 8, dst->surf.samples, + dst->surf.row_pitch_B, dst->surf.array_pitch_el_rows, dst_x + chunk_x, dst_y + chunk_y, 0, 0, &dst_offset, diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_bufmgr.h b/mesa 3D driver/src/gallium/drivers/crocus/crocus_bufmgr.h index 0f3408c123..e431044844 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_bufmgr.h +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_bufmgr.h @@ -302,7 
+302,6 @@ int crocus_bo_busy(struct crocus_bo *bo); */ int crocus_bo_madvise(struct crocus_bo *bo, int madv); -/* drm_bacon_bufmgr_gem.c */ struct crocus_bufmgr * crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, bool bo_reuse); diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_clear.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_clear.c index 261f19ac9d..029131caf5 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_clear.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_clear.c @@ -73,7 +73,7 @@ can_fast_clear_color(struct crocus_context *ice, { struct crocus_resource *res = (void *) p_res; - if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR)) return false; if (!isl_aux_usage_has_fast_clears(res->aux.usage)) @@ -360,7 +360,6 @@ clear_color(struct crocus_context *ice, return; } - bool color_write_disable[4] = { false, false, false, false }; enum isl_aux_usage aux_usage = crocus_resource_render_aux_usage(ice, res, level, format, false); @@ -381,7 +380,7 @@ clear_color(struct crocus_context *ice, blorp_clear(&blorp_batch, &surf, format, swizzle, level, box->z, box->depth, box->x, box->y, box->x + box->width, box->y + box->height, - color, color_write_disable); + color, 0 /* color_write_disable */); blorp_batch_finish(&blorp_batch); crocus_flush_and_dirty_for_history(ice, batch, res, @@ -408,7 +407,7 @@ can_fast_clear_depth(struct crocus_context *ice, if (devinfo->ver < 6) return false; - if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR)) return false; /* Check for partial clears */ diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.c index 44d947de84..ed0a372300 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.c @@ -325,8 +325,6 @@ crocus_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags) return threaded_context_create(ctx, &screen->transfer_pool, crocus_replace_buffer_storage, NULL, /* TODO: asynchronous flushes? */ - NULL, - false, &ice->thrctx); } diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.h b/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.h index 15b41079ce..f8f89f8d77 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.h +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_context.h @@ -318,9 +318,6 @@ struct crocus_uncompiled_shader { /** Have any shader variants been compiled yet? */ bool compiled_once; - /** Should we use ALT mode for math? Useful for ARB programs. */ - bool use_alt_mode; - bool needs_edge_flag; /** Constant data scraped from the shader by nir_opt_large_constants */ @@ -587,9 +584,11 @@ struct crocus_context { bool primitive_restart; unsigned cut_index; + enum pipe_prim_type reduced_prim_mode:8; enum pipe_prim_type prim_mode:8; bool prim_is_points_or_lines; uint8_t vertices_per_patch; + uint8_t patch_vertices; bool window_space_position; @@ -624,6 +623,8 @@ struct crocus_context { struct crocus_shader_state shaders[MESA_SHADER_STAGES]; + /* track if geom shader is active for IVB GT2 workaround */ + bool gs_enabled; /** Do vertex shader uses shader draw parameters ? */ bool vs_uses_draw_params; bool vs_uses_derived_draw_params; @@ -753,7 +754,7 @@ struct crocus_context { }; #define perf_debug(dbg, ...) 
do { \
-   if (INTEL_DEBUG & DEBUG_PERF) \
+   if (INTEL_DEBUG(DEBUG_PERF)) \
       dbg_printf(__VA_ARGS__); \
    if (unlikely(dbg)) \
       pipe_debug_message(dbg, PERF_INFO, __VA_ARGS__); \
diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_disk_cache.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_disk_cache.c
index c84d043fbc..037136ec43 100644
--- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_disk_cache.c
+++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_disk_cache.c
@@ -237,7 +237,7 @@ void
 crocus_disk_cache_init(struct crocus_screen *screen)
 {
 #ifdef ENABLE_SHADER_CACHE
-   if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK)
+   if (INTEL_DEBUG(DEBUG_DISK_CACHE_DISABLE_MASK))
       return;
 
    /* array length = print length + nul char + 1 extra to verify it's unused */
diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_draw.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_draw.c
index feef4e78ec..558083045a 100644
--- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_draw.c
+++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_draw.c
@@ -41,7 +41,6 @@
 #include "crocus_context.h"
 #include "crocus_defines.h"
 #include "util/u_prim_restart.h"
-#include "indices/u_primconvert.h"
 #include "util/u_prim.h"
 
 static bool
@@ -139,11 +138,18 @@ crocus_update_draw_info(struct crocus_context *ice,
    if (ice->state.prim_mode != mode) {
       ice->state.prim_mode = mode;
 
+      enum pipe_prim_type reduced = u_reduced_prim(mode);
+      if (ice->state.reduced_prim_mode != reduced) {
+         if (screen->devinfo.ver < 6)
+            ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+         /* If the reduced prim changes, the WM needs updating. */
+         ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS;
+         ice->state.reduced_prim_mode = reduced;
+      }
+
       if (screen->devinfo.ver == 8)
          ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_TOPOLOGY;
 
-      if (screen->devinfo.ver < 6)
-         ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
       if (screen->devinfo.ver <= 6)
          ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
@@ -159,8 +165,8 @@ crocus_update_draw_info(struct crocus_context *ice,
    }
 
    if (info->mode == PIPE_PRIM_PATCHES &&
-       ice->state.vertices_per_patch != info->vertices_per_patch) {
-      ice->state.vertices_per_patch = info->vertices_per_patch;
+       ice->state.vertices_per_patch != ice->state.patch_vertices) {
+      ice->state.vertices_per_patch = ice->state.patch_vertices;
 
       if (screen->devinfo.ver == 8)
          ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_TOPOLOGY;
@@ -399,8 +405,8 @@ crocus_draw_vbo(struct pipe_context *ctx,
    /* We can't safely re-emit 3DSTATE_SO_BUFFERS because it may zero the
    * write offsets, changing the behavior.
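    * (That is why the DEBUG_REEMIT path below masks out
    * CROCUS_DIRTY_GEN7_SO_BUFFERS and CROCUS_DIRTY_GEN6_SVBI rather than
    * re-emitting everything.)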
*/ - if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) { - ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER & ~CROCUS_DIRTY_GEN7_SO_BUFFERS; + if (INTEL_DEBUG(DEBUG_REEMIT)) { + ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER & ~(CROCUS_DIRTY_GEN7_SO_BUFFERS | CROCUS_DIRTY_GEN6_SVBI); ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER; } @@ -478,7 +484,7 @@ crocus_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid) if (!crocus_check_conditional_render(ice)) return; - if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) { + if (INTEL_DEBUG(DEBUG_REEMIT)) { ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE; ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE; } diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_program.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_program.c index 33c97ca6a3..57dbccf155 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_program.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_program.c @@ -951,7 +951,7 @@ crocus_setup_binding_table(const struct intel_device_info *devinfo, } bt->size_bytes = next * 4; - if (unlikely(INTEL_DEBUG & DEBUG_BT)) { + if (INTEL_DEBUG(DEBUG_BT)) { crocus_print_binding_table(stderr, gl_shader_stage_name(info->stage), bt); } @@ -1200,7 +1200,7 @@ crocus_compile_vs(struct crocus_context *ice, nir_shader_gather_info(nir, impl); } - prog_data->use_alt_mode = ish->use_alt_mode; + prog_data->use_alt_mode = nir->info.is_arb_asm; crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, &num_system_values, &num_cbufs); @@ -1829,7 +1829,7 @@ crocus_compile_fs(struct crocus_context *ice, nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); - prog_data->use_alt_mode = ish->use_alt_mode; + prog_data->use_alt_mode = nir->info.is_arb_asm; crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values, &num_system_values, &num_cbufs); @@ -2082,7 +2082,7 @@ crocus_update_compiled_clip(struct crocus_context *ice) memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode)); } - key.primitive = u_reduced_prim(ice->state.prim_mode); + key.primitive = ice->state.reduced_prim_mode; key.attrs = ice->shaders.last_vue_map->slots_valid; struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice); @@ -2230,7 +2230,7 @@ crocus_update_compiled_sf(struct crocus_context *ice) key.attrs = ice->shaders.last_vue_map->slots_valid; - switch (u_reduced_prim(ice->state.prim_mode)) { + switch (ice->state.reduced_prim_mode) { case GL_TRIANGLES: default: if (key.attrs & BITFIELD64_BIT(VARYING_SLOT_EDGE)) @@ -2645,24 +2645,9 @@ crocus_get_scratch_space(struct crocus_context *ice, struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage]; - unsigned subslice_total = screen->subslice_total; - subslice_total = 4 * devinfo->num_slices; - // assert(subslice_total >= screen->subslice_total); - if (!*bop) { - unsigned scratch_ids_per_subslice = devinfo->max_cs_threads; - - uint32_t max_threads[] = { - [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, - [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, - [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, - [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, - [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, - [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total, - }; - - uint32_t size = per_thread_scratch * max_threads[stage]; - + assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids)); + uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage]; *bop = crocus_bo_alloc(bufmgr, "scratch", size); } 
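The scratch-space hunk above replaces the hand-rolled per-stage thread maxima with the precomputed devinfo->max_scratch_ids table. A minimal sketch of the resulting sizing logic, assuming only that max_scratch_ids is indexed by the MESA_SHADER_* stage as the assert implies:

   /* Hedged sketch: scratch BO sizing after this change. */
   static uint32_t
   scratch_bo_size(const struct intel_device_info *devinfo,
                   gl_shader_stage stage, unsigned per_thread_scratch)
   {
      assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids));
      return per_thread_scratch * devinfo->max_scratch_ids[stage];
   }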
@@ -2698,7 +2683,7 @@ crocus_create_uncompiled_shader(struct pipe_context *ctx,
 
    brw_preprocess_nir(screen->compiler, nir, NULL);
 
-   NIR_PASS_V(nir, brw_nir_lower_storage_image, devinfo, false);
+   NIR_PASS_V(nir, brw_nir_lower_storage_image, devinfo);
    NIR_PASS_V(nir, crocus_lower_storage_image_derefs);
 
    nir_sweep(nir);
@@ -2710,10 +2695,6 @@ crocus_create_uncompiled_shader(struct pipe_context *ctx,
       update_so_info(&ish->stream_output, nir->info.outputs_written);
    }
 
-   /* Save this now before potentially dropping nir->info.name */
-   if (nir->info.name && strncmp(nir->info.name, "ARB", 3) == 0)
-      ish->use_alt_mode = true;
-
    if (screen->disk_cache) {
       /* Serialize the NIR to a binary blob that we can hash for the disk
        * cache. Drop unnecessary information (like variable names)
diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_program_cache.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_program_cache.c
index d2d4b82175..52d8bbf0b4 100644
--- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_program_cache.c
+++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_program_cache.c
@@ -155,10 +155,11 @@ crocus_cache_new_bo(struct crocus_context *ice,
    ice->shaders.cache_bo = new_bo;
    ice->shaders.cache_bo_map = map;
 
-   if (screen->devinfo.ver == 4) {
+   if (screen->devinfo.ver <= 5) {
       /* Re-emit all shaders on GEN4-GEN5. */
       ice->state.dirty |= CROCUS_DIRTY_CLIP | CROCUS_DIRTY_RASTER |
                           CROCUS_DIRTY_WM;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
    }
    ice->batches[CROCUS_BATCH_RENDER].state_base_address_emitted = false;
    ice->batches[CROCUS_BATCH_COMPUTE].state_base_address_emitted = false;
diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_query.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_query.c
index df3f310364..152fd4fa28 100644
--- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_query.c
+++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_query.c
@@ -659,7 +659,7 @@ crocus_get_query_result(struct pipe_context *ctx,
    struct crocus_screen *screen = (void *) ctx->screen;
    const struct intel_device_info *devinfo = &screen->devinfo;
 
-   if (unlikely(screen->no_hw)) {
+   if (unlikely(screen->devinfo.no_hw)) {
      result->u64 = 0;
      return true;
    }
@@ -678,8 +678,12 @@ crocus_get_query_result(struct pipe_context *ctx,
    }
    assert(READ_ONCE(q->map->snapshots_landed));
 #else
-   if (crocus_wait_syncobj(ctx->screen, q->syncobj, wait ? INT64_MAX : 0))
+   if (crocus_wait_syncobj(ctx->screen, q->syncobj, wait ?
INT64_MAX : 0)) {
+      /* If we've waited and timed out, mark the query as ready to avoid an
+       * infinite loop. */
+      if (wait)
+         q->ready = true;
       return false;
+   }
 #endif
       calculate_result_on_cpu(devinfo, q);
    }
diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_resource.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_resource.c
index 2c8ea3a652..11309b8bef 100644
--- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_resource.c
+++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_resource.c
@@ -162,7 +162,7 @@ pipe_bind_to_isl_usage(unsigned bindings)
    if (bindings & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SHADER_BUFFER))
       usage |= ISL_SURF_USAGE_STORAGE_BIT;
 
-   if (bindings & PIPE_BIND_DISPLAY_TARGET)
+   if (bindings & PIPE_BIND_SCANOUT)
       usage |= ISL_SURF_USAGE_DISPLAY_BIT;
    return usage;
 }
@@ -356,7 +356,7 @@ crocus_alloc_resource(struct pipe_screen *pscreen,
    res->base.b.screen = pscreen;
    res->orig_screen = crocus_pscreen_ref(pscreen);
    pipe_reference_init(&res->base.b.reference, 1);
-   threaded_resource_init(&res->base.b);
+   threaded_resource_init(&res->base.b, false, 0);
    if (templ->target == PIPE_BUFFER)
       util_range_init(&res->valid_buffer_range);
@@ -434,11 +434,11 @@ crocus_resource_configure_aux(struct crocus_screen *screen,
       isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf);
 
    const bool has_hiz = devinfo->ver >= 6 && !res->mod_info &&
-      !(INTEL_DEBUG & DEBUG_NO_HIZ) &&
+      !INTEL_DEBUG(DEBUG_NO_HIZ) &&
      isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf,
                            &res->aux.surf);
 
   const bool has_ccs =
-      ((devinfo->ver >= 7 && !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) ||
+      ((devinfo->ver >= 7 && !res->mod_info && !INTEL_DEBUG(DEBUG_NO_RBC)) ||
       (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) &&
      isl_surf_get_ccs_surf(&screen->isl_dev, &res->surf, NULL,
                            &res->aux.surf, 0);
diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.c
index 7a741e98f3..16060de063 100644
--- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.c
+++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.c
@@ -694,7 +694,7 @@ crocus_shader_perf_log(void *data, unsigned *id, const char *fmt, ...)
va_list args; va_start(args, fmt); - if (unlikely(INTEL_DEBUG & DEBUG_PERF)) { + if (INTEL_DEBUG(DEBUG_PERF)) { va_list args_copy; va_copy(args_copy, args); vfprintf(stderr, fmt, args_copy); @@ -746,7 +746,6 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config) if (!intel_get_device_info_from_fd(fd, &screen->devinfo)) return NULL; screen->pci_id = screen->devinfo.chipset_id; - screen->no_hw = screen->devinfo.no_hw; if (screen->devinfo.ver > 8) return NULL; @@ -762,9 +761,6 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config) screen->aperture_bytes = get_aperture_size(fd); - if (getenv("INTEL_NO_HW") != NULL) - screen->no_hw = true; - driParseConfigFiles(config->options, config->options_info, 0, "crocus", NULL, NULL, NULL, 0, NULL, 0); @@ -817,9 +813,6 @@ crocus_screen_create(int fd, const struct pipe_screen_config *config) slab_create_parent(&screen->transfer_pool, sizeof(struct crocus_transfer), 64); - screen->subslice_total = intel_device_info_subslice_total(&screen->devinfo); - assert(screen->subslice_total >= 1); - struct pipe_screen *pscreen = &screen->base; crocus_init_screen_fence_functions(pscreen); diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.h b/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.h index 652f81388b..d8fc1f2d15 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.h +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_screen.h @@ -185,8 +185,6 @@ struct crocus_screen { /** PCI ID for our GPU device */ int pci_id; - bool no_hw; - struct crocus_vtable vtbl; /** Global program_string_id counter (see get_program_string_id()) */ @@ -203,8 +201,6 @@ struct crocus_screen { bool always_flush_cache; } driconf; - unsigned subslice_total; - uint64_t aperture_bytes; struct intel_device_info devinfo; diff --git a/mesa 3D driver/src/gallium/drivers/crocus/crocus_state.c b/mesa 3D driver/src/gallium/drivers/crocus/crocus_state.c index 73ad5c2011..f575fdb815 100644 --- a/mesa 3D driver/src/gallium/drivers/crocus/crocus_state.c +++ b/mesa 3D driver/src/gallium/drivers/crocus/crocus_state.c @@ -426,7 +426,7 @@ flush_before_state_base_change(struct crocus_batch *batch) * rendering. It's a bit of a big hammer but it appears to work. */ const unsigned dc_flush = - batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0; + GFX_VER >= 7 ? 
PIPE_CONTROL_DATA_CACHE_FLUSH : 0; crocus_emit_end_of_pipe_sync(batch, "change STATE_BASE_ADDRESS (flushes)", PIPE_CONTROL_RENDER_TARGET_FLUSH | @@ -698,7 +698,6 @@ static bool crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize, unsigned vsize, unsigned sfsize) { - const struct intel_device_info *devinfo = &batch->screen->devinfo; struct crocus_context *ice = batch->ice; if (csize < limits[URB_CS].min_entry_size) csize = limits[URB_CS].min_entry_size; @@ -729,7 +728,7 @@ crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize, ice->urb.constrained = 0; - if (devinfo->ver == 5) { + if (GFX_VER == 5) { ice->urb.nr_vs_entries = 128; ice->urb.nr_sf_entries = 48; if (check_urb_layout(ice)) { @@ -739,7 +738,7 @@ crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize, ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries; ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries; } - } else if (devinfo->is_g4x) { + } else if (GFX_VERx10 == 45) { ice->urb.nr_vs_entries = 64; if (check_urb_layout(ice)) { goto done; @@ -771,12 +770,12 @@ crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize, exit(1); } - if (unlikely(INTEL_DEBUG & (DEBUG_URB|DEBUG_PERF))) + if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF)) fprintf(stderr, "URB CONSTRAINED\n"); } done: - if (unlikely(INTEL_DEBUG & DEBUG_URB)) + if (INTEL_DEBUG(DEBUG_URB)) fprintf(stderr, "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n", ice->urb.vs_start, @@ -1197,7 +1196,7 @@ emit_l3_state(struct crocus_batch *batch, bool compute) compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d; setup_l3_config(batch, cfg); - if (unlikely(INTEL_DEBUG & DEBUG_L3)) { + if (INTEL_DEBUG(DEBUG_L3)) { intel_dump_l3_config(cfg, stderr); } } @@ -1246,7 +1245,7 @@ emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline) * MI_PIPELINE_SELECT command to change the Pipeline Select Mode." */ const unsigned dc_flush = - batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0; + GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0; crocus_emit_pipe_control_flush(batch, "workaround: PIPELINE_SELECT flushes (1/2)", PIPE_CONTROL_RENDER_TARGET_FLUSH | @@ -1321,13 +1320,8 @@ emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline) static void crocus_alloc_push_constants(struct crocus_batch *batch) { -#if GFX_VERx10 == 75 - const unsigned push_constant_kb = batch->screen->devinfo.gt == 3 ? 32 : 16; -#elif GFX_VER == 8 - const unsigned push_constant_kb = 32; -#else - const unsigned push_constant_kb = 16; -#endif + const unsigned push_constant_kb = + batch->screen->devinfo.max_constant_urb_size_kb; unsigned size_per_stage = push_constant_kb / 5; /* For now, we set a static partitioning of the push constant area, @@ -2026,6 +2020,9 @@ crocus_create_rasterizer_state(struct pipe_context *ctx, sf.LineEndCapAntialiasingRegionWidth = state->line_smooth ? _10pixels : _05pixels; sf.LastPixelEnable = state->line_last_pixel; +#if GFX_VER <= 7 + sf.AntialiasingEnable = state->line_smooth; +#endif #if GFX_VER == 8 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen; if (screen->devinfo.is_cherryview) @@ -2696,7 +2693,7 @@ crocus_create_sampler_view(struct pipe_context *ctx, tex = util_format_has_depth(desc) ? 
&zres->base.b : &sres->base.b; if (tex->format == PIPE_FORMAT_S8_UINT) - if (devinfo->ver == 7 && sres->shadow) + if (GFX_VER == 7 && sres->shadow) tex = &sres->shadow->base.b; } @@ -2715,7 +2712,7 @@ crocus_create_sampler_view(struct pipe_context *ctx, crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz); /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */ - if (devinfo->ver < 6 && + if (GFX_VER < 6 && (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT || tmpl->format == PIPE_FORMAT_X24S8_UINT)) { isv->swizzle[0] = tmpl->swizzle_g; @@ -3145,6 +3142,7 @@ crocus_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type p_stage, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct crocus_context *ice = (struct crocus_context *) ctx; @@ -3155,8 +3153,16 @@ crocus_set_sampler_views(struct pipe_context *ctx, for (unsigned i = 0; i < count; i++) { struct pipe_sampler_view *pview = views ? views[i] : NULL; - pipe_sampler_view_reference((struct pipe_sampler_view **) - &shs->textures[start + i], pview); + + if (take_ownership) { + pipe_sampler_view_reference((struct pipe_sampler_view **) + &shs->textures[start + i], NULL); + shs->textures[start + i] = (struct crocus_sampler_view *)pview; + } else { + pipe_sampler_view_reference((struct pipe_sampler_view **) + &shs->textures[start + i], pview); + } + struct crocus_sampler_view *view = (void *) pview; if (view) { view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW; @@ -3194,6 +3200,14 @@ crocus_set_tess_state(struct pipe_context *ctx, shs->sysvals_need_upload = true; } +static void +crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) +{ + struct crocus_context *ice = (struct crocus_context *) ctx; + + ice->state.patch_vertices = patch_vertices; +} + static void crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf) { @@ -4776,7 +4790,7 @@ crocus_populate_fs_key(const struct crocus_context *ice, uint32_t line_aa = BRW_WM_AA_NEVER; if (rast->cso.line_smooth) { - int reduced_prim = u_reduced_prim(ice->state.prim_mode); + int reduced_prim = ice->state.reduced_prim_mode; if (reduced_prim == PIPE_PRIM_LINES) line_aa = BRW_WM_AA_ALWAYS; else if (reduced_prim == PIPE_PRIM_TRIANGLES) { @@ -4941,7 +4955,7 @@ emit_surface_state(struct crocus_batch *batch, struct crocus_resource *res, const struct isl_surf *in_surf, bool adjust_surf, - struct isl_view *view, + struct isl_view *in_view, bool writeable, enum isl_aux_usage aux_usage, bool blend_enable, @@ -4949,7 +4963,6 @@ emit_surface_state(struct crocus_batch *batch, uint32_t *surf_state, uint32_t addr_offset) { - const struct intel_device_info *devinfo = &batch->screen->devinfo; struct isl_device *isl_dev = &batch->screen->isl_dev; uint32_t reloc = RELOC_32BIT; uint64_t offset_B = res->offset; @@ -4959,23 +4972,24 @@ emit_surface_state(struct crocus_batch *batch, reloc |= RELOC_WRITE; struct isl_surf surf = *in_surf; + struct isl_view view = *in_view; if (adjust_surf) { - if (res->base.b.target == PIPE_TEXTURE_3D && view->array_len == 1) { + if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) { isl_surf_get_image_surf(isl_dev, in_surf, - view->base_level, 0, - view->base_array_layer, + view.base_level, 0, + view.base_array_layer, &surf, &offset_B, &tile_x_sa, &tile_y_sa); - view->base_array_layer = 0; - view->base_level = 0; - } else if (res->base.b.target == PIPE_TEXTURE_CUBE && devinfo->ver == 4) { + view.base_array_layer = 0; + view.base_level = 0; + } 
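/*
 * Note the shift from mutating the caller's isl_view to editing a stack
 * copy: when emit_surface_state() carves out a single-image surface it
 * rebases base_level/base_array_layer to 0, and doing that through the old
 * pointer parameter leaked the adjustment back into state the caller still
 * owns. The fix reduces to this shape (fill_state() is a stand-in, not the
 * real call chain):
 *
 *   static void emit(const struct isl_view *in_view)
 *   {
 *      struct isl_view view = *in_view;   // private, mutable copy
 *      view.base_level = 0;               // caller's view stays intact
 *      view.base_array_layer = 0;
 *      fill_state(&view);
 *   }
 */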
else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) { isl_surf_get_image_surf(isl_dev, in_surf, - view->base_level, view->base_array_layer, + view.base_level, view.base_array_layer, 0, &surf, &offset_B, &tile_x_sa, &tile_y_sa); - view->base_array_layer = 0; - view->base_level = 0; + view.base_array_layer = 0; + view.base_level = 0; } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY) surf.dim = ISL_SURF_DIM_2D; } @@ -4994,7 +5008,7 @@ emit_surface_state(struct crocus_batch *batch, isl_surf_fill_state(isl_dev, surf_state, .surf = &surf, - .view = view, + .view = &view, .address = crocus_state_reloc(batch, addr_offset + isl_dev->ss.addr_offset, res->bo, offset_B, reloc), @@ -5022,7 +5036,7 @@ emit_surface_state(struct crocus_batch *batch, * * FIXME: move to the point of assignment. */ - if (devinfo->ver == 8) { + if (GFX_VER == 8) { uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4)); *aux_addr = crocus_state_reloc(batch, addr_offset + isl_dev->ss.aux_addr_offset, @@ -5046,7 +5060,6 @@ emit_surface(struct crocus_batch *batch, bool blend_enable, uint32_t write_disables) { - const struct intel_device_info *devinfo = &batch->screen->devinfo; struct isl_device *isl_dev = &batch->screen->isl_dev; struct crocus_resource *res = (struct crocus_resource *)surf->base.texture; struct isl_view *view = &surf->view; @@ -5054,7 +5067,7 @@ emit_surface(struct crocus_batch *batch, enum pipe_texture_target target = res->base.b.target; bool adjust_surf = false; - if (devinfo->ver == 4 && target == PIPE_TEXTURE_CUBE) + if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE) adjust_surf = true; if (surf->align_res) @@ -5669,9 +5682,6 @@ setup_constant_buffers(struct crocus_context *ice, static void gen7_emit_vs_workaround_flush(struct crocus_batch *batch) { - ASSERTED const struct intel_device_info *devinfo = &batch->screen->devinfo; - - assert(devinfo->ver == 7); crocus_emit_pipe_control_write(batch, "vs workaround", PIPE_CONTROL_WRITE_IMMEDIATE @@ -5860,8 +5870,10 @@ crocus_upload_dirty_render_state(struct crocus_context *ice, bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size, brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size, ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size); - if (ret) - dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS; + if (ret) { + dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP; + stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS; + } } #endif if (dirty & CROCUS_DIRTY_CC_VIEWPORT) { @@ -6788,6 +6800,22 @@ crocus_upload_dirty_render_state(struct crocus_context *ice, emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos); #endif +#if GFX_VERx10 == 70 + /** + * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > + * Geometry > Geometry Shader > State: + * + * "Note: Because of corruption in IVB:GT2, software needs to flush the + * whole fixed function pipeline when the GS enable changes value in + * the 3DSTATE_GS." + * + * The hardware architects have clarified that in this context "flush the + * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS + * Stall" bit set. 
+ */ + if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active) + gen7_emit_cs_stall_flush(batch); +#endif #if GFX_VER >= 6 crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs) #else @@ -6933,6 +6961,7 @@ crocus_upload_dirty_render_state(struct crocus_context *ice, gs.MaximumVPIndex = ice->state.num_viewports - 1; #endif } + ice->state.gs_enabled = active; } #if GFX_VER >= 7 @@ -7019,11 +7048,15 @@ crocus_upload_dirty_render_state(struct crocus_context *ice, sf.DestinationOriginHorizontalBias = 0.5; sf.DestinationOriginVerticalBias = 0.5; + sf.LineEndCapAntialiasingRegionWidth = + cso_state->line_smooth ? _10pixels : _05pixels; sf.LastPixelEnable = cso_state->line_last_pixel; + sf.AntialiasingEnable = cso_state->line_smooth; + sf.LineWidth = get_line_width(cso_state); sf.PointWidth = cso_state->point_size; sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State; -#if GFX_VERx10 == 45 || GFX_VER >= 5 +#if GFX_VERx10 >= 45 sf.AALineDistanceMode = AALINEDISTANCE_TRUE; #endif sf.ViewportTransformEnable = true; @@ -7489,7 +7522,7 @@ crocus_upload_dirty_render_state(struct crocus_context *ice, if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) { crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { topo.PrimitiveTopologyType = - translate_prim_type(draw->mode, draw->vertices_per_patch); + translate_prim_type(draw->mode, ice->state.patch_vertices); } } #endif @@ -7956,7 +7989,7 @@ crocus_upload_render_state(struct crocus_context *ice, prim.PredicateEnable = use_predicate; #endif - prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, draw->vertices_per_patch); + prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices); if (indirect) { // XXX Probably have to do something for gen6 here? #if GFX_VER >= 7 @@ -8050,12 +8083,12 @@ crocus_upload_compute_state(struct crocus_context *ice, } vfe.MaximumNumberofThreads = - devinfo->max_cs_threads * screen->subslice_total - 1; + devinfo->max_cs_threads * devinfo->subslice_total - 1; vfe.ResetGatewayTimer = Resettingrelativetimerandlatchingtheglobaltimestamp; vfe.BypassGatewayControl = true; #if GFX_VER == 7 - vfe.GPGPUMode = 1; + vfe.GPGPUMode = true; #endif #if GFX_VER == 8 vfe.BypassGatewayControl = true; @@ -8770,7 +8803,7 @@ crocus_emit_raw_pipe_control(struct crocus_batch *batch, /* Emit --------------------------------------------------------------- */ - if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) { + if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { fprintf(stderr, " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n", (flags & PIPE_CONTROL_FLUSH_ENABLE) ? 
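/*
 * The IVB GT2 flush above is edge-triggered: it is only required when the
 * GS enable bit actually changes, so the driver caches the last value
 * programmed and compares before emitting 3DSTATE_GS. Sketch of the
 * pattern as this patch wires it up (the helper names other than
 * gen7_emit_cs_stall_flush() are illustrative):
 *
 *   bool active = gs_is_bound(ice);                 // hypothetical query
 *   if (GFX_VERx10 == 70 && devinfo->gt == 2 &&
 *       ice->state.gs_enabled != active)            // cached previous value
 *      gen7_emit_cs_stall_flush(batch);
 *   emit_3dstate_gs(batch, active);                 // hypothetical emit
 *   ice->state.gs_enabled = active;                 // refresh the cache
 */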
"PipeCon " : "", @@ -9122,6 +9155,7 @@ void genX(crocus_init_screen_state)(struct crocus_screen *screen) { assert(screen->devinfo.verx10 == GFX_VERx10); + assert(screen->devinfo.ver == GFX_VER); screen->vtbl.destroy_state = crocus_destroy_state; screen->vtbl.init_render_context = crocus_init_render_context; screen->vtbl.upload_render_state = crocus_upload_render_state; @@ -9205,6 +9239,7 @@ genX(crocus_init_state)(struct crocus_context *ice) ctx->set_shader_images = crocus_set_shader_images; ctx->set_sampler_views = crocus_set_sampler_views; ctx->set_tess_state = crocus_set_tess_state; + ctx->set_patch_vertices = crocus_set_patch_vertices; ctx->set_framebuffer_state = crocus_set_framebuffer_state; ctx->set_polygon_stipple = crocus_set_polygon_stipple; ctx->set_sample_mask = crocus_set_sample_mask; @@ -9233,6 +9268,7 @@ genX(crocus_init_state)(struct crocus_context *ice) ice->state.sample_mask = 0xff; ice->state.num_viewports = 1; ice->state.prim_mode = PIPE_PRIM_MAX; + ice->state.reduced_prim_mode = PIPE_PRIM_MAX; ice->state.genx = calloc(1, sizeof(struct crocus_genx_state)); ice->draw.derived_params.drawid = -1; diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_blit.cpp b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_blit.cpp index d130476ac5..ed44263c8f 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_blit.cpp +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_blit.cpp @@ -469,7 +469,7 @@ create_staging_resource(struct d3d12_context *ctx, unsigned mask) { - struct pipe_resource templ = {{0}}; + struct pipe_resource templ = {}; struct pipe_resource *staging_res; struct pipe_box copy_src; @@ -780,7 +780,7 @@ resolve_stencil_to_temp(struct d3d12_context *ctx, void *sampler_state = get_sampler_state(ctx); util_blit_save_state(ctx); - pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 1, 0, &src_view); + pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 1, 0, false, &src_view); pctx->bind_sampler_states(pctx, PIPE_SHADER_FRAGMENT, 0, 1, &sampler_state); util_blitter_custom_shader(ctx->blitter, dst_surf, get_stencil_resolve_vs(ctx), diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_compiler.cpp b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_compiler.cpp index 5e24fe96a1..b2d4d9a9f5 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_compiler.cpp +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_compiler.cpp @@ -160,7 +160,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel, NIR_PASS_V(nir, nir_lower_packed_ubo_loads); NIR_PASS_V(nir, d3d12_lower_load_first_vertex); NIR_PASS_V(nir, d3d12_lower_state_vars, shader); - NIR_PASS_V(nir, d3d12_lower_bool_input); + NIR_PASS_V(nir, dxil_nir_lower_bool_input); struct nir_to_dxil_options opts = {}; opts.interpolate_at_vertex = screen->have_load_at_vertex; @@ -400,7 +400,7 @@ get_provoking_vertex(struct d3d12_selection_context *sel_ctx, bool *alternate) mode = (enum pipe_prim_type)last_vertex_stage->current->nir->info.gs.output_primitive; break; case PIPE_SHADER_VERTEX: - mode = sel_ctx->dinfo ? sel_ctx->dinfo->mode : PIPE_PRIM_TRIANGLES; + mode = sel_ctx->dinfo ? 
(enum pipe_prim_type)sel_ctx->dinfo->mode : PIPE_PRIM_TRIANGLES; break; default: unreachable("Tesselation shaders are not supported"); diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_context.cpp b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_context.cpp index 6167f9be28..73fa7b125c 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_context.cpp +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_context.cpp @@ -947,6 +947,7 @@ d3d12_set_sampler_views(struct pipe_context *pctx, unsigned start_slot, unsigned num_views, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct d3d12_context *ctx = d3d12_context(pctx); @@ -962,7 +963,12 @@ d3d12_set_sampler_views(struct pipe_context *pctx, if (new_view) d3d12_increment_sampler_view_bind_count(pctx, shader_type, new_view); - pipe_sampler_view_reference(&old_view, views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&old_view, NULL); + old_view = views[i]; + } else { + pipe_sampler_view_reference(&old_view, views[i]); + } if (views[i]) { dxil_wrap_sampler_state &wss = ctx->tex_wrap_states[shader_type][start_slot + i]; diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_draw.cpp b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_draw.cpp index 1a2f0b98c4..0dcb0c51dd 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_draw.cpp +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_draw.cpp @@ -461,16 +461,16 @@ d3d12_draw_vbo(struct pipe_context *pctx, unsigned index_offset = 0; enum d3d12_surface_conversion_mode conversion_modes[PIPE_MAX_COLOR_BUFS] = {}; - if (!prim_supported(dinfo->mode) || + if (!prim_supported((enum pipe_prim_type)dinfo->mode) || dinfo->index_size == 1 || (dinfo->primitive_restart && dinfo->restart_index != 0xffff && dinfo->restart_index != 0xffffffff)) { if (!dinfo->primitive_restart && - !u_trim_pipe_prim(dinfo->mode, (unsigned *)&draws[0].count)) + !u_trim_pipe_prim((enum pipe_prim_type)dinfo->mode, (unsigned *)&draws[0].count)) return; - ctx->initial_api_prim = dinfo->mode; + ctx->initial_api_prim = (enum pipe_prim_type)dinfo->mode; util_primconvert_save_rasterizer_state(ctx->primconvert, &ctx->gfx_pipeline_state.rast->base); util_primconvert_draw_vbo(ctx->primconvert, dinfo, drawid_offset, indirect, draws, num_draws); return; @@ -497,13 +497,13 @@ d3d12_draw_vbo(struct pipe_context *pctx, D3D12_SHADER_DIRTY_SAMPLERS; /* this should *really* be fixed at a higher level than here! 
*/ - enum pipe_prim_type reduced_prim = u_reduced_prim(dinfo->mode); + enum pipe_prim_type reduced_prim = u_reduced_prim((enum pipe_prim_type)dinfo->mode); if (reduced_prim == PIPE_PRIM_TRIANGLES && ctx->gfx_pipeline_state.rast->base.cull_face == PIPE_FACE_FRONT_AND_BACK) return; if (ctx->gfx_pipeline_state.prim_type != dinfo->mode) { - ctx->gfx_pipeline_state.prim_type = dinfo->mode; + ctx->gfx_pipeline_state.prim_type = (enum pipe_prim_type)dinfo->mode; ctx->state_dirty |= D3D12_DIRTY_PRIM_MODE; } @@ -640,7 +640,7 @@ d3d12_draw_vbo(struct pipe_context *pctx, ctx->cmdlist->OMSetStencilRef(ctx->stencil_ref.ref_value[0]); if (ctx->cmdlist_dirty & D3D12_DIRTY_PRIM_MODE) - ctx->cmdlist->IASetPrimitiveTopology(topology(dinfo->mode)); + ctx->cmdlist->IASetPrimitiveTopology(topology((enum pipe_prim_type)dinfo->mode)); for (unsigned i = 0; i < ctx->num_vbs; ++i) { if (ctx->vbs[i].buffer.resource) { diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_lower_int_cubemap_to_array.c b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_lower_int_cubemap_to_array.c index d71689f15c..62cc47fe5b 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_lower_int_cubemap_to_array.c +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_lower_int_cubemap_to_array.c @@ -131,7 +131,7 @@ create_array_tex_from_cube_tex(nir_builder *b, nir_tex_instr *tex, nir_ssa_def * nir_src *psrc = (tex->src[i].src_type == nir_tex_src_coord) ? &coord_src : &tex->src[i].src; - nir_src_copy(&array_tex->src[i].src, psrc, array_tex); + nir_src_copy(&array_tex->src[i].src, psrc); array_tex->src[i].src_type = tex->src[i].src_type; } diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.c b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.c index 0b3bbbc43d..7ed43cbc7b 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.c +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.c @@ -542,54 +542,6 @@ d3d12_lower_state_vars(nir_shader *nir, struct d3d12_shader *shader) return progress; } -static bool -lower_bool_input_filter(const nir_instr *instr, - UNUSED const void *_options) -{ - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic == nir_intrinsic_load_front_face) - return true; - - if (intr->intrinsic == nir_intrinsic_load_deref) { - nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); - nir_variable *var = nir_deref_instr_get_variable(deref); - return var->data.mode == nir_var_shader_in && - glsl_get_base_type(var->type) == GLSL_TYPE_BOOL; - } - - return false; -} - -static nir_ssa_def * -lower_bool_input_impl(nir_builder *b, nir_instr *instr, - UNUSED void *_options) -{ - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - if (intr->intrinsic == nir_intrinsic_load_deref) { - nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); - nir_variable *var = nir_deref_instr_get_variable(deref); - - /* rewrite var->type */ - var->type = glsl_vector_type(GLSL_TYPE_UINT, - glsl_get_vector_elements(var->type)); - deref->type = var->type; - } - - intr->dest.ssa.bit_size = 32; - return nir_i2b1(b, &intr->dest.ssa); -} - -bool -d3d12_lower_bool_input(struct nir_shader *s) -{ - return nir_shader_lower_instructions(s, lower_bool_input_filter, - lower_bool_input_impl, NULL); -} - void d3d12_add_missing_dual_src_target(struct nir_shader *s, unsigned missing_mask) diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.h b/mesa 
3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.h index 38d36206ca..4461939013 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.h +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_nir_passes.h @@ -54,9 +54,6 @@ d3d12_lower_depth_range(nir_shader *nir); bool d3d12_lower_load_first_vertex(nir_shader *nir); -bool -d3d12_lower_bool_input(struct nir_shader *s); - void d3d12_lower_uint_cast(nir_shader *nir, bool is_signed); diff --git a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_surface.cpp b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_surface.cpp index 58516f6a34..610c87631f 100644 --- a/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_surface.cpp +++ b/mesa 3D driver/src/gallium/drivers/d3d12/d3d12_surface.cpp @@ -320,7 +320,7 @@ d3d12_surface_update_pre_draw(struct d3d12_surface *surface, if (mode == D3D12_SURFACE_CONVERSION_BGRA_UINT) { if (!surface->rgba_texture) { - struct pipe_resource templ = {{0}}; + struct pipe_resource templ = {}; struct pipe_resource *src = surface->base.texture; templ.format = PIPE_FORMAT_R8G8B8A8_UNORM; diff --git a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c index b1ca2e0ddb..00ebc250bd 100644 --- a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c +++ b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c @@ -687,7 +687,7 @@ insert_vec_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader) unsigned write_mask = (1u << start_idx); nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov); - nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov); + nir_alu_src_copy(&mov->src[0], &vec->src[start_idx]); mov->src[0].swizzle[0] = vec->src[start_idx].swizzle[0]; mov->src[0].negate = vec->src[start_idx].negate; @@ -944,7 +944,7 @@ emit_shader(struct etna_compile *c, unsigned *num_temps, unsigned *num_consts) c->const_count = indirect_max; } - /* add mov for any store output using sysval/const */ + /* add mov for any store output using sysval/const and for depth stores from intrinsics */ nir_foreach_block(block, c->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) @@ -954,8 +954,13 @@ emit_shader(struct etna_compile *c, unsigned *num_temps, unsigned *num_consts) switch (intr->intrinsic) { case nir_intrinsic_store_deref: { + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); nir_src *src = &intr->src[1]; - if (nir_src_is_const(*src) || is_sysval(src->ssa->parent_instr)) { + if (nir_src_is_const(*src) || is_sysval(src->ssa->parent_instr) || + (shader->info.stage == MESA_SHADER_FRAGMENT && + deref->var->data.location == FRAG_RESULT_DEPTH && + src->is_ssa && + src->ssa->parent_instr->type != nir_instr_type_alu)) { b.cursor = nir_before_instr(instr); nir_instr_rewrite_src(instr, src, nir_src_for_ssa(nir_mov(&b, src->ssa))); } diff --git a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.c b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.c index 581edc78d2..f2ab4ecfe1 100644 --- a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.c +++ b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.c @@ -142,9 +142,6 @@ etna_context_destroy(struct pipe_context *pctx) util_copy_framebuffer_state(&ctx->framebuffer_s, NULL); - if (ctx->primconvert) - util_primconvert_destroy(ctx->primconvert); - if (ctx->blitter) util_blitter_destroy(ctx->blitter); @@ -255,13 +252,6 @@ etna_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info 
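/*
 * Theme of the etnaviv hunks below: the driver-private primconvert
 * fallback is deleted, and the screen instead reports which primitives the
 * hardware really draws via PIPE_CAP_SUPPORTED_PRIM_MODES (and the
 * _WITH_RESTART variant), leaving translation of everything else to shared
 * gallium code. The cap is just a bitmask, in the style the patch adds
 * further down:
 *
 *   uint32_t modes = 1 << PIPE_PRIM_POINTS |
 *                    1 << PIPE_PRIM_LINES |
 *                    1 << PIPE_PRIM_TRIANGLES;
 *   if (VIV_FEATURE(screen, chipMinorFeatures2, LINE_LOOP))
 *      modes |= 1 << PIPE_PRIM_LINE_LOOP;
 *   return modes;
 */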
*info, if (ctx->vertex_elements == NULL || ctx->vertex_elements->num_elements == 0) return; /* Nothing to do */ - if (!(ctx->prim_hwsupport & (1 << info->mode))) { - struct primconvert_context *primconvert = ctx->primconvert; - util_primconvert_save_rasterizer_state(primconvert, ctx->rasterizer); - util_primconvert_draw_vbo(primconvert, info, drawid_offset, indirect, draws, num_draws); - return; - } - int prims = u_decomposed_prims_for_vertices(info->mode, draws[0].count); if (unlikely(prims <= 0)) { DBG("Invalid draw primitive mode=%i or no primitives to be drawn", info->mode); @@ -649,27 +639,6 @@ etna_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!ctx->blitter) goto fail; - /* Generate the bitmask of supported draw primitives. */ - ctx->prim_hwsupport = 1 << PIPE_PRIM_POINTS | - 1 << PIPE_PRIM_LINES | - 1 << PIPE_PRIM_LINE_STRIP | - 1 << PIPE_PRIM_TRIANGLES | - 1 << PIPE_PRIM_TRIANGLE_FAN; - - /* TODO: The bug relates only to indexed draws, but here we signal - * that there is no support for triangle strips at all. This should - * be refined. - */ - if (VIV_FEATURE(ctx->screen, chipMinorFeatures2, BUG_FIXES8)) - ctx->prim_hwsupport |= 1 << PIPE_PRIM_TRIANGLE_STRIP; - - if (VIV_FEATURE(ctx->screen, chipMinorFeatures2, LINE_LOOP)) - ctx->prim_hwsupport |= 1 << PIPE_PRIM_LINE_LOOP; - - ctx->primconvert = util_primconvert_create(pctx, ctx->prim_hwsupport); - if (!ctx->primconvert) - goto fail; - slab_create_child(&ctx->transfer_pool, &screen->transfer_pool); list_inithead(&ctx->active_acc_queries); diff --git a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.h b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.h index 21e4d3f33c..1da6a2127f 100644 --- a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.h +++ b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_context.h @@ -32,7 +32,6 @@ #include "etnaviv_resource.h" #include "etnaviv_tiling.h" -#include "indices/u_primconvert.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_format.h" @@ -143,9 +142,6 @@ struct etna_context { ETNA_DIRTY_SCISSOR_CLIP = (1 << 20), } dirty; - uint32_t prim_hwsupport; - struct primconvert_context *primconvert; - struct slab_child_pool transfer_pool; struct blitter_context *blitter; diff --git a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_screen.c b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_screen.c index 45fede5ca0..506e96c6eb 100644 --- a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_screen.c +++ b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_screen.c @@ -49,7 +49,6 @@ #include "drm-uapi/drm_fourcc.h" -#define ETNA_DRM_VERSION(major, minor) ((major) << 16 | (minor)) #define ETNA_DRM_VERSION_FENCE_FD ETNA_DRM_VERSION(1, 1) #define ETNA_DRM_VERSION_PERFMON ETNA_DRM_VERSION(1, 2) @@ -260,6 +259,28 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VARYINGS: return screen->specs.max_varyings; + case PIPE_CAP_SUPPORTED_PRIM_MODES: + case PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART: { + /* Generate the bitmask of supported draw primitives. */ + uint32_t modes = 1 << PIPE_PRIM_POINTS | + 1 << PIPE_PRIM_LINES | + 1 << PIPE_PRIM_LINE_STRIP | + 1 << PIPE_PRIM_TRIANGLES | + 1 << PIPE_PRIM_TRIANGLE_FAN; + + /* TODO: The bug relates only to indexed draws, but here we signal + * that there is no support for triangle strips at all. This should + * be refined. 
+ */ + if (VIV_FEATURE(screen, chipMinorFeatures2, BUG_FIXES8)) + modes |= 1 << PIPE_PRIM_TRIANGLE_STRIP; + + if (VIV_FEATURE(screen, chipMinorFeatures2, LINE_LOOP)) + modes |= 1 << PIPE_PRIM_LINE_LOOP; + + return modes; + } + case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: @@ -947,7 +968,6 @@ etna_screen_create(struct etna_device *dev, struct etna_gpu *gpu, { struct etna_screen *screen = CALLOC_STRUCT(etna_screen); struct pipe_screen *pscreen; - drmVersionPtr version; uint64_t val; if (!screen) @@ -959,16 +979,7 @@ etna_screen_create(struct etna_device *dev, struct etna_gpu *gpu, screen->ro = ro; screen->refcnt = 1; - if (!screen->ro) { - DBG("could not create renderonly object"); - goto fail; - } - - version = drmGetVersion(screen->ro->gpu_fd); - screen->drm_version = ETNA_DRM_VERSION(version->version_major, - version->version_minor); - drmFreeVersion(version); - + screen->drm_version = etnaviv_device_version(screen->dev); etna_mesa_debug = debug_get_option_etna_mesa_debug(); /* Disable autodisable for correct rendering with TS */ diff --git a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture.c b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture.c index 6164ff9959..3a646757b2 100644 --- a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture.c +++ b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture.c @@ -240,7 +240,7 @@ etna_texture_handle_incompatible(struct pipe_context *pctx, struct pipe_resource static void set_sampler_views(struct etna_context *ctx, unsigned start, unsigned end, - unsigned nr, struct pipe_sampler_view **views) + unsigned nr, bool take_ownership, struct pipe_sampler_view **views) { unsigned i, j; uint32_t mask = 1 << start; @@ -249,7 +249,12 @@ set_sampler_views(struct etna_context *ctx, unsigned start, unsigned end, for (i = start, j = 0; j < nr; i++, j++, mask <<= 1) { struct pipe_sampler_view *view = views ? 
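/*
 * take_ownership is the thread running through every set_sampler_views()
 * change in this patch: when true, the caller's reference is transferred
 * to the driver instead of a fresh one being taken, saving an atomic
 * ref/unref pair per view on hot paths. The recurring idiom, with "slot"
 * standing for the bound-view array entry being updated:
 *
 *   if (take_ownership) {
 *      pipe_sampler_view_reference(&slot, NULL);   // drop the old view
 *      slot = view;                                // steal caller's ref
 *   } else {
 *      pipe_sampler_view_reference(&slot, view);   // ref new, unref old
 *   }
 */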
views[j] : NULL; - pipe_sampler_view_reference(&ctx->sampler_view[i], view); + if (take_ownership) { + pipe_sampler_view_reference(&ctx->sampler_view[i], NULL); + ctx->sampler_view[i] = view; + } else { + pipe_sampler_view_reference(&ctx->sampler_view[i], view); + } if (view) { ctx->active_sampler_views |= mask; ctx->dirty_sampler_views |= mask; @@ -268,32 +273,35 @@ set_sampler_views(struct etna_context *ctx, unsigned start, unsigned end, static inline void etna_fragtex_set_sampler_views(struct etna_context *ctx, unsigned nr, + bool take_ownership, struct pipe_sampler_view **views) { struct etna_screen *screen = ctx->screen; unsigned start = 0; unsigned end = start + screen->specs.fragment_sampler_count; - set_sampler_views(ctx, start, end, nr, views); + set_sampler_views(ctx, start, end, nr, take_ownership, views); ctx->num_fragment_sampler_views = nr; } static inline void etna_vertex_set_sampler_views(struct etna_context *ctx, unsigned nr, + bool take_ownership, struct pipe_sampler_view **views) { struct etna_screen *screen = ctx->screen; unsigned start = screen->specs.vertex_sampler_offset; unsigned end = start + screen->specs.vertex_sampler_count; - set_sampler_views(ctx, start, end, nr, views); + set_sampler_views(ctx, start, end, nr, take_ownership, views); } static void etna_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start_slot, unsigned num_views, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct etna_context *ctx = etna_context(pctx); @@ -303,10 +311,10 @@ etna_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, switch (shader) { case PIPE_SHADER_FRAGMENT: - etna_fragtex_set_sampler_views(ctx, num_views, views); + etna_fragtex_set_sampler_views(ctx, num_views, take_ownership, views); break; case PIPE_SHADER_VERTEX: - etna_vertex_set_sampler_views(ctx, num_views, views); + etna_vertex_set_sampler_views(ctx, num_views, take_ownership, views); break; default:; } diff --git a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture_state.c b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture_state.c index e9e930033f..73abc281da 100644 --- a/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture_state.c +++ b/mesa 3D driver/src/gallium/drivers/etnaviv/etnaviv_texture_state.c @@ -430,7 +430,7 @@ etna_emit_new_texture_state(struct etna_context *ctx) } } } - if (unlikely(dirty & (ETNA_DIRTY_SAMPLER_VIEWS))) { + if (unlikely(dirty & (ETNA_DIRTY_SAMPLERS))) { for (int x = 0; x < VIVS_NTE_SAMPLER__LEN; ++x) { if ((1 << x) & active_samplers) { struct etna_sampler_state *ss = etna_sampler_state(ctx->sampler[x]); diff --git a/mesa 3D driver/src/gallium/drivers/etnaviv/meson.build b/mesa 3D driver/src/gallium/drivers/etnaviv/meson.build index f8dcba887f..a1b58701b3 100644 --- a/mesa 3D driver/src/gallium/drivers/etnaviv/meson.build +++ b/mesa 3D driver/src/gallium/drivers/etnaviv/meson.build @@ -145,5 +145,6 @@ if with_tests dependencies : [idep_gtest, idep_nir], ), suite : ['compiler', 'etnaviv'], + protocol : gtest_test_protocol, ) endif diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_context.c index 2c9118f504..ee1711c673 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_context.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_context.c @@ -70,27 +70,6 @@ create_solid_vertexbuf(struct pipe_context *pctx) return prsc; } -/* clang-format off */ -static 
const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE, - [PIPE_PRIM_LINES] = DI_PT_LINELIST, - [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, - [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, -}; - -static const uint8_t a20x_primtypes[PIPE_PRIM_MAX] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE, - [PIPE_PRIM_LINES] = DI_PT_LINELIST, - [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, - [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, -}; -/* clang-format on */ - struct pipe_context * fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) { @@ -104,6 +83,7 @@ fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) pctx = &fd2_ctx->base.base; pctx->screen = pscreen; + fd2_ctx->base.flags = flags; fd2_ctx->base.dev = fd_device_ref(screen->dev); fd2_ctx->base.screen = fd_screen(pscreen); @@ -118,9 +98,7 @@ fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) fd2_prog_init(pctx); fd2_emit_init(pctx); - pctx = fd_context_init( - &fd2_ctx->base, pscreen, - (screen->gpu_id >= 220) ? a22x_primtypes : a20x_primtypes, priv, flags); + pctx = fd_context_init(&fd2_ctx->base, pscreen, priv, flags); if (!pctx) return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 2a53632bbb..797daf37ed 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -136,8 +136,8 @@ draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, if (binning || info->mode == PIPE_PRIM_POINTS) vismode = IGNORE_VISIBILITY; - fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode], vismode, info, - draw, index_offset); + fd_draw_emit(ctx->batch, ring, ctx->screen->primtypes[info->mode], + vismode, info, draw, index_offset); if (is_a20x(ctx->screen)) { /* not sure why this is required, but it fixes some hangs */ diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_screen.c b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_screen.c index 1ed14da2ff..c55aa6b9c7 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_screen.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_screen.c @@ -93,6 +93,27 @@ fd2_screen_is_format_supported(struct pipe_screen *pscreen, return retval == usage; } +/* clang-format off */ +static const uint8_t a22x_primtypes[PIPE_PRIM_MAX] = { + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE, + [PIPE_PRIM_LINES] = DI_PT_LINELIST, + [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, + [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, +}; + +static const uint8_t a20x_primtypes[PIPE_PRIM_MAX] = { + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST_PSIZE, + [PIPE_PRIM_LINES] = DI_PT_LINELIST, + [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, + [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, +}; +/* clang-format on */ + void fd2_screen_init(struct pipe_screen *pscreen) { @@ -107,4 +128,10 @@ fd2_screen_init(struct pipe_screen *pscreen) screen->tile_mode = fd2_tile_mode; fd2_emit_init_screen(pscreen); + + if (screen->gpu_id >= 220) 
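/*
 * With the primtype tables now owned by the screen, the translation is set
 * up once per device and every context shares it. Both ends of the
 * pattern, as this patch wires fd2 up:
 *
 *   // screen init: pick the table once, by GPU variant
 *   screen->primtypes = (screen->gpu_id >= 220) ? a22x_primtypes
 *                                               : a20x_primtypes;
 *
 *   // draw path: one indexed load instead of per-context state
 *   enum pc_di_primtype pt = ctx->screen->primtypes[info->mode];
 */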
{ + screen->primtypes = a22x_primtypes; + } else { + screen->primtypes = a20x_primtypes; + } } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_texture.c b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_texture.c index 3048011b03..50b72e52cc 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_texture.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/fd2_texture.c @@ -204,6 +204,7 @@ static void fd2_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) in_dt { if (shader == PIPE_SHADER_FRAGMENT) { @@ -218,7 +219,7 @@ fd2_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, } fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, - views); + take_ownership, views); } /* map gallium sampler-id to hw const-idx.. adreno uses a flat address diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/ir2_nir.c index 7433127530..45fe0e2648 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/ir2_nir.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ -692,6 +692,7 @@ emit_tex(struct ir2_context *ctx, nir_tex_instr *tex) switch (tex->sampler_dim) { case GLSL_SAMPLER_DIM_2D: + case GLSL_SAMPLER_DIM_EXTERNAL: break; case GLSL_SAMPLER_DIM_RECT: is_rect = true; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_context.c index d85ed79292..84c802660e 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_context.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_context.c @@ -58,19 +58,6 @@ fd3_context_destroy(struct pipe_context *pctx) in_dt free(fd3_ctx); } -/* clang-format off */ -static const uint8_t primtypes[] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, - [PIPE_PRIM_LINES] = DI_PT_LINELIST, - [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, - [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, - [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ -}; -/* clang-format on */ - struct pipe_context * fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) in_dt @@ -85,6 +72,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, pctx = &fd3_ctx->base.base; pctx->screen = pscreen; + fd3_ctx->base.flags = flags; fd3_ctx->base.dev = fd_device_ref(screen->dev); fd3_ctx->base.screen = fd_screen(pscreen); fd3_ctx->base.last.key = &fd3_ctx->last_key; @@ -100,7 +88,7 @@ fd3_context_create(struct pipe_screen *pscreen, void *priv, fd3_prog_init(pctx); fd3_emit_init(pctx); - pctx = fd_context_init(&fd3_ctx->base, pscreen, primtypes, priv, flags); + pctx = fd_context_init(&fd3_ctx->base, pscreen, priv, flags); if (!pctx) return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index 786c954802..db1f1f5be1 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -56,7 +56,7 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd3_emit *emit, unsigned index_offset) assert_dt { const struct pipe_draw_info *info = emit->info; - enum pc_di_primtype 
primtype = ctx->primtypes[info->mode]; + enum pc_di_primtype primtype = ctx->screen->primtypes[info->mode]; fd3_emit_state(ctx, ring, emit); @@ -105,7 +105,7 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, .debug = &ctx->debug, .vtx = &ctx->vtx, .info = info, - .drawid_offset = drawid_offset, + .drawid_offset = drawid_offset, .indirect = indirect, .draw = draw, .key = { diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_screen.c index f0095bcd85..f9fe811fcd 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_screen.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a3xx/fd3_screen.c @@ -94,6 +94,19 @@ fd3_screen_is_format_supported(struct pipe_screen *pscreen, return retval == usage; } +/* clang-format off */ +static const uint8_t primtypes[] = { + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = DI_PT_LINELIST, + [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, + [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, + [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ +}; +/* clang-format on */ + void fd3_screen_init(struct pipe_screen *pscreen) { @@ -107,4 +120,6 @@ fd3_screen_init(struct pipe_screen *pscreen) screen->setup_slices = fd3_setup_slices; if (FD_DBG(TTILE)) screen->tile_mode = fd3_tile_mode; + + screen->primtypes = primtypes; } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_context.c index 6074fbc230..58b6f2ddac 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_context.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_context.c @@ -58,19 +58,6 @@ fd4_context_destroy(struct pipe_context *pctx) in_dt free(fd4_ctx); } -/* clang-format off */ -static const uint8_t primtypes[] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, - [PIPE_PRIM_LINES] = DI_PT_LINELIST, - [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, - [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, - [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ -}; -/* clang-format on */ - struct pipe_context * fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) in_dt @@ -85,6 +72,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, pctx = &fd4_ctx->base.base; pctx->screen = pscreen; + fd4_ctx->base.flags = flags; fd4_ctx->base.dev = fd_device_ref(screen->dev); fd4_ctx->base.screen = fd_screen(pscreen); fd4_ctx->base.last.key = &fd4_ctx->last_key; @@ -100,7 +88,7 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, fd4_prog_init(pctx); fd4_emit_init(pctx); - pctx = fd_context_init(&fd4_ctx->base, pscreen, primtypes, priv, flags); + pctx = fd_context_init(&fd4_ctx->base, pscreen, priv, flags); if (!pctx) return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index 7647388e5f..cf1828a434 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -44,7 +44,7 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd4_emit *emit, unsigned index_offset) assert_dt { const struct pipe_draw_info *info = emit->info; - 
enum pc_di_primtype primtype = ctx->primtypes[info->mode]; + enum pc_di_primtype primtype = ctx->screen->primtypes[info->mode]; fd4_emit_state(ctx, ring, emit); @@ -83,7 +83,7 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, .debug = &ctx->debug, .vtx = &ctx->vtx, .info = info, - .drawid_offset = drawid_offset, + .drawid_offset = drawid_offset, .indirect = indirect, .draw = draw, .key = { diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_screen.c index a85a32675b..8d42677e2f 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_screen.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_screen.c @@ -98,6 +98,19 @@ fd4_screen_is_format_supported(struct pipe_screen *pscreen, return retval == usage; } +/* clang-format off */ +static const uint8_t primtypes[] = { + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = DI_PT_LINELIST, + [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, + [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, + [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ +}; +/* clang-format on */ + void fd4_screen_init(struct pipe_screen *pscreen) { @@ -108,4 +121,6 @@ fd4_screen_init(struct pipe_screen *pscreen) pscreen->is_format_supported = fd4_screen_is_format_supported; fd4_emit_init_screen(pscreen); ir3_screen_init(pscreen); + + screen->primtypes = primtypes; } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_texture.c index a1ff744639..49b350665d 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_texture.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a4xx/fd4_texture.c @@ -246,6 +246,7 @@ static void fd4_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct fd_context *ctx = fd_context(pctx); @@ -262,7 +263,7 @@ fd4_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, } fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, - views); + take_ownership, views); if (shader == PIPE_SHADER_FRAGMENT) { fd4_ctx->fastc_srgb = astc_srgb; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_context.c b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_context.c index 2eddc0df89..7296d51f80 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_context.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_context.c @@ -57,19 +57,6 @@ fd5_context_destroy(struct pipe_context *pctx) in_dt free(fd5_ctx); } -/* clang-format off */ -static const uint8_t primtypes[] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, - [PIPE_PRIM_LINES] = DI_PT_LINELIST, - [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, - [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, - [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ -}; -/* clang-format on */ - struct pipe_context * fd5_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) disable_thread_safety_analysis @@ -84,6 +71,7 @@ fd5_context_create(struct pipe_screen *pscreen, void *priv, pctx = &fd5_ctx->base.base; pctx->screen = pscreen; 
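/*
 * Each fdN_context_create() in this patch also gains the same one-liner,
 * storing the creation flags on the base context before fd_context_init()
 * runs; presumably so per-gen setup code executing before (or inside) the
 * common init can already consult them. Net shape, with fdN standing for
 * any of the generations touched here:
 *
 *   fdN_ctx->base.flags = flags;
 *   pctx = fd_context_init(&fdN_ctx->base, pscreen, priv, flags);
 */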
+ fd5_ctx->base.flags = flags; fd5_ctx->base.dev = fd_device_ref(screen->dev); fd5_ctx->base.screen = fd_screen(pscreen); fd5_ctx->base.last.key = &fd5_ctx->last_key; @@ -103,7 +91,7 @@ fd5_context_create(struct pipe_screen *pscreen, void *priv, if (!FD_DBG(NOBLIT)) fd5_ctx->base.blit = fd5_blitter_blit; - pctx = fd_context_init(&fd5_ctx->base, pscreen, primtypes, priv, flags); + pctx = fd_context_init(&fd5_ctx->base, pscreen, priv, flags); if (!pctx) return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_draw.c b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_draw.c index 05302459ed..84b024caca 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_draw.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_draw.c @@ -44,7 +44,7 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd5_emit *emit, unsigned index_offset) assert_dt { const struct pipe_draw_info *info = emit->info; - enum pc_di_primtype primtype = ctx->primtypes[info->mode]; + enum pc_di_primtype primtype = ctx->screen->primtypes[info->mode]; fd5_emit_state(ctx, ring, emit); @@ -79,7 +79,7 @@ fd5_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, .debug = &ctx->debug, .vtx = &ctx->vtx, .info = info, - .drawid_offset = drawid_offset, + .drawid_offset = drawid_offset, .indirect = indirect, .draw = draw, .key = { @@ -184,7 +184,8 @@ fd5_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1); OUT_RING(ring, A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(0.0) | - COND(zsbuf->b.b.nr_samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE)); + A5XX_GRAS_SU_CNTL_LINE_MODE(zsbuf->b.b.nr_samples > 1 ? + RECTANGULAR : BRESENHAM)); OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1); OUT_RING(ring, 0x00000000); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_emit.c b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_emit.c index 08e1173d2d..9e27308eb8 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_emit.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_emit.c @@ -667,7 +667,8 @@ fd5_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1); OUT_RING(ring, rasterizer->gras_su_cntl | - COND(pfb->samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE)); + A5XX_GRAS_SU_CNTL_LINE_MODE(pfb->samples > 1 ? 
+ RECTANGULAR : BRESENHAM)); OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2); OUT_RING(ring, rasterizer->gras_su_point_minmax); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_program.c index 072d432db6..2236420cdf 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_program.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_program.c @@ -542,20 +542,23 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1); OUT_RING(ring, 0x00000010); /* XXX */ - /* XXX: missing enable bits for per-sample bary linear centroid and - * IJ_PERSP_SIZE (should be identical to a6xx) - */ - OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1); OUT_RING(ring, CONDREG(ij_regid[IJ_PERSP_PIXEL], A5XX_GRAS_CNTL_IJ_PERSP_PIXEL) | CONDREG(ij_regid[IJ_PERSP_CENTROID], A5XX_GRAS_CNTL_IJ_PERSP_CENTROID) | + CONDREG(ij_regid[IJ_PERSP_SAMPLE], + A5XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], + A5XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_SAMPLE], + A5XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | COND(s[FS].v->fragcoord_compmask != 0, A5XX_GRAS_CNTL_COORD_MASK(s[FS].v->fragcoord_compmask) | - A5XX_GRAS_CNTL_SIZE) | - COND(s[FS].v->frag_face, A5XX_GRAS_CNTL_SIZE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_GRAS_CNTL_SIZE)); + A5XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | + COND(s[FS].v->frag_face, A5XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_GRAS_CNTL_IJ_LINEAR_PIXEL)); OUT_PKT4(ring, REG_A5XX_RB_RENDER_CONTROL0, 2); OUT_RING( @@ -564,11 +567,19 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, A5XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | CONDREG(ij_regid[IJ_PERSP_CENTROID], A5XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | + CONDREG(ij_regid[IJ_PERSP_SAMPLE], + A5XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], + A5XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], + A5XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_SAMPLE], + A5XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | COND(s[FS].v->fragcoord_compmask != 0, A5XX_RB_RENDER_CONTROL0_COORD_MASK(s[FS].v->fragcoord_compmask) | - A5XX_RB_RENDER_CONTROL0_SIZE) | - COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL0_SIZE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_RB_RENDER_CONTROL0_SIZE)); + A5XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | + COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL)); OUT_RING(ring, CONDREG(samp_mask_regid, A5XX_RB_RENDER_CONTROL1_SAMPLEMASK) | COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL1_FACENESS) | diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_screen.c b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_screen.c index 1393abf85f..bf54f37bf6 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_screen.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_screen.c @@ -115,6 +115,19 @@ fd5_screen_is_format_supported(struct pipe_screen *pscreen, return retval == usage; } +/* clang-format off */ +static const uint8_t primtypes[] = { + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = DI_PT_LINELIST, + [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, + [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, + 
[PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, + [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ +}; +/* clang-format on */ + void fd5_screen_init(struct pipe_screen *pscreen) { @@ -129,4 +142,6 @@ fd5_screen_init(struct pipe_screen *pscreen) fd5_emit_init_screen(pscreen); ir3_screen_init(pscreen); + + screen->primtypes = primtypes; } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_texture.c b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_texture.c index e1b4e463b9..ef30b0d8b0 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_texture.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a5xx/fd5_texture.c @@ -239,6 +239,7 @@ static void fd5_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct fd_context *ctx = fd_context(pctx); @@ -255,7 +256,7 @@ fd5_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, } fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, - views); + take_ownership, views); if (shader == PIPE_SHADER_FRAGMENT) { fd5_ctx->fastc_srgb = astc_srgb; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c index d227e320b4..9435aaf701 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c @@ -126,7 +126,7 @@ ok_dims(const struct pipe_resource *r, const struct pipe_box *b, int lvl) static bool ok_format(enum pipe_format pfmt) { - enum a6xx_format fmt = fd6_pipe2color(pfmt); + enum a6xx_format fmt = fd6_color_format(pfmt, TILE6_LINEAR); if (util_format_is_compressed(pfmt)) return true; @@ -252,7 +252,7 @@ static void emit_blit_setup(struct fd_ringbuffer *ring, enum pipe_format pfmt, bool scissor_enable, union pipe_color_union *color) { - enum a6xx_format fmt = fd6_pipe2color(pfmt); + enum a6xx_format fmt = fd6_color_format(pfmt, TILE6_LINEAR); bool is_srgb = util_format_is_srgb(pfmt); enum a6xx_2d_ifmt ifmt = fd6_ifmt(fmt); @@ -524,6 +524,7 @@ fd6_clear_ubwc(struct fd_batch *batch, struct fd_resource *rsc) assert_dt fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); + fd_wfi(batch, ring); fd6_cache_inv(batch, ring); } @@ -532,9 +533,9 @@ emit_blit_dst(struct fd_ringbuffer *ring, struct pipe_resource *prsc, enum pipe_format pfmt, unsigned level, unsigned layer) { struct fd_resource *dst = fd_resource(prsc); - enum a6xx_format fmt = fd6_pipe2color(pfmt); + enum a6xx_format fmt = fd6_color_format(pfmt, dst->layout.tile_mode); enum a6xx_tile_mode tile = fd_resource_tile_mode(prsc, level); - enum a3xx_color_swap swap = fd6_resource_swap(dst, pfmt); + enum a3xx_color_swap swap = fd6_color_swap(pfmt, dst->layout.tile_mode); uint32_t pitch = fd_resource_pitch(dst, level); bool ubwc_enabled = fd_resource_ubwc_enabled(dst, level); unsigned off = fd_resource_offset(dst, level, layer); @@ -570,10 +571,10 @@ emit_blit_src(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, unsigned layer, unsigned nr_samples) { struct fd_resource *src = fd_resource(info->src.resource); - enum a6xx_format sfmt = fd6_pipe2color(info->src.format); + enum a6xx_format sfmt = fd6_texture_format(info->src.format, 
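/*
 * The blitter hunks swap the old fd6_pipe2color()/fd6_resource_swap() pair
 * for tile-mode-aware helpers: fd6_color_format()/fd6_color_swap() on the
 * destination side and fd6_texture_format()/fd6_texture_swap() on the
 * source side. Callers now pass the layout's tile mode explicitly, e.g.:
 *
 *   enum a6xx_format fmt =
 *      fd6_color_format(pfmt, dst->layout.tile_mode);
 *   enum a3xx_color_swap swap =
 *      fd6_color_swap(pfmt, dst->layout.tile_mode);
 *
 * and linear-only paths (clears, resolves) request TILE6_LINEAR to get the
 * destination-safe variant of a format.
 */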
src->layout.tile_mode); enum a6xx_tile_mode stile = fd_resource_tile_mode(info->src.resource, info->src.level); - enum a3xx_color_swap sswap = fd6_resource_swap(src, info->src.format); + enum a3xx_color_swap sswap = fd6_texture_swap(info->src.format, src->layout.tile_mode); uint32_t pitch = fd_resource_pitch(src, info->src.level); bool subwc_enabled = fd_resource_ubwc_enabled(src, info->src.level); unsigned soff = fd_resource_offset(src, info->src.level, layer); @@ -586,8 +587,8 @@ emit_blit_src(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, enum a3xx_msaa_samples samples = fd_msaa_samples(src->b.b.nr_samples); - if (sfmt == FMT6_10_10_10_2_UNORM_DEST) - sfmt = FMT6_10_10_10_2_UNORM; + if (info->src.format == PIPE_FORMAT_A8_UNORM) + sfmt = FMT6_A8_UNORM; OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | @@ -717,7 +718,7 @@ emit_clear_color(struct fd_ringbuffer *ring, enum pipe_format pfmt, } OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); - switch (fd6_ifmt(fd6_pipe2color(pfmt))) { + switch (fd6_ifmt(fd6_color_format(pfmt, TILE6_LINEAR))) { case R2D_UNORM8: case R2D_UNORM8_SRGB: /* The r2d ifmt is badly named, it also covers the signed case: */ @@ -855,7 +856,7 @@ fd6_resolve_tile(struct fd_batch *batch, struct fd_ringbuffer *ring, emit_blit_dst(ring, psurf->texture, psurf->format, psurf->u.tex.level, psurf->u.tex.first_layer); - enum a6xx_format sfmt = fd6_pipe2color(psurf->format); + enum a6xx_format sfmt = fd6_color_format(psurf->format, TILE6_LINEAR); enum a3xx_msaa_samples samples = fd_msaa_samples(batch->framebuffer.samples); OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); @@ -893,6 +894,7 @@ fd6_resolve_tile(struct fd_batch *batch, struct fd_ringbuffer *ring, * results in sysmem, so we need to flush manually here. 
*/ fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd_wfi(batch, ring); } static bool @@ -936,7 +938,7 @@ handle_rgba_blit(struct fd_context *ctx, DBG_BLIT(info, batch); - trace_start_blit(&batch->trace, info->src.resource->target, + trace_start_blit(&batch->trace, batch->draw, info->src.resource->target, info->dst.resource->target); if ((info->src.resource->target == PIPE_BUFFER) && @@ -951,11 +953,12 @@ handle_rgba_blit(struct fd_context *ctx, emit_blit_texture(ctx, batch->draw, info); } - trace_end_blit(&batch->trace); + trace_end_blit(&batch->trace, batch->draw); fd6_event_write(batch, batch->draw, PC_CCU_FLUSH_COLOR_TS, true); fd6_event_write(batch, batch->draw, PC_CCU_FLUSH_DEPTH_TS, true); fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); + fd_wfi(batch, batch->draw); fd6_cache_inv(batch, batch->draw); fd_batch_unlock_submit(batch); @@ -1123,35 +1126,6 @@ handle_compressed_blit(struct fd_context *ctx, return do_rewritten_blit(ctx, &blit); } -static enum pipe_format -snorm_copy_format(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_R8_SNORM: return PIPE_FORMAT_R8_UNORM; - case PIPE_FORMAT_R16_SNORM: return PIPE_FORMAT_R16_UNORM; - case PIPE_FORMAT_A16_SNORM: return PIPE_FORMAT_A16_UNORM; - case PIPE_FORMAT_L16_SNORM: return PIPE_FORMAT_L16_UNORM; - case PIPE_FORMAT_I16_SNORM: return PIPE_FORMAT_I16_UNORM; - case PIPE_FORMAT_R8G8_SNORM: return PIPE_FORMAT_R8G8_UNORM; - case PIPE_FORMAT_R8G8B8_SNORM: return PIPE_FORMAT_R8G8B8_UNORM; - case PIPE_FORMAT_R32_SNORM: return PIPE_FORMAT_R32_UNORM; - case PIPE_FORMAT_R16G16_SNORM: return PIPE_FORMAT_R16G16_UNORM; - case PIPE_FORMAT_L16A16_SNORM: return PIPE_FORMAT_L16A16_UNORM; - case PIPE_FORMAT_R8G8B8A8_SNORM: return PIPE_FORMAT_R8G8B8A8_UNORM; - case PIPE_FORMAT_R10G10B10A2_SNORM: return PIPE_FORMAT_R10G10B10A2_UNORM; - case PIPE_FORMAT_B10G10R10A2_SNORM: return PIPE_FORMAT_B10G10R10A2_UNORM; - case PIPE_FORMAT_R16G16B16_SNORM: return PIPE_FORMAT_R16G16B16_UNORM; - case PIPE_FORMAT_R16G16B16A16_SNORM: return PIPE_FORMAT_R16G16B16A16_UNORM; - case PIPE_FORMAT_R16G16B16X16_SNORM: return PIPE_FORMAT_R16G16B16X16_UNORM; - case PIPE_FORMAT_R32G32_SNORM: return PIPE_FORMAT_R32G32_UNORM; - case PIPE_FORMAT_R32G32B32_SNORM: return PIPE_FORMAT_R32G32B32_UNORM; - case PIPE_FORMAT_R32G32B32A32_SNORM: return PIPE_FORMAT_R32G32B32A32_UNORM; - default: - unreachable("unhandled snorm format"); - return format; - } -} - /** * For SNORM formats, copy them as the equivalent UNORM format. 
If we treat * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81 @@ -1165,7 +1139,7 @@ handle_snorm_copy_blit(struct fd_context *ctx, { struct pipe_blit_info blit = *info; - blit.src.format = blit.dst.format = snorm_copy_format(info->src.format); + blit.src.format = blit.dst.format = util_format_snorm_to_unorm(info->src.format); return do_rewritten_blit(ctx, &blit); } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_compute.c index 9e8d68a6aa..bd8e2500ef 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_compute.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_compute.c @@ -70,11 +70,17 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) | A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(ir3_shader_branchstack_hw(v))); - uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); + uint32_t shared_size = MAX2(((int)v->shader->cs.req_local_mem - 1) / 1024, 1); OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | A6XX_SP_CS_UNKNOWN_A9B1_UNK6); + if (ctx->screen->info->a6xx.has_lpac) { + OUT_PKT4(ring, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1); + OUT_RING(ring, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) | + A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6); + } + uint32_t local_invocation_id, work_group_id; local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); @@ -88,6 +94,16 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); + if (ctx->screen->info->a6xx.has_lpac) { + OUT_PKT4(ring, REG_A6XX_SP_CS_CNTL_0, 2); + OUT_RING(ring, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | + A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + OUT_RING(ring, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | + A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz)); + } + OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2); OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */ @@ -158,8 +174,8 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */ OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */ - trace_grid_info(&ctx->batch->trace, info); - trace_start_compute(&ctx->batch->trace); + trace_grid_info(&ctx->batch->trace, ring, info); + trace_start_compute(&ctx->batch->trace, ring); if (info->indirect) { struct fd_resource *rsc = fd_resource(info->indirect); @@ -179,7 +195,7 @@ fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); } - trace_end_compute(&ctx->batch->trace); + trace_end_compute(&ctx->batch->trace, ring); OUT_WFI5(ring); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_const.c index b30b441558..dc8343f881 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_const.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_const.c @@ -148,7 +148,7 @@ fd6_build_tess_consts(struct fd6_emit *emit) * size is dwords, since that's what LDG/STG use. */ unsigned num_vertices = emit->hs - ? emit->info->vertices_per_patch + ? 
emit->patch_vertices : emit->gs->shader->nir->info.gs.vertices_in; uint32_t vs_params[4] = { @@ -162,7 +162,7 @@ fd6_build_tess_consts(struct fd6_emit *emit) uint32_t hs_params[4] = { emit->vs->output_size * num_vertices * 4, /* vs primitive stride */ emit->vs->output_size * 4, /* vs vertex stride */ - emit->hs->output_size, emit->info->vertices_per_patch}; + emit->hs->output_size, emit->patch_vertices}; emit_stage_tess_consts(constobj, emit->hs, hs_params, ARRAY_SIZE(hs_params)); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_context.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_context.c index 9b12c38bf9..7951e1094e 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_context.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_context.c @@ -51,6 +51,9 @@ fd6_context_destroy(struct pipe_context *pctx) in_dt u_upload_destroy(fd6_ctx->border_color_uploader); pipe_resource_reference(&fd6_ctx->border_color_buf, NULL); + if (fd6_ctx->streamout_disable_stateobj) + fd_ringbuffer_del(fd6_ctx->streamout_disable_stateobj); + fd_context_destroy(pctx); if (fd6_ctx->vsc_draw_strm) @@ -66,24 +69,6 @@ fd6_context_destroy(struct pipe_context *pctx) in_dt free(fd6_ctx); } -/* clang-format off */ -static const uint8_t primtypes[] = { - [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, - [PIPE_PRIM_LINES] = DI_PT_LINELIST, - [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, - [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, - [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, - [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, - [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, - [PIPE_PRIM_LINES_ADJACENCY] = DI_PT_LINE_ADJ, - [PIPE_PRIM_LINE_STRIP_ADJACENCY] = DI_PT_LINESTRIP_ADJ, - [PIPE_PRIM_TRIANGLES_ADJACENCY] = DI_PT_TRI_ADJ, - [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = DI_PT_TRISTRIP_ADJ, - [PIPE_PRIM_PATCHES] = DI_PT_PATCHES0, - [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ -}; -/* clang-format on */ - static void * fd6_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, const struct pipe_vertex_element *elements) @@ -101,7 +86,7 @@ fd6_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, for (int32_t i = 0; i < num_elements; i++) { const struct pipe_vertex_element *elem = &elements[i]; enum pipe_format pfmt = elem->src_format; - enum a6xx_format fmt = fd6_pipe2vtx(pfmt); + enum a6xx_format fmt = fd6_vertex_format(pfmt); bool isint = util_format_is_pure_integer(pfmt); debug_assert(fmt != FMT6_NONE); @@ -110,7 +95,7 @@ fd6_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, A6XX_VFD_DECODE_INSTR_FORMAT(fmt) | COND(elem->instance_divisor, A6XX_VFD_DECODE_INSTR_INSTANCED) | - A6XX_VFD_DECODE_INSTR_SWAP(fd6_pipe2swap(pfmt)) | + A6XX_VFD_DECODE_INSTR_SWAP(fd6_vertex_swap(pfmt)) | A6XX_VFD_DECODE_INSTR_UNK30 | COND(!isint, A6XX_VFD_DECODE_INSTR_FLOAT)); OUT_RING(ring, @@ -225,6 +210,7 @@ fd6_context_create(struct pipe_screen *pscreen, void *priv, pctx = &fd6_ctx->base.base; pctx->screen = pscreen; + fd6_ctx->base.flags = flags; fd6_ctx->base.dev = fd_device_ref(screen->dev); fd6_ctx->base.screen = fd_screen(pscreen); fd6_ctx->base.last.key = &fd6_ctx->last_key; @@ -245,7 +231,7 @@ fd6_context_create(struct pipe_screen *pscreen, void *priv, setup_state_map(&fd6_ctx->base); - pctx = fd_context_init(&fd6_ctx->base, pscreen, primtypes, priv, flags); + pctx = fd_context_init(&fd6_ctx->base, pscreen, priv, flags); if (!pctx) return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/mesa 3D 
driver/src/gallium/drivers/freedreno/a6xx/fd6_context.h index 1487e74b72..122642e4e8 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_context.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_context.h @@ -70,6 +70,9 @@ struct fd6_context { struct u_upload_mgr *border_color_uploader; struct pipe_resource *border_color_buf; + /* pre-baked stateobj for stream-out disable: */ + struct fd_ringbuffer *streamout_disable_stateobj; + /* storage for ctx->last.key: */ struct ir3_shader_key last_key; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_draw.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_draw.c index 466bbdd8cd..9c1bd2117d 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_draw.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_draw.c @@ -143,7 +143,7 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, .ctx = ctx, .vtx = &ctx->vtx, .info = info, - .drawid_offset = drawid_offset, + .drawid_offset = drawid_offset, .indirect = indirect, .draw = draw, .key = { @@ -161,6 +161,7 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, .primitive_restart = info->primitive_restart && info->index_size, + .patch_vertices = ctx->patch_vertices, }; if (!(ctx->prog.vs && ctx->prog.fs)) @@ -235,7 +236,7 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, struct fd_ringbuffer *ring = ctx->batch->draw; struct CP_DRAW_INDX_OFFSET_0 draw0 = { - .prim_type = ctx->primtypes[info->mode], + .prim_type = ctx->screen->primtypes[info->mode], .vis_cull = USE_VISIBILITY, .gs_enable = !!emit.key.gs, }; @@ -270,7 +271,7 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, unreachable("bad tessmode"); } - draw0.prim_type = DI_PT_PATCHES0 + info->vertices_per_patch; + draw0.prim_type = DI_PT_PATCHES0 + ctx->patch_vertices; draw0.tess_enable = true; const unsigned max_count = 2048; @@ -281,10 +282,10 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, * limit. But in the indirect-draw case we must assume the worst.
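/* [editor's note] The ALIGN_NPOT() sizing just below keeps each subdraw a
 * whole number of patches. A self-contained sketch of the assumed
 * semantics from util/macros.h (round up to a multiple of an alignment
 * that need not be a power of two):
 */
#include <stdint.h>

static uint32_t
align_npot(uint32_t v, uint32_t a)
{
   return ((v + a - 1) / a) * a;
}
/* e.g. for triangle patches (patch_vertices == 3):
 *   align_npot(1000, 3) == 1002   (direct draw of 1000 vertices)
 *   align_npot(2048, 3) == 2049   (indirect draw: assume the 2048 worst case)
 */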
*/ if (indirect && indirect->buffer) { - count = ALIGN_NPOT(max_count, info->vertices_per_patch); + count = ALIGN_NPOT(max_count, ctx->patch_vertices); } else { count = MIN2(max_count, draw->count); - count = ALIGN_NPOT(count, info->vertices_per_patch); + count = ALIGN_NPOT(count, ctx->patch_vertices); } OUT_PKT7(ring, CP_SET_SUBDRAW_SIZE, 1); @@ -372,7 +373,7 @@ fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, } static void -fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) +fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) assert_dt { struct fd_ringbuffer *ring; struct fd_screen *screen = batch->ctx->screen; @@ -431,6 +432,7 @@ fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); + fd_wfi(batch, ring); OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); OUT_RING(ring, fui(depth)); @@ -476,6 +478,7 @@ fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); + fd_wfi(batch, ring); fd6_cache_inv(batch, ring); } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index 3c4716ff68..85cbf0996d 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -111,6 +111,7 @@ setup_border_colors(struct fd_texture_stateobj *tex, enum pipe_format format = view->format; const struct util_format_description *desc = util_format_description(format); + const struct fd_resource *rsc = fd_resource(view->texture); e->rgb565 = 0; e->rgb5a1 = 0; @@ -120,7 +121,7 @@ setup_border_colors(struct fd_texture_stateobj *tex, unsigned char swiz[4]; - fd6_tex_swiz(format, swiz, view->swizzle_r, view->swizzle_g, + fd6_tex_swiz(format, rsc->layout.tile_mode, swiz, view->swizzle_r, view->swizzle_g, view->swizzle_b, view->swizzle_a); for (j = 0; j < 4; j++) { @@ -909,24 +910,31 @@ fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt if (emit->streamout_mask) { fd6_emit_add_group(emit, prog->streamout_stateobj, FD6_GROUP_SO, ENABLE_ALL); - } else { + } else if (ctx->last.streamout_mask != 0) { /* If we transition from a draw with streamout to one without, turn * off streamout. */ - if (ctx->last.streamout_mask != 0) { - struct fd_ringbuffer *obj = fd_submit_new_ringbuffer( - emit->ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING); - - OUT_PKT7(obj, CP_CONTEXT_REG_BUNCH, 4); - OUT_RING(obj, REG_A6XX_VPC_SO_CNTL); - OUT_RING(obj, 0); - OUT_RING(obj, REG_A6XX_VPC_SO_STREAM_CNTL); - OUT_RING(obj, 0); - - fd6_emit_take_group(emit, obj, FD6_GROUP_SO, ENABLE_ALL); - } + fd6_emit_add_group(emit, fd6_context(ctx)->streamout_disable_stateobj, + FD6_GROUP_SO, ENABLE_ALL); } + /* Make sure that any use of our TFB outputs (indirect draw source or shader + * UBO reads) comes after the TFB output is written. From the GL 4.6 core + * spec: + * + * "Buffers should not be bound or in use for both transform feedback and + * other purposes in the GL. 
Specifically, if a buffer object is + * simultaneously bound to a transform feedback buffer binding point + * and elsewhere in the GL, any writes to or reads from the buffer + * generate undefined values." + * + * So we idle whenever SO buffers change. Note that this function is called + * on every draw with TFB enabled, so check the dirty flag for the buffers + * themselves. + */ + if (ctx->dirty & FD_DIRTY_STREAMOUT) + fd_wfi(ctx->batch, ring); + ctx->last.streamout_mask = emit->streamout_mask; } @@ -1217,10 +1225,10 @@ fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, void fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) { - // struct fd_context *ctx = batch->ctx; + struct fd_screen *screen = batch->ctx->screen; if (!batch->nondraw) { - trace_start_state_restore(&batch->trace); + trace_start_state_restore(&batch->trace, ring); } fd6_cache_inv(batch, ring); @@ -1241,7 +1249,7 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0); WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f); WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44); - WRITE(REG_A6XX_TPL1_UNKNOWN_B600, 0x100000); + WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL); WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0); @@ -1333,7 +1341,7 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) OUT_RING(ring, 0x00000000); if (!batch->nondraw) { - trace_end_state_restore(&batch->trace); + trace_end_state_restore(&batch->trace, ring); } } @@ -1386,6 +1394,7 @@ fd6_framebuffer_barrier(struct fd_context *ctx) assert_dt fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); + fd_wfi(batch, ring); fd6_event_write(batch, ring, 0x31, false); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index 5d54f8d8a1..0c46d8a569 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -100,6 +100,7 @@ struct fd6_emit { bool sprite_coord_mode; bool rasterflat; bool primitive_restart; + uint8_t patch_vertices; /* cached to avoid repeated lookups: */ const struct fd6_program_state *prog; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.c index aaa521c46c..5007001dc3 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.c @@ -31,364 +31,6 @@ #include "fd6_format.h" #include "freedreno_resource.h" -/* Specifies the table of all the formats and their features. Also supplies - * the helpers that look up various data in those tables. 
- */ - -struct fd6_format { - enum a6xx_format vtx; - enum a6xx_format tex; - enum a6xx_format rb; - enum a3xx_color_swap swap; - boolean present; -}; - -#define FMT(pipe, vtxfmt, texfmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_##pipe] = {.present = 1, \ - .vtx = FMT6_##vtxfmt, \ - .tex = FMT6_##texfmt, \ - .rb = FMT6_##rbfmt, \ - .swap = swapfmt} - -/* vertex + texture + color */ -#define VTC(pipe, fmt, swapfmt) FMT(pipe, fmt, fmt, fmt, swapfmt) - -#define _TC(pipe, fmt, swapfmt) FMT(pipe, NONE, fmt, fmt, swapfmt) -#define _T_(pipe, fmt, swapfmt) FMT(pipe, NONE, fmt, NONE, swapfmt) -#define VT_(pipe, fmt, swapfmt) FMT(pipe, fmt, fmt, NONE, swapfmt) -#define V__(pipe, fmt, swapfmt) FMT(pipe, fmt, NONE, NONE, swapfmt) - -/* clang-format off */ -static struct fd6_format formats[PIPE_FORMAT_COUNT] = { - /* 8-bit */ - VTC(R8_UNORM, 8_UNORM, WZYX), - VTC(R8_SNORM, 8_SNORM, WZYX), - VTC(R8_UINT, 8_UINT, WZYX), - VTC(R8_SINT, 8_SINT, WZYX), - V__(R8_USCALED, 8_UINT, WZYX), - V__(R8_SSCALED, 8_SINT, WZYX), - - FMT(A8_UNORM, NONE, 8_UNORM, A8_UNORM, WZYX), - _TC(L8_UNORM, 8_UNORM, WZYX), - _T_(I8_UNORM, 8_UNORM, WZYX), - - _T_(A8_UINT, 8_UINT, WZYX), - _T_(A8_SINT, 8_SINT, WZYX), - _T_(L8_UINT, 8_UINT, WZYX), - _T_(L8_SINT, 8_SINT, WZYX), - _T_(I8_UINT, 8_UINT, WZYX), - _T_(I8_SINT, 8_SINT, WZYX), - - _TC(S8_UINT, 8_UINT, WZYX), - - /* 16-bit */ - VTC(R16_UNORM, 16_UNORM, WZYX), - VTC(R16_SNORM, 16_SNORM, WZYX), - VTC(R16_UINT, 16_UINT, WZYX), - VTC(R16_SINT, 16_SINT, WZYX), - V__(R16_USCALED, 16_UINT, WZYX), - V__(R16_SSCALED, 16_SINT, WZYX), - VTC(R16_FLOAT, 16_FLOAT, WZYX), - _TC(Z16_UNORM, 16_UNORM, WZYX), - - _T_(A16_UNORM, 16_UNORM, WZYX), - _T_(A16_SNORM, 16_SNORM, WZYX), - _T_(A16_UINT, 16_UINT, WZYX), - _T_(A16_SINT, 16_SINT, WZYX), - _T_(L16_UNORM, 16_UNORM, WZYX), - _T_(L16_SNORM, 16_SNORM, WZYX), - _T_(L16_UINT, 16_UINT, WZYX), - _T_(L16_SINT, 16_SINT, WZYX), - _T_(I16_UNORM, 16_UNORM, WZYX), - _T_(I16_SNORM, 16_SNORM, WZYX), - _T_(I16_UINT, 16_UINT, WZYX), - _T_(I16_SINT, 16_SINT, WZYX), - - VTC(R8G8_UNORM, 8_8_UNORM, WZYX), - VTC(R8G8_SNORM, 8_8_SNORM, WZYX), - VTC(R8G8_UINT, 8_8_UINT, WZYX), - VTC(R8G8_SINT, 8_8_SINT, WZYX), - V__(R8G8_USCALED, 8_8_UINT, WZYX), - V__(R8G8_SSCALED, 8_8_SINT, WZYX), - - _T_(L8A8_UINT, 8_8_UINT, WZYX), - _T_(L8A8_SINT, 8_8_SINT, WZYX), - - _TC(B5G6R5_UNORM, 5_6_5_UNORM, WXYZ), - _TC(B5G5R5A1_UNORM, 5_5_5_1_UNORM, WXYZ), - _TC(B5G5R5X1_UNORM, 5_5_5_1_UNORM, WXYZ), - _TC(B4G4R4A4_UNORM, 4_4_4_4_UNORM, WXYZ), - - /* 24-bit */ - V__(R8G8B8_UNORM, 8_8_8_UNORM, WZYX), - V__(R8G8B8_SNORM, 8_8_8_SNORM, WZYX), - V__(R8G8B8_UINT, 8_8_8_UINT, WZYX), - V__(R8G8B8_SINT, 8_8_8_SINT, WZYX), - V__(R8G8B8_USCALED, 8_8_8_UINT, WZYX), - V__(R8G8B8_SSCALED, 8_8_8_SINT, WZYX), - - /* 32-bit */ - V__(R32_UNORM, 32_UNORM, WZYX), - V__(R32_SNORM, 32_SNORM, WZYX), - VTC(R32_UINT, 32_UINT, WZYX), - VTC(R32_SINT, 32_SINT, WZYX), - V__(R32_USCALED, 32_UINT, WZYX), - V__(R32_SSCALED, 32_SINT, WZYX), - VTC(R32_FLOAT, 32_FLOAT, WZYX), - V__(R32_FIXED, 32_FIXED, WZYX), - - _T_(A32_UINT, 32_UINT, WZYX), - _T_(A32_SINT, 32_SINT, WZYX), - _T_(L32_UINT, 32_UINT, WZYX), - _T_(L32_SINT, 32_SINT, WZYX), - _T_(I32_UINT, 32_UINT, WZYX), - _T_(I32_SINT, 32_SINT, WZYX), - - VTC(R16G16_UNORM, 16_16_UNORM, WZYX), - VTC(R16G16_SNORM, 16_16_SNORM, WZYX), - VTC(R16G16_UINT, 16_16_UINT, WZYX), - VTC(R16G16_SINT, 16_16_SINT, WZYX), - VT_(R16G16_USCALED, 16_16_UINT, WZYX), - VT_(R16G16_SSCALED, 16_16_SINT, WZYX), - VTC(R16G16_FLOAT, 16_16_FLOAT, WZYX), - - _T_(L16A16_UNORM, 16_16_UNORM, WZYX), - 
_T_(L16A16_SNORM, 16_16_SNORM, WZYX), - _T_(L16A16_UINT, 16_16_UINT, WZYX), - _T_(L16A16_SINT, 16_16_SINT, WZYX), - - VTC(R8G8B8A8_UNORM, 8_8_8_8_UNORM, WZYX), - _TC(R8G8B8X8_UNORM, 8_8_8_8_UNORM, WZYX), - _TC(R8G8B8A8_SRGB, 8_8_8_8_UNORM, WZYX), - _TC(R8G8B8X8_SRGB, 8_8_8_8_UNORM, WZYX), - VTC(R8G8B8A8_SNORM, 8_8_8_8_SNORM, WZYX), - VTC(R8G8B8A8_UINT, 8_8_8_8_UINT, WZYX), - VTC(R8G8B8A8_SINT, 8_8_8_8_SINT, WZYX), - V__(R8G8B8A8_USCALED, 8_8_8_8_UINT, WZYX), - V__(R8G8B8A8_SSCALED, 8_8_8_8_SINT, WZYX), - - VTC(B8G8R8A8_UNORM, 8_8_8_8_UNORM, WXYZ), - _TC(B8G8R8X8_UNORM, 8_8_8_8_UNORM, WXYZ), - VTC(B8G8R8A8_SRGB, 8_8_8_8_UNORM, WXYZ), - _TC(B8G8R8X8_SRGB, 8_8_8_8_UNORM, WXYZ), - - VTC(A8B8G8R8_UNORM, 8_8_8_8_UNORM, XYZW), - _TC(X8B8G8R8_UNORM, 8_8_8_8_UNORM, XYZW), - _TC(A8B8G8R8_SRGB, 8_8_8_8_UNORM, XYZW), - _TC(X8B8G8R8_SRGB, 8_8_8_8_UNORM, XYZW), - - VTC(A8R8G8B8_UNORM, 8_8_8_8_UNORM, ZYXW), - _TC(X8R8G8B8_UNORM, 8_8_8_8_UNORM, ZYXW), - _TC(A8R8G8B8_SRGB, 8_8_8_8_UNORM, ZYXW), - _TC(X8R8G8B8_SRGB, 8_8_8_8_UNORM, ZYXW), - - FMT(R10G10B10A2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM_DEST, WZYX), - FMT(B10G10R10A2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM, 10_10_10_2_UNORM_DEST, WXYZ), - FMT(B10G10R10X2_UNORM, NONE, 10_10_10_2_UNORM, 10_10_10_2_UNORM_DEST, WXYZ), - V__(R10G10B10A2_SNORM, 10_10_10_2_SNORM, WZYX), - V__(B10G10R10A2_SNORM, 10_10_10_2_SNORM, WXYZ), - VTC(R10G10B10A2_UINT, 10_10_10_2_UINT, WZYX), - VTC(B10G10R10A2_UINT, 10_10_10_2_UINT, WXYZ), - V__(R10G10B10A2_USCALED, 10_10_10_2_UINT, WZYX), - V__(B10G10R10A2_USCALED, 10_10_10_2_UINT, WXYZ), - V__(R10G10B10A2_SSCALED, 10_10_10_2_SINT, WZYX), - V__(B10G10R10A2_SSCALED, 10_10_10_2_SINT, WXYZ), - - VTC(R11G11B10_FLOAT, 11_11_10_FLOAT, WZYX), - _T_(R9G9B9E5_FLOAT, 9_9_9_E5_FLOAT, WZYX), - - _TC(Z24X8_UNORM, Z24_UNORM_S8_UINT, WZYX), - _TC(X24S8_UINT, 8_8_8_8_UINT, WZYX), - _TC(Z24_UNORM_S8_UINT, Z24_UNORM_S8_UINT, WZYX), - _TC(Z32_FLOAT, 32_FLOAT, WZYX), - _TC(Z32_FLOAT_S8X24_UINT, 32_FLOAT, WZYX), - _TC(X32_S8X24_UINT, 8_UINT, WZYX), - - /* special format for blits: */ - _TC(Z24_UNORM_S8_UINT_AS_R8G8B8A8, Z24_UNORM_S8_UINT_AS_R8G8B8A8, WZYX), - - /* 48-bit */ - V__(R16G16B16_UNORM, 16_16_16_UNORM, WZYX), - V__(R16G16B16_SNORM, 16_16_16_SNORM, WZYX), - V__(R16G16B16_UINT, 16_16_16_UINT, WZYX), - V__(R16G16B16_SINT, 16_16_16_SINT, WZYX), - V__(R16G16B16_USCALED, 16_16_16_UINT, WZYX), - V__(R16G16B16_SSCALED, 16_16_16_SINT, WZYX), - V__(R16G16B16_FLOAT, 16_16_16_FLOAT, WZYX), - - /* 64-bit */ - VTC(R16G16B16A16_UNORM, 16_16_16_16_UNORM, WZYX), - VTC(R16G16B16X16_UNORM, 16_16_16_16_UNORM, WZYX), - VTC(R16G16B16A16_SNORM, 16_16_16_16_SNORM, WZYX), - VTC(R16G16B16X16_SNORM, 16_16_16_16_SNORM, WZYX), - VTC(R16G16B16A16_UINT, 16_16_16_16_UINT, WZYX), - VTC(R16G16B16X16_UINT, 16_16_16_16_UINT, WZYX), - VTC(R16G16B16A16_SINT, 16_16_16_16_SINT, WZYX), - VTC(R16G16B16X16_SINT, 16_16_16_16_SINT, WZYX), - VT_(R16G16B16A16_USCALED, 16_16_16_16_UINT, WZYX), - VT_(R16G16B16A16_SSCALED, 16_16_16_16_SINT, WZYX), - VTC(R16G16B16A16_FLOAT, 16_16_16_16_FLOAT, WZYX), - VTC(R16G16B16X16_FLOAT, 16_16_16_16_FLOAT, WZYX), - - V__(R32G32_UNORM, 32_32_UNORM, WZYX), - V__(R32G32_SNORM, 32_32_SNORM, WZYX), - VTC(R32G32_UINT, 32_32_UINT, WZYX), - VTC(R32G32_SINT, 32_32_SINT, WZYX), - V__(R32G32_USCALED, 32_32_UINT, WZYX), - V__(R32G32_SSCALED, 32_32_SINT, WZYX), - VTC(R32G32_FLOAT, 32_32_FLOAT, WZYX), - V__(R32G32_FIXED, 32_32_FIXED, WZYX), - - _T_(L32A32_UINT, 32_32_UINT, WZYX), - _T_(L32A32_SINT, 32_32_SINT, WZYX), - - /* 96-bit */ - 
V__(R32G32B32_UNORM, 32_32_32_UNORM, WZYX), - V__(R32G32B32_SNORM, 32_32_32_SNORM, WZYX), - VT_(R32G32B32_UINT, 32_32_32_UINT, WZYX), - VT_(R32G32B32_SINT, 32_32_32_SINT, WZYX), - V__(R32G32B32_USCALED, 32_32_32_UINT, WZYX), - V__(R32G32B32_SSCALED, 32_32_32_SINT, WZYX), - VT_(R32G32B32_FLOAT, 32_32_32_FLOAT, WZYX), - V__(R32G32B32_FIXED, 32_32_32_FIXED, WZYX), - - /* 128-bit */ - V__(R32G32B32A32_UNORM, 32_32_32_32_UNORM, WZYX), - V__(R32G32B32A32_SNORM, 32_32_32_32_SNORM, WZYX), - VTC(R32G32B32A32_UINT, 32_32_32_32_UINT, WZYX), - _TC(R32G32B32X32_UINT, 32_32_32_32_UINT, WZYX), - VTC(R32G32B32A32_SINT, 32_32_32_32_SINT, WZYX), - _TC(R32G32B32X32_SINT, 32_32_32_32_SINT, WZYX), - V__(R32G32B32A32_USCALED, 32_32_32_32_UINT, WZYX), - V__(R32G32B32A32_SSCALED, 32_32_32_32_SINT, WZYX), - VTC(R32G32B32A32_FLOAT, 32_32_32_32_FLOAT, WZYX), - _TC(R32G32B32X32_FLOAT, 32_32_32_32_FLOAT, WZYX), - V__(R32G32B32A32_FIXED, 32_32_32_32_FIXED, WZYX), - - /* compressed */ - _T_(ETC1_RGB8, ETC1, WZYX), - _T_(ETC2_RGB8, ETC2_RGB8, WZYX), - _T_(ETC2_SRGB8, ETC2_RGB8, WZYX), - _T_(ETC2_RGB8A1, ETC2_RGB8A1, WZYX), - _T_(ETC2_SRGB8A1, ETC2_RGB8A1, WZYX), - _T_(ETC2_RGBA8, ETC2_RGBA8, WZYX), - _T_(ETC2_SRGBA8, ETC2_RGBA8, WZYX), - _T_(ETC2_R11_UNORM, ETC2_R11_UNORM, WZYX), - _T_(ETC2_R11_SNORM, ETC2_R11_SNORM, WZYX), - _T_(ETC2_RG11_UNORM, ETC2_RG11_UNORM, WZYX), - _T_(ETC2_RG11_SNORM, ETC2_RG11_SNORM, WZYX), - - _T_(DXT1_RGB, DXT1, WZYX), - _T_(DXT1_SRGB, DXT1, WZYX), - _T_(DXT1_RGBA, DXT1, WZYX), - _T_(DXT1_SRGBA, DXT1, WZYX), - _T_(DXT3_RGBA, DXT3, WZYX), - _T_(DXT3_SRGBA, DXT3, WZYX), - _T_(DXT5_RGBA, DXT5, WZYX), - _T_(DXT5_SRGBA, DXT5, WZYX), - - _T_(BPTC_RGBA_UNORM, BPTC, WZYX), - _T_(BPTC_SRGBA, BPTC, WZYX), - _T_(BPTC_RGB_FLOAT, BPTC_FLOAT, WZYX), - _T_(BPTC_RGB_UFLOAT, BPTC_UFLOAT, WZYX), - - _T_(RGTC1_UNORM, RGTC1_UNORM, WZYX), - _T_(RGTC1_SNORM, RGTC1_SNORM, WZYX), - _T_(RGTC2_UNORM, RGTC2_UNORM, WZYX), - _T_(RGTC2_SNORM, RGTC2_SNORM, WZYX), - _T_(LATC1_UNORM, RGTC1_UNORM, WZYX), - _T_(LATC1_SNORM, RGTC1_SNORM, WZYX), - _T_(LATC2_UNORM, RGTC2_UNORM, WZYX), - _T_(LATC2_SNORM, RGTC2_SNORM, WZYX), - - _T_(ASTC_4x4, ASTC_4x4, WZYX), - _T_(ASTC_5x4, ASTC_5x4, WZYX), - _T_(ASTC_5x5, ASTC_5x5, WZYX), - _T_(ASTC_6x5, ASTC_6x5, WZYX), - _T_(ASTC_6x6, ASTC_6x6, WZYX), - _T_(ASTC_8x5, ASTC_8x5, WZYX), - _T_(ASTC_8x6, ASTC_8x6, WZYX), - _T_(ASTC_8x8, ASTC_8x8, WZYX), - _T_(ASTC_10x5, ASTC_10x5, WZYX), - _T_(ASTC_10x6, ASTC_10x6, WZYX), - _T_(ASTC_10x8, ASTC_10x8, WZYX), - _T_(ASTC_10x10, ASTC_10x10, WZYX), - _T_(ASTC_12x10, ASTC_12x10, WZYX), - _T_(ASTC_12x12, ASTC_12x12, WZYX), - - _T_(ASTC_4x4_SRGB, ASTC_4x4, WZYX), - _T_(ASTC_5x4_SRGB, ASTC_5x4, WZYX), - _T_(ASTC_5x5_SRGB, ASTC_5x5, WZYX), - _T_(ASTC_6x5_SRGB, ASTC_6x5, WZYX), - _T_(ASTC_6x6_SRGB, ASTC_6x6, WZYX), - _T_(ASTC_8x5_SRGB, ASTC_8x5, WZYX), - _T_(ASTC_8x6_SRGB, ASTC_8x6, WZYX), - _T_(ASTC_8x8_SRGB, ASTC_8x8, WZYX), - _T_(ASTC_10x5_SRGB, ASTC_10x5, WZYX), - _T_(ASTC_10x6_SRGB, ASTC_10x6, WZYX), - _T_(ASTC_10x8_SRGB, ASTC_10x8, WZYX), - _T_(ASTC_10x10_SRGB, ASTC_10x10, WZYX), - _T_(ASTC_12x10_SRGB, ASTC_12x10, WZYX), - _T_(ASTC_12x12_SRGB, ASTC_12x12, WZYX), - - _T_(R8_G8B8_420_UNORM, R8_G8B8_2PLANE_420_UNORM, WZYX), -}; -/* clang-format on */ - -/* convert pipe format to vertex buffer format: */ -enum a6xx_format -fd6_pipe2vtx(enum pipe_format format) -{ - if (!formats[format].present) - return FMT6_NONE; - return formats[format].vtx; -} - -/* convert pipe format to texture sampler format: */ -enum a6xx_format -fd6_pipe2tex(enum pipe_format format) -{ 
- if (!formats[format].present) - return FMT6_NONE; - return formats[format].tex; -} - -/* convert pipe format to MRT / copydest format used for render-target: */ -enum a6xx_format -fd6_pipe2color(enum pipe_format format) -{ - if (!formats[format].present) - return FMT6_NONE; - return formats[format].rb; -} - -enum a3xx_color_swap -fd6_pipe2swap(enum pipe_format format) -{ - if (!formats[format].present) - return WZYX; - return formats[format].swap; -} - -enum a6xx_depth_format -fd6_pipe2depth(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return DEPTH6_16; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return DEPTH6_24_8; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return DEPTH6_32; - default: - return ~0; - } -} - enum a6xx_tex_swiz fd6_pipe2swiz(unsigned swiz) { @@ -410,7 +52,7 @@ fd6_pipe2swiz(unsigned swiz) } void -fd6_tex_swiz(enum pipe_format format, unsigned char *swiz, unsigned swizzle_r, +fd6_tex_swiz(enum pipe_format format, enum a6xx_tile_mode tile_mode, unsigned char *swiz, unsigned swizzle_r, unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a) { const struct util_format_description *desc = util_format_description(format); @@ -423,7 +65,10 @@ fd6_tex_swiz(enum pipe_format format, unsigned char *swiz, unsigned swizzle_r, const unsigned char stencil_swiz[4] = {PIPE_SWIZZLE_W, PIPE_SWIZZLE_W, PIPE_SWIZZLE_W, PIPE_SWIZZLE_W}; util_format_compose_swizzles(stencil_swiz, uswiz, swiz); - } else if (fd6_pipe2swap(format) != WZYX) { + } else if (format == PIPE_FORMAT_R8G8_R8B8_UNORM || format == PIPE_FORMAT_G8R8_B8R8_UNORM) { + unsigned char fswiz[4] = {PIPE_SWIZZLE_Z, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_1}; + util_format_compose_swizzles(fswiz, uswiz, swiz); + } else if (fd6_texture_swap(format, TILE6_LINEAR) != WZYX || format == PIPE_FORMAT_A1R5G5B5_UNORM) { /* Formats with a non-pass-through swap are permutations of RGBA * formats. We program the permutation using the swap and don't * need to compose the format swizzle with the user swizzle. 
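/* [editor's note] Illustration of the branch above, assuming WZYX is the
 * identity swap: B8G8R8A8 carries a WXYZ swap in the format table, so the
 * hardware already reorders the channels to RGBA and the user swizzle can
 * be programmed unmodified. A hypothetical call for a pass-through view,
 * using the new fd6_tex_swiz() signature from this patch:
 */
static void
example_tex_swiz(void)
{
   unsigned char swiz[4];
   fd6_tex_swiz(PIPE_FORMAT_B8G8R8A8_UNORM, TILE6_LINEAR, swiz,
                PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z,
                PIPE_SWIZZLE_W);
   /* expected: swiz stays {X, Y, Z, W}; no format permutation composed in */
}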
@@ -446,11 +91,11 @@ fd6_tex_const_0(struct pipe_resource *prsc, unsigned level, struct fd_resource *rsc = fd_resource(prsc); unsigned char swiz[4]; - fd6_tex_swiz(format, swiz, swizzle_r, swizzle_g, swizzle_b, swizzle_a); + fd6_tex_swiz(format, rsc->layout.tile_mode, swiz, swizzle_r, swizzle_g, swizzle_b, swizzle_a); - return A6XX_TEX_CONST_0_FMT(fd6_pipe2tex(format)) | + return A6XX_TEX_CONST_0_FMT(fd6_texture_format(format, rsc->layout.tile_mode)) | A6XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) | - A6XX_TEX_CONST_0_SWAP(fd6_resource_swap(rsc, format)) | + A6XX_TEX_CONST_0_SWAP(fd6_texture_swap(format, rsc->layout.tile_mode)) | A6XX_TEX_CONST_0_TILE_MODE(fd_resource_tile_mode(prsc, level)) | COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) | A6XX_TEX_CONST_0_SWIZ_X(fd6_pipe2swiz(swiz[0])) | diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.h b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.h index b9e2f55486..667ce729cd 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_format.h @@ -28,19 +28,15 @@ #ifndef FD6_UTIL_H_ #define FD6_UTIL_H_ +#include "fdl/fd6_format_table.h" #include "freedreno_resource.h" #include "freedreno_util.h" #include "a6xx.xml.h" -enum a6xx_format fd6_pipe2vtx(enum pipe_format format); -enum a6xx_format fd6_pipe2tex(enum pipe_format format); -enum a6xx_format fd6_pipe2color(enum pipe_format format); -enum a3xx_color_swap fd6_pipe2swap(enum pipe_format format); -enum a6xx_depth_format fd6_pipe2depth(enum pipe_format format); enum a6xx_tex_swiz fd6_pipe2swiz(unsigned swiz); -void fd6_tex_swiz(enum pipe_format format, unsigned char *swiz, +void fd6_tex_swiz(enum pipe_format format, enum a6xx_tile_mode tile_mode, unsigned char *swiz, unsigned swizzle_r, unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); @@ -49,10 +45,4 @@ uint32_t fd6_tex_const_0(struct pipe_resource *prsc, unsigned level, unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); -static inline uint32_t -fd6_resource_swap(struct fd_resource *rsc, enum pipe_format format) -{ - return rsc->layout.tile_mode ? WZYX : fd6_pipe2swap(format); -} - #endif /* FD6_UTIL_H_ */ diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c index 4c641b99bd..4910739894 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c @@ -81,7 +81,6 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, unsigned max_layer_index = 0; for (i = 0; i < pfb->nr_cbufs; i++) { - enum a6xx_format format = 0; enum a3xx_color_swap swap = WZYX; bool sint = false, uint = false; struct fd_resource *rsc = NULL; @@ -89,7 +88,6 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, uint32_t stride = 0; uint32_t array_stride = 0; uint32_t offset; - uint32_t tile_mode; if (!pfb->cbufs[i]) continue; @@ -102,7 +100,8 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, uint32_t base = gmem ? 
gmem->cbuf_base[i] : 0; slice = fd_resource_slice(rsc, psurf->u.tex.level); - format = fd6_pipe2color(pformat); + uint32_t tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); + enum a6xx_format format = fd6_color_format(pformat, tile_mode); sint = util_format_is_pure_sint(pformat); uint = util_format_is_pure_uint(pformat); @@ -114,9 +113,8 @@ emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, stride = fd_resource_pitch(rsc, psurf->u.tex.level); array_stride = fd_resource_layer_stride(rsc, psurf->u.tex.level); - swap = fd6_resource_swap(rsc, pformat); + swap = fd6_color_swap(pformat, rsc->layout.tile_mode); - tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); max_layer_index = psurf->u.tex.last_layer - psurf->u.tex.first_layer; debug_assert((offset + slice->size0) <= fd_bo_size(rsc->bo)); @@ -710,9 +708,9 @@ emit_binning_pass(struct fd_batch *batch) assert_dt A6XX_SP_TP_WINDOW_OFFSET_X(0) | A6XX_SP_TP_WINDOW_OFFSET_Y(0)); /* emit IB to binning drawcmds: */ - trace_start_binning_ib(&batch->trace); + trace_start_binning_ib(&batch->trace, ring); fd6_emit_ib(ring, batch->draw); - trace_end_binning_ib(&batch->trace); + trace_end_binning_ib(&batch->trace, ring); fd_reset_wfi(batch); @@ -732,9 +730,9 @@ emit_binning_pass(struct fd_batch *batch) assert_dt OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); - trace_start_vsc_overflow_test(&batch->trace); + trace_start_vsc_overflow_test(&batch->trace, batch->gmem); emit_vsc_overflow_test(batch); - trace_end_vsc_overflow_test(&batch->trace); + trace_end_vsc_overflow_test(&batch->trace, batch->gmem); OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); OUT_RING(ring, 0x0); @@ -794,9 +792,9 @@ fd6_emit_tile_init(struct fd_batch *batch) assert_dt fd6_emit_lrz_flush(ring); if (batch->prologue) { - trace_start_prologue(&batch->trace); + trace_start_prologue(&batch->trace, ring); fd6_emit_ib(ring, batch->prologue); - trace_end_prologue(&batch->trace); + trace_end_prologue(&batch->trace, ring); } fd6_cache_inv(batch, ring); @@ -987,12 +985,12 @@ emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, uint32_t base, debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - enum a6xx_format format = fd6_pipe2color(pfmt); + uint32_t tile_mode = fd_resource_tile_mode(&rsc->b.b, psurf->u.tex.level); + enum a6xx_format format = fd6_color_format(pfmt, tile_mode); uint32_t stride = fd_resource_pitch(rsc, psurf->u.tex.level); uint32_t size = fd_resource_slice(rsc, psurf->u.tex.level)->size0; - enum a3xx_color_swap swap = fd6_resource_swap(rsc, pfmt); + enum a3xx_color_swap swap = fd6_color_swap(pfmt, rsc->layout.tile_mode); enum a3xx_msaa_samples samples = fd_msaa_samples(rsc->b.b.nr_samples); - uint32_t tile_mode = fd_resource_tile_mode(&rsc->b.b, psurf->u.tex.level); OUT_REG(ring, A6XX_RB_BLIT_DST_INFO(.tile_mode = tile_mode, .samples = samples, @@ -1052,7 +1050,7 @@ emit_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) // XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP?? 
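/* [editor's note] What the switch just below is doing, sketched for the
 * identity case visible in the hunk and, hypothetically, for a BGRA-style
 * target; the WXYZ body falls outside the hunk, so treat it as
 * illustrative rather than authoritative. WZYX is the identity (RGBA)
 * ordering; a WXYZ target exchanges the R and B words of the clear color:
 */
static void
example_clear_swap(union pipe_color_union *swapped,
                   const union pipe_color_union *color,
                   enum a3xx_color_swap swap)
{
   switch (swap) {
   case WZYX: /* RGBA: channels pass through */
      swapped->ui[0] = color->ui[0];
      swapped->ui[1] = color->ui[1];
      swapped->ui[2] = color->ui[2];
      swapped->ui[3] = color->ui[3];
      break;
   case WXYZ: /* BGRA: exchange R and B */
      swapped->ui[0] = color->ui[2];
      swapped->ui[1] = color->ui[1];
      swapped->ui[2] = color->ui[0];
      swapped->ui[3] = color->ui[3];
      break;
   default:
      break;
   }
}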
union pipe_color_union swapped; - switch (fd6_pipe2swap(pfmt)) { + switch (fd6_color_swap(pfmt, TILE6_LINEAR)) { case WZYX: swapped.ui[0] = color->ui[0]; swapped.ui[1] = color->ui[1]; @@ -1085,7 +1083,7 @@ emit_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_pipe2color(pfmt))); + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR))); OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); OUT_RING(ring, @@ -1137,7 +1135,7 @@ emit_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_pipe2color(pfmt))); + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_color_format(pfmt, TILE6_LINEAR))); OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); OUT_RING(ring, A6XX_RB_BLIT_INFO_GMEM | @@ -1249,13 +1247,13 @@ fd6_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) if (!batch->tile_setup) return; - trace_start_clear_restore(&batch->trace, batch->fast_cleared); + trace_start_clear_restore(&batch->trace, batch->gmem, batch->fast_cleared); if (batch->fast_cleared || !use_hw_binning(batch)) { fd6_emit_ib(batch->gmem, batch->tile_setup); } else { emit_conditional_ib(batch, tile, batch->tile_setup); } - trace_end_clear_restore(&batch->trace); + trace_end_clear_restore(&batch->trace, batch->gmem); } static bool @@ -1427,13 +1425,13 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE)); emit_marker6(ring, 7); - trace_start_resolve(&batch->trace); + trace_start_resolve(&batch->trace, batch->gmem); if (batch->fast_cleared || !use_hw_binning(batch)) { fd6_emit_ib(batch->gmem, batch->tile_fini); } else { emit_conditional_ib(batch, tile, batch->tile_fini); } - trace_end_resolve(&batch->trace); + trace_end_resolve(&batch->trace, batch->gmem); } static void @@ -1466,7 +1464,7 @@ emit_sysmem_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt if (!buffers) return; - trace_start_clear_restore(&batch->trace, buffers); + trace_start_clear_restore(&batch->trace, ring, buffers); if (buffers & PIPE_CLEAR_COLOR) { for (int i = 0; i < pfb->nr_cbufs; i++) { @@ -1512,8 +1510,9 @@ emit_sysmem_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt } fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd_wfi(batch, ring); - trace_end_clear_restore(&batch->trace); + trace_end_clear_restore(&batch->trace, ring); } static void @@ -1546,11 +1545,11 @@ fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt if (batch->prologue) { if (!batch->nondraw) { - trace_start_prologue(&batch->trace); + trace_start_prologue(&batch->trace, ring); } fd6_emit_ib(ring, batch->prologue); if (!batch->nondraw) { - trace_end_prologue(&batch->trace); + trace_end_prologue(&batch->trace, ring); } } @@ -1609,7 +1608,7 @@ fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt } static void -fd6_emit_sysmem_fini(struct fd_batch *batch) +fd6_emit_sysmem_fini(struct fd_batch *batch) assert_dt { struct fd_ringbuffer *ring = batch->gmem; @@ -1625,6 +1624,7 @@ fd6_emit_sysmem_fini(struct fd_batch *batch) fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); + fd_wfi(batch, ring); } void diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_image.c b/mesa 3D 
driver/src/gallium/drivers/freedreno/a6xx/fd6_image.c index cc8ee0a362..61c6ccbee7 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_image.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_image.c @@ -135,7 +135,6 @@ translate_image(struct fd6_image *img, const struct pipe_image_view *pimg) static void translate_buf(struct fd6_image *img, const struct pipe_shader_buffer *pimg) { - enum pipe_format format = PIPE_FORMAT_R32_UINT; struct pipe_resource *prsc = pimg->buffer; struct fd_resource *rsc = fd_resource(prsc); @@ -144,6 +143,11 @@ translate_buf(struct fd6_image *img, const struct pipe_shader_buffer *pimg) return; } + const struct fd_dev_info *dev_info = fd_screen(prsc->screen)->info; + enum pipe_format format = dev_info->a6xx.storage_16bit + ? PIPE_FORMAT_R16_UINT + : PIPE_FORMAT_R32_UINT; + img->prsc = prsc; img->pfmt = format; img->type = fd6_tex_type(prsc->target); @@ -161,7 +165,7 @@ translate_buf(struct fd6_image *img, const struct pipe_shader_buffer *pimg) /* size is encoded with low 15b in WIDTH and high bits in HEIGHT, * in units of elements: */ - unsigned sz = pimg->buffer_size / 4; + unsigned sz = pimg->buffer_size / (dev_info->a6xx.storage_16bit ? 2 : 4); img->width = sz & MASK(15); img->height = sz >> 15; img->depth = 0; @@ -256,7 +260,7 @@ emit_image_ssbo(struct fd_ringbuffer *ring, struct fd6_image *img) enum a6xx_tile_mode tile_mode = fd_resource_tile_mode(img->prsc, img->level); bool ubwc_enabled = fd_resource_ubwc_enabled(rsc, img->level); - OUT_RING(ring, A6XX_IBO_0_FMT(fd6_pipe2tex(img->pfmt)) | + OUT_RING(ring, A6XX_IBO_0_FMT(fd6_texture_format(img->pfmt, rsc->layout.tile_mode)) | A6XX_IBO_0_TILE_MODE(tile_mode)); OUT_RING(ring, A6XX_IBO_1_WIDTH(img->width) | A6XX_IBO_1_HEIGHT(img->height)); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_program.c index cfcd72f8de..621cce2bb8 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -152,6 +152,35 @@ fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_RELOC(ring, so->bo, 0, 0, 0); } +/** + * Build a pre-baked state-obj to disable SO, so that we aren't dynamically + * building this at draw time whenever we transition from SO enabled->disabled + */ +static void +setup_stream_out_disable(struct fd_context *ctx) +{ + unsigned sizedw = 4; + + if (ctx->screen->info->a6xx.tess_use_shared) + sizedw += 2; + + struct fd_ringbuffer *ring = + fd_ringbuffer_new_object(ctx->pipe, (1 + sizedw) * 4); + + OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); + OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); + OUT_RING(ring, 0); + OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); + OUT_RING(ring, 0); + + if (ctx->screen->info->a6xx.tess_use_shared) { + OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL); + OUT_RING(ring, 0); + } + + fd6_context(ctx)->streamout_disable_stateobj = ring; +} + static void setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, const struct ir3_shader_variant *v, @@ -203,10 +232,14 @@ setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, } } - struct fd_ringbuffer *ring = - fd_ringbuffer_new_object(ctx->pipe, (13 + (2 * prog_count)) * 4); + unsigned sizedw = 12 + (2 * prog_count); + if (ctx->screen->info->a6xx.tess_use_shared) + sizedw += 2; - OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * prog_count)); + struct fd_ringbuffer *ring = + fd_ringbuffer_new_object(ctx->pipe, (1 + 
sizedw) * 4); + + OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, sizedw); OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); OUT_RING(ring, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(0x1) | @@ -228,6 +261,13 @@ setup_stream_out(struct fd_context *ctx, struct fd6_program_state *state, OUT_RING(ring, REG_A6XX_VPC_SO_PROG); OUT_RING(ring, prog[i]); } + if (ctx->screen->info->a6xx.tess_use_shared) { + /* Possibly not tess_use_shared related, but the combination of + * tess + xfb fails some tests if we don't emit this. + */ + OUT_RING(ring, REG_A6XX_PC_SO_STREAM_CNTL); + OUT_RING(ring, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE); + } state->streamout_stateobj = ring; } @@ -353,10 +393,10 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, layer_regid = ir3_find_output_regid(vs, VARYING_SLOT_LAYER); vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); - if (gs) - vs_primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); - else if (hs) + if (hs) vs_primitive_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID); + else if (gs) + vs_primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); else vs_primitive_regid = regid(63, 0); @@ -557,6 +597,9 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, */ if (do_streamout && !binning_pass) { setup_stream_out(ctx, state, last_shader, &l); + + if (!fd6_context(ctx)->streamout_disable_stateobj) + setup_stream_out_disable(ctx); } debug_assert(l.cnt <= 32); @@ -628,29 +671,45 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); OUT_RING(ring, hs_info->tess.tcs_vertices_out); - /* Total attribute slots in HS incoming patch. */ - OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); - OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4); + if (ctx->screen->info->a6xx.tess_use_shared) { + unsigned hs_input_size = 6 + (3 * (vs->output_size - 1)); + unsigned wave_input_size = + MIN2(64, DIV_ROUND_UP(hs_input_size * 4, + hs_info->tess.tcs_vertices_out)); - const uint32_t wavesize = 64; - const uint32_t max_wave_input_size = 64; - const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, hs_input_size); - /* note: if HS is really just the VS extended, then this - * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) - * however that doesn't match the blob, and fails some dEQP tests. - */ - uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; - uint32_t max_prims_per_wave = max_wave_input_size * wavesize / - (vs->output_size * patch_control_points); - prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + } else { + uint32_t hs_input_size = + hs_info->tess.tcs_vertices_out * vs->output_size / 4; - uint32_t total_size = - vs->output_size * patch_control_points * prims_per_wave; - uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); + /* Total attribute slots in HS incoming patch. 
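/* [editor's note] A worked example of the tess_use_shared sizing above,
 * under assumed values (8 vec4 VS output slots, triangle patches); MIN2
 * and DIV_ROUND_UP mirror the usual util/macros.h definitions:
 */
#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static void
example_hs_sizing(void)
{
   const unsigned vs_output_size = 8;   /* assumption for the example */
   const unsigned tcs_vertices_out = 3; /* assumption for the example */
   unsigned hs_input_size = 6 + 3 * (vs_output_size - 1);           /* = 27 */
   unsigned wave_input_size =
      MIN2(64, DIV_ROUND_UP(hs_input_size * 4, tcs_vertices_out));  /* = 36 */
   (void)wave_input_size;
}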
*/ + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, hs_input_size); - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, wave_input_size); + const uint32_t wavesize = 64; + const uint32_t max_wave_input_size = 64; + const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; + + /* note: if HS is really just the VS extended, then this + * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) + * however that doesn't match the blob, and fails some dEQP tests. + */ + uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; + uint32_t max_prims_per_wave = max_wave_input_size * wavesize / + (vs->output_size * patch_control_points); + prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); + + uint32_t total_size = + vs->output_size * patch_control_points * prims_per_wave; + uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); + + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + } shader_info *ds_info = &ds->shader->nir->info; OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); @@ -775,10 +834,6 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, else need_size = true; } - if (VALIDREG(ij_regid[IJ_LINEAR_PIXEL])) - need_size = true; - - /* XXX: enable bits for linear centroid and linear sample bary */ OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1); OUT_RING( @@ -787,8 +842,12 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | - COND(need_size, A6XX_GRAS_CNTL_SIZE) | - COND(need_size_persamp, A6XX_GRAS_CNTL_SIZE_PERSAMP) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], + A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | + COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | + COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); @@ -801,9 +860,15 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | - COND(need_size, A6XX_RB_RENDER_CONTROL0_SIZE) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], + A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], + A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_SAMPLE], + A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | + COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | - COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) | + COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | COND(fs->fragcoord_compmask != 0, A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c index b08cbae32b..a1142f7419 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c @@ -39,7 +39,7 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, const struct pipe_rasterizer_state *cso, bool primitive_restart) { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 
* 4); + struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 26 * 4); float psize_min, psize_max; if (cso->point_size_per_vertex) { @@ -61,7 +61,7 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, OUT_REG(ring, A6XX_GRAS_SU_CNTL(.linehalfwidth = cso->line_width / 2.0, .poly_offset = cso->offset_tri, - .msaa_enable = cso->multisample, + .line_mode = cso->multisample ? RECTANGULAR : BRESENHAM, .cull_front = cso->cull_face & PIPE_FACE_FRONT, .cull_back = cso->cull_face & PIPE_FACE_BACK, .front_cw = !cso->front_ccw, )); @@ -94,6 +94,13 @@ __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, OUT_REG(ring, A6XX_VPC_POLYGON_MODE(mode)); OUT_REG(ring, A6XX_PC_POLYGON_MODE(mode)); + if (ctx->screen->info->a6xx.has_shading_rate) { + OUT_REG(ring, A6XX_RB_UNKNOWN_8A00()); + OUT_REG(ring, A6XX_RB_UNKNOWN_8A10()); + OUT_REG(ring, A6XX_RB_UNKNOWN_8A20()); + OUT_REG(ring, A6XX_RB_UNKNOWN_8A30()); + } + return ring; } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_resource.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_resource.c index 1869776358..5a9cf1ae6f 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_resource.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_resource.c @@ -58,7 +58,7 @@ ok_ubwc_format(struct pipe_screen *pscreen, enum pipe_format pfmt) break; } - switch (fd6_pipe2color(pfmt)) { + switch (fd6_color_format(pfmt, TILE6_LINEAR)) { case FMT6_10_10_10_2_UINT: case FMT6_10_10_10_2_UNORM_DEST: case FMT6_11_11_10_FLOAT: @@ -76,6 +76,7 @@ ok_ubwc_format(struct pipe_screen *pscreen, enum pipe_format pfmt) case FMT6_32_32_SINT: case FMT6_32_32_UINT: case FMT6_5_6_5_UNORM: + case FMT6_5_5_5_1_UNORM: case FMT6_8_8_8_8_SINT: case FMT6_8_8_8_8_UINT: case FMT6_8_8_8_8_UNORM: diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_screen.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_screen.c index 1a7f373bb7..9a2a88c9d5 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_screen.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_screen.c @@ -76,12 +76,15 @@ fd6_screen_is_format_supported(struct pipe_screen *pscreen, return false; if ((usage & PIPE_BIND_VERTEX_BUFFER) && - (fd6_pipe2vtx(format) != FMT6_NONE)) { + (fd6_vertex_format(format) != FMT6_NONE)) { retval |= PIPE_BIND_VERTEX_BUFFER; } + bool has_color = fd6_color_format(format, TILE6_LINEAR) != FMT6_NONE; + bool has_tex = fd6_texture_format(format, TILE6_LINEAR) != FMT6_NONE; + if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) && - (fd6_pipe2tex(format) != FMT6_NONE) && + has_tex && (target == PIPE_BUFFER || util_format_get_blocksize(format) != 12)) { retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); } @@ -89,8 +92,7 @@ fd6_screen_is_format_supported(struct pipe_screen *pscreen, if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | PIPE_BIND_COMPUTE_RESOURCE)) && - (fd6_pipe2color(format) != FMT6_NONE) && - (fd6_pipe2tex(format) != FMT6_NONE)) { + has_color && has_tex) { retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | PIPE_BIND_COMPUTE_RESOURCE); @@ -102,8 +104,7 @@ fd6_screen_is_format_supported(struct pipe_screen *pscreen, } if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (fd6_pipe2depth(format) != (enum a6xx_depth_format) ~0) && - (fd6_pipe2tex(format) != FMT6_NONE)) { + (fd6_pipe2depth(format) != (enum a6xx_depth_format) ~0) && has_tex) { retval |= 
PIPE_BIND_DEPTH_STENCIL; } @@ -121,6 +122,24 @@ fd6_screen_is_format_supported(struct pipe_screen *pscreen, return retval == usage; } +/* clang-format off */ +static const uint8_t primtypes[] = { + [PIPE_PRIM_POINTS] = DI_PT_POINTLIST, + [PIPE_PRIM_LINES] = DI_PT_LINELIST, + [PIPE_PRIM_LINE_STRIP] = DI_PT_LINESTRIP, + [PIPE_PRIM_LINE_LOOP] = DI_PT_LINELOOP, + [PIPE_PRIM_TRIANGLES] = DI_PT_TRILIST, + [PIPE_PRIM_TRIANGLE_STRIP] = DI_PT_TRISTRIP, + [PIPE_PRIM_TRIANGLE_FAN] = DI_PT_TRIFAN, + [PIPE_PRIM_LINES_ADJACENCY] = DI_PT_LINE_ADJ, + [PIPE_PRIM_LINE_STRIP_ADJACENCY] = DI_PT_LINESTRIP_ADJ, + [PIPE_PRIM_TRIANGLES_ADJACENCY] = DI_PT_TRI_ADJ, + [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = DI_PT_TRISTRIP_ADJ, + [PIPE_PRIM_PATCHES] = DI_PT_PATCHES0, + [PIPE_PRIM_MAX] = DI_PT_RECTLIST, /* internal clear blits */ +}; +/* clang-format on */ + void fd6_screen_init(struct pipe_screen *pscreen) { @@ -128,6 +147,10 @@ fd6_screen_init(struct pipe_screen *pscreen) screen->max_rts = A6XX_MAX_RENDER_TARGETS; + screen->ccu_offset_bypass = screen->info->num_ccu * A6XX_CCU_DEPTH_SIZE; + screen->ccu_offset_gmem = (screen->gmemsize_bytes - + screen->info->num_ccu * A6XX_CCU_GMEM_COLOR_SIZE); + /* Currently only FB_READ forces GMEM path, mostly because we'd have to * deal with cmdstream patching otherwise.. */ @@ -143,4 +166,6 @@ fd6_screen_init(struct pipe_screen *pscreen) fd6_resource_screen_init(pscreen); fd6_emit_init_screen(pscreen); ir3_screen_init(pscreen); + + screen->primtypes = primtypes; } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_texture.c b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_texture.c index f1e89b9aa5..6b0a72490e 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_texture.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/a6xx/fd6_texture.c @@ -178,12 +178,13 @@ static void fd6_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) in_dt { struct fd_context *ctx = fd_context(pctx); fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, - views); + take_ownership, views); if (!views) return; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt b/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt index d6a44470b4..314743f06f 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt @@ -19,6 +19,7 @@ glx@glx_ext_import_context@make current- multi process,Fail glx@glx_ext_import_context@make current- single process,Fail glx@glx_ext_import_context@query context info,Fail shaders@glsl-bug-110796,Fail +shaders@glsl-predication-on-large-array,Fail shaders@point-vertex-id divisor,Crash shaders@point-vertex-id gl_instanceid,Crash shaders@point-vertex-id gl_instanceid divisor,Crash @@ -103,8 +104,8 @@ spec@arb_depth_buffer_float@fbo-depthstencil-gl_depth32f_stencil8-copypixels,Fai spec@arb_depth_buffer_float@fbo-stencil-gl_depth32f_stencil8-blit,Fail spec@arb_depth_buffer_float@fbo-stencil-gl_depth32f_stencil8-copypixels,Fail spec@arb_direct_state_access@gettextureimage-formats,Crash -spec@arb_draw_indirect@gl_vertexid used with gldrawarraysindirect,Crash -spec@arb_draw_indirect@gl_vertexid used with gldrawelementsindirect,Crash +spec@arb_draw_indirect@gl_vertexid used with gldrawarraysindirect,Fail 
+spec@arb_draw_indirect@gl_vertexid used with gldrawelementsindirect,Fail spec@arb_framebuffer_no_attachments@arb_framebuffer_no_attachments-atomic,Fail spec@arb_framebuffer_no_attachments@arb_framebuffer_no_attachments-atomic@MS4,Fail spec@arb_framebuffer_no_attachments@arb_framebuffer_no_attachments-query,Fail @@ -120,7 +121,6 @@ spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl spec@arb_framebuffer_object@framebuffer-blit-levels draw stencil,Fail spec@arb_framebuffer_object@framebuffer-blit-levels read stencil,Fail spec@arb_map_buffer_alignment@arb_map_buffer_alignment-map-invalidate-range,Fail -spec@arb_multi_draw_indirect@gl-3.0-multidrawarrays-vertexid -indirect,Crash spec@arb_occlusion_query@occlusion_query_order,Fail spec@arb_point_sprite@arb_point_sprite-interactions 1.0,Fail spec@arb_separate_shader_objects@400 combinations by location,Fail diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt b/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt index 4e85532d40..4b702d0218 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt @@ -18,6 +18,9 @@ glx@glx-visuals-depth -pixmap,Crash glx@glx-visuals-stencil -pixmap,Crash shaders@glsl-fs-fogscale,Fail shaders@glsl-fs-fogscale@gs-out and fs,Fail + +# "MESA: error: unknown vertex shader output name: VARYING_SLOT_EDGE +# gl-2.0-edgeflag: ../src/freedreno/ir3/ir3_context.c:411: ir3_context_error: Assertion `!""' failed." shaders@point-vertex-id divisor,Crash shaders@point-vertex-id gl_instanceid,Crash shaders@point-vertex-id gl_instanceid divisor,Crash @@ -25,10 +28,15 @@ shaders@point-vertex-id gl_vertexid,Crash shaders@point-vertex-id gl_vertexid divisor,Crash shaders@point-vertex-id gl_vertexid gl_instanceid,Crash shaders@point-vertex-id gl_vertexid gl_instanceid divisor,Crash + +# "nir_src_comp_as_uint: Assertion `nir_src_is_const(src)' failed." looking up image spec@arb_arrays_of_arrays@execution@image_store@basic-imagestore-mixed-const-non-const-uniform-index2,Crash spec@arb_arrays_of_arrays@execution@image_store@basic-imagestore-mixed-const-non-const-uniform-index,Crash spec@arb_arrays_of_arrays@execution@image_store@basic-imagestore-non-const-uniform-index,Crash + +# "shader_runner: ../src/freedreno/ir3/ir3_compiler_nir.c:3928: collect_tex_prefetches: Assertion `fetch->samp_id < 0xf' failed." spec@arb_arrays_of_arrays@execution@sampler@fs-struct-const-index-sampler-const-index,Crash + spec@arb_compute_shader@execution@border-color,Fail spec@arb_depth_buffer_float@fbo-clear-formats stencil,Fail spec@arb_depth_buffer_float@fbo-clear-formats stencil@GL_DEPTH32F_STENCIL8,Fail @@ -49,7 +57,10 @@ spec@arb_depth_texture@texwrap formats bordercolor-swizzled,Fail spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT16- swizzled- border color only,Fail spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT24- swizzled- border color only,Fail spec@arb_depth_texture@texwrap formats bordercolor-swizzled@GL_DEPTH_COMPONENT32- swizzled- border color only,Fail + +# "arb_direct_state_access-gettextureimage-formats: ../src/mesa/main/texstore.c:413: _mesa_texstore_s8_z24: Assertion `srcFormat == GL_DEPTH_STENCIL_EXT || srcFormat == GL_DEPTH_COMPONENT || srcFormat == GL_STENCIL_INDEX' failed." 
spec@arb_direct_state_access@gettextureimage-formats,Crash + spec@arb_direct_state_access@transformfeedback-bufferbase,Fail spec@arb_direct_state_access@transformfeedback-bufferbase@general test,Fail spec@arb_direct_state_access@transformfeedback-bufferrange,Fail @@ -88,7 +99,10 @@ spec@arb_sample_shading@samplemask 4 all@sample mask_in_one,Fail spec@arb_sample_shading@samplemask 4,Fail spec@arb_sample_shading@samplemask 4@noms mask_in_one,Fail spec@arb_sample_shading@samplemask 4@sample mask_in_one,Fail + +# "nir_src_comp_as_uint: Assertion `nir_src_is_const(src)' failed." looking up image spec@arb_shader_image_load_store@indexing,Crash + spec@arb_shader_storage_buffer_object@array-ssbo-auto-binding,Fail spec@arb_shader_storage_buffer_object@linker@instance-matching-shader-storage-blocks-member-array-size-mismatch,Fail spec@arb_tessellation_shader@execution@gs-primitiveid-instanced,Fail @@ -183,7 +197,10 @@ spec@arb_texture_rg@texwrap formats-int bordercolor-swizzled@GL_RG32I- swizzled- spec@arb_texture_rg@texwrap formats-int bordercolor-swizzled@GL_RG32UI- swizzled- border color only,Fail spec@arb_texture_rg@texwrap formats-int bordercolor-swizzled@GL_RG8I- swizzled- border color only,Fail spec@arb_texture_rg@texwrap formats-int bordercolor-swizzled@GL_RG8UI- swizzled- border color only,Fail + +# segfault on fd6_emit_image_tex() of null image resource spec@arb_texture_view@bug-layers-image,Crash + spec@arb_texture_view@mipgen,Fail spec@arb_texture_view@rendering-layers-image,Fail spec@arb_texture_view@rendering-layers-image@layers rendering of image1DArray,Fail @@ -198,7 +215,10 @@ spec@arb_transform_feedback3@gl_skipcomponents2,Fail spec@arb_transform_feedback3@gl_skipcomponents3,Fail spec@arb_transform_feedback3@gl_skipcomponents4,Fail spec@arb_vertex_type_2_10_10_10_rev@attrib-p-type-size-match,Fail + +# crash in dri3_fence_reset with a NULL buffer arg spec@egl 1.4@egl-copy-buffers,Crash + spec@egl 1.4@eglterminate then unbind context,Fail spec@egl_chromium_sync_control@conformance@eglGetSyncValuesCHROMIUM_msc_and_sbc_test,Fail spec@egl_chromium_sync_control@conformance,Fail @@ -247,8 +267,11 @@ spec@ext_packed_depth_stencil@texwrap formats bordercolor-swizzled@GL_DEPTH24_ST spec@ext_packed_float@query-rgba-signed-components,Fail spec@ext_packed_float@texwrap formats bordercolor-swizzled,Fail spec@ext_packed_float@texwrap formats bordercolor-swizzled@GL_R11F_G11F_B10F- swizzled- border color only,Fail + +# "../src/gallium/drivers/freedreno/a6xx/fd6_gmem.c:976:emit_blit: Assertion `psurf->u.tex.first_layer == psurf->u.tex.last_layer' failed." 
spec@ext_texture_array@fbo-depth-array depth-layered-clear,Crash spec@ext_texture_array@fbo-depth-array stencil-layered-clear,Crash + spec@ext_texture_compression_rgtc@texwrap formats bordercolor-swizzled,Fail spec@ext_texture_compression_rgtc@texwrap formats bordercolor-swizzled@GL_COMPRESSED_RED_RGTC1- swizzled- border color only,Fail spec@ext_texture_compression_rgtc@texwrap formats bordercolor-swizzled@GL_COMPRESSED_RG_RGTC2- swizzled- border color only,Fail @@ -311,7 +334,6 @@ spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SR spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- swizzled- border color only,Fail spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB- swizzled- border color only,Fail spec@ext_transform_feedback@geometry-shaders-basic,Fail -spec@ext_transform_feedback@immediate-reuse-index-buffer,Fail spec@ext_transform_feedback@intervening-read prims_generated,Fail spec@ext_transform_feedback@intervening-read prims_generated use_gs,Fail spec@ext_transform_feedback@overflow-edge-cases,Fail @@ -346,7 +368,6 @@ spec@glsl-1.50@execution@compatibility@vs-gs-ff-frag,Crash spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array-2,Crash spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array,Crash spec@glsl-1.50@execution@geometry@end-primitive 0,Fail -spec@glsl-1.50@execution@geometry@max-input-components,Fail spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop ffs,Fail spec@glsl-1.50@execution@geometry@primitive-id-restart gl_line_loop other,Fail spec@glsl-1.50@execution@geometry@primitive-id-restart gl_lines_adjacency ffs,Fail @@ -386,11 +407,7 @@ spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triang spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip other,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail -spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail -spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail -spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash -spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail spec@glsl-1.50@gs-max-output-components,Fail spec@intel_performance_query@intel_performance_query-issue_2235,Fail spec@khr_texture_compression_astc@array-gl@12x12 Block Dim,Fail @@ -435,21 +452,31 @@ spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp@sRGB decode full precision,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb@sRGB decode,Fail + +# "MESA: error: Unhandled ALU op: f2f64" spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x223344ff,Crash spec@nv_copy_depth_to_color@nv_copy_depth_to_color 0 0x76356278,Crash spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x223344ff,Crash spec@nv_copy_depth_to_color@nv_copy_depth_to_color 1 0x76356278,Crash spec@nv_copy_depth_to_color@nv_copy_depth_to_color,Crash -spec@nv_primitive_restart@primitive-restart-draw-mode-polygon,Fail -spec@nv_primitive_restart@primitive-restart-draw-mode-quads,Fail -spec@nv_primitive_restart@primitive-restart-draw-mode-quad_strip,Fail + +# segfault on fd6_emit_image_tex() of null 
image resource spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Crash + +# "MESA: error: unknown vertex shader output name: VARYING_SLOT_EDGE +# gl-2.0-edgeflag: ../src/freedreno/ir3/ir3_context.c:411: ir3_context_error: Assertion `!""' failed." spec@!opengl 1.0@gl-1.0-edgeflag,Crash spec@!opengl 1.0@gl-1.0-edgeflag-quads,Crash + +# "Failure with Polygon Stipple set to fail mode." spec@!opengl 1.0@gl-1.0-no-op-paths,Fail + spec@!opengl 1.0@gl-1.0-scissor-offscreen,Fail spec@!opengl 1.0@gl-1.0-spot-light,Fail + +# crash in tgsi exec doing st_feedback_draw_vbo(). spec@!opengl 1.0@rasterpos,Crash + spec@!opengl 1.1@linestipple@Factor 2x,Fail spec@!opengl 1.1@linestipple@Factor 3x,Fail spec@!opengl 1.1@linestipple,Fail @@ -488,8 +515,13 @@ spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA4- swizzled- border spec@!opengl 1.1@texwrap formats bordercolor-swizzled@GL_RGBA8- swizzled- border color only,Fail spec@!opengl 1.1@windowoverlap,Fail spec@!opengl 1.4@gl-1.4-polygon-offset,Fail + +# "MESA: error: unknown vertex shader output name: VARYING_SLOT_EDGE +# gl-2.0-edgeflag: ../src/freedreno/ir3/ir3_context.c:411: ir3_context_error: Assertion `!""' failed." spec@!opengl 2.0@gl-2.0-edgeflag,Crash spec@!opengl 2.0@gl-2.0-edgeflag-immediate,Crash + +# "../src/freedreno/ir3/ir3_shader.h:843:ir3_find_output: Assertion `0' failed." spec@!opengl 2.0@vertex-program-two-side back2,Crash spec@!opengl 2.0@vertex-program-two-side back back2,Crash spec@!opengl 2.0@vertex-program-two-side back,Crash @@ -520,6 +552,7 @@ spec@!opengl 2.0@vertex-program-two-side front back front2,Crash spec@!opengl 2.0@vertex-program-two-side front,Crash spec@!opengl 2.0@vertex-program-two-side front front2 back2,Crash spec@!opengl 2.0@vertex-program-two-side front front2,Crash + spec@!opengl 2.1@pbo,Fail spec@!opengl 2.1@pbo@test_polygon_stip,Fail spec@!opengl 2.1@polygon-stipple-fs,Fail @@ -527,4 +560,6 @@ spec@!opengl 3.0@clearbuffer-depth-cs-probe,Timeout spec@!opengl 3.0@clearbuffer-depth,Fail spec@!opengl 3.0@clearbuffer-stencil,Fail spec@!opengl 3.1@primitive-restart-xfb generated,Fail + +# "../src/gallium/drivers/freedreno/a6xx/fd6_gmem.c:976:emit_blit: Assertion `psurf->u.tex.first_layer == psurf->u.tex.last_layer' failed." spec@!opengl 3.2@layered-rendering@clear-depth,Crash diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-flakes.txt b/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-flakes.txt index 3a9f09d72b..c7ae2c3599 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-flakes.txt +++ b/mesa 3D driver/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-flakes.txt @@ -8,7 +8,7 @@ glx@glx-visuals-stencil # Occasionally flakes preceded by a GPU fault; # [ 375.034086] adreno 5000000.gpu: [drm:a6xx_irq] *ERROR* gpu fault ring 0 fence aefe8 status 00E51005 rb 0602/06d2 ib1 000000010023D000/0000 ib2 0000000100246170/0000 -glx@glx-ftp +glx@glx-tfp # async shader compiler asserts in get_image_samp_tex_src()'s nir_src_comp_as_uint(). # https://gitlab.freedesktop.org/mesa/mesa/-/issues/4474 @@ -20,6 +20,14 @@ spec@arb_draw_indirect@arb_draw_indirect-transform-feedback # Flaky since around 2021-03-22. First appeared on an innocent mingw branch.
spec@arb_depth_texture@fbo-clear-formats +# Occasionally passes instead of failing +spec@egl_chromium_sync_control@conformance +spec@egl_chromium_sync_control@conformance@eglGetSyncValuesCHROMIUM_msc_and_sbc_test + +# First appeared on 2021-07-07 in a container uprev, became frequent by +# 2021-08-04 starting from a pretty clearly unrelated MR +spec@ext_framebuffer_object@fbo-blending-format-quirks + # Flaky since around 2021-04-21. First appeared on # https://gitlab.freedesktop.org/robclark/mesa/-/jobs/9101752 (fd/go-fast) spec@ext_packed_depth_stencil@fbo-clear-formats stencil @@ -27,5 +35,13 @@ spec@ext_packed_depth_stencil@fbo-clear-formats stencil # Flaky since around 2021-05-21, the day after a piglit uprev reshuffled us. spec@arb_map_buffer_range@map_invalidate_buffer_bit offset=0 +# Flaky since around 2021-06-21. +spec@arb_texture_gather@texturegatheroffset@vs-rgb-blue-uint-2darray +spec@arb_texture_gather@texturegather@fs-rgba-green-uint-cubearray +spec@arb_texture_gather@texturegather@vs-rgb-none-float-cubearray + +# First appeared in https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11511 +spec@glsl-1.30@execution@tex-miplevel-selection texture\(bias\) cubearray + # First appeared on 2021-05-14 on tc-merge-index-unrefs, but not obviously related. spec@!opengl 1.0@rasterpos@glsl_vs_tex1D diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_autotune.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_autotune.c index d4ab28bcd8..fd27e2be6e 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_autotune.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_autotune.c @@ -50,6 +50,8 @@ get_history(struct fd_autotune *at, struct fd_batch *batch) { struct fd_batch_history *history; + /* draw batches should still have their key at this point. 
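+ * Nondraw batches never get a key; the assert below allows for that.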
*/ + assert(batch->key || batch->nondraw); if (!batch->key) return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.c index 5a089bc0df..6bc1e06969 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.c @@ -444,13 +444,13 @@ fd_batch_add_resource(struct fd_batch *batch, struct fd_resource *rsc) { if (likely(fd_batch_references_resource(batch, rsc))) { - debug_assert(_mesa_set_search(batch->resources, rsc)); + debug_assert(_mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc)); return; } debug_assert(!_mesa_set_search(batch->resources, rsc)); - _mesa_set_add(batch->resources, rsc); + _mesa_set_add_pre_hashed(batch->resources, rsc->hash, rsc); rsc->track->batch_mask |= (1 << batch->idx); } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.h b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.h index 842537f8c4..f85ff82385 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch.h @@ -31,7 +31,7 @@ #include "util/simple_mtx.h" #include "util/u_inlines.h" #include "util/u_queue.h" -#include "util/u_trace.h" +#include "util/perf/u_trace.h" #include "freedreno_context.h" #include "freedreno_fence.h" @@ -414,7 +414,7 @@ fd_batch_get_epilogue(struct fd_batch *batch) { if (batch->epilogue == NULL) { batch->epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000, - (enum fd_ringbuffer_flags)0); + FD_RINGBUFFER_GROWABLE); } return batch->epilogue; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch_cache.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch_cache.c index 681db86878..5c31475c22 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch_cache.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_batch_cache.c @@ -315,7 +315,7 @@ fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy) if (destroy) { foreach_batch (batch, &screen->batch_cache, rsc->track->batch_mask) { - struct set_entry *entry = _mesa_set_search(batch->resources, rsc); + struct set_entry *entry = _mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc); _mesa_set_remove(batch->resources, entry); } rsc->track->batch_mask = 0; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_blitter.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_blitter.c index 8a4b0a1dc9..c41a0736ec 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_blitter.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_blitter.c @@ -166,7 +166,7 @@ fd_blitter_blit(struct fd_context *ctx, const struct pipe_blit_info *info) util_blitter_blit_generic( ctx->blitter, dst_view, &info->dst.box, src_view, &info->src.box, src->width0, src->height0, info->mask, info->filter, - info->scissor_enable ? &info->scissor : NULL, info->alpha_blend); + info->scissor_enable ? 
&info->scissor : NULL, info->alpha_blend, false); pipe_surface_reference(&dst_view, NULL); pipe_sampler_view_reference(&src_view, NULL); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.c index e40736c97e..9656d64653 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.c @@ -38,6 +38,7 @@ #include "freedreno_state.h" #include "freedreno_texture.h" #include "freedreno_util.h" +#include "util/u_trace_gallium.h" static void fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fencep, @@ -368,9 +369,6 @@ fd_context_destroy(struct pipe_context *pctx) if (ctx->clear_rs_state[i]) pctx->delete_rasterizer_state(pctx, ctx->clear_rs_state[i]); - if (ctx->primconvert) - util_primconvert_destroy(ctx->primconvert); - slab_destroy_child(&ctx->transfer_pool); slab_destroy_child(&ctx->transfer_pool_unsync); @@ -432,11 +430,6 @@ fd_get_device_reset_status(struct pipe_context *pctx) int global_faults = fd_get_reset_count(ctx, false); enum pipe_reset_status status; - /* Not called in driver thread, but threaded_context syncs - * before calling this: - */ - fd_context_access_begin(ctx); - if (context_faults != ctx->context_reset_count) { status = PIPE_GUILTY_CONTEXT_RESET; } else if (global_faults != ctx->global_reset_count) { @@ -448,36 +441,36 @@ fd_get_device_reset_status(struct pipe_context *pctx) ctx->context_reset_count = context_faults; ctx->global_reset_count = global_faults; - fd_context_access_end(ctx); - return status; } static void -fd_trace_record_ts(struct u_trace *ut, struct pipe_resource *timestamps, +fd_trace_record_ts(struct u_trace *ut, void *cs, void *timestamps, unsigned idx) { struct fd_batch *batch = container_of(ut, struct fd_batch, trace); - struct fd_ringbuffer *ring = batch->nondraw ? batch->draw : batch->gmem; + struct fd_ringbuffer *ring = cs; + struct pipe_resource *buffer = timestamps; if (ring->cur == batch->last_timestamp_cmd) { - uint64_t *ts = fd_bo_map(fd_resource(timestamps)->bo); + uint64_t *ts = fd_bo_map(fd_resource(buffer)->bo); ts[idx] = U_TRACE_NO_TIMESTAMP; return; } unsigned ts_offset = idx * sizeof(uint64_t); - batch->ctx->record_timestamp(ring, fd_resource(timestamps)->bo, ts_offset); + batch->ctx->record_timestamp(ring, fd_resource(buffer)->bo, ts_offset); batch->last_timestamp_cmd = ring->cur; } static uint64_t fd_trace_read_ts(struct u_trace_context *utctx, - struct pipe_resource *timestamps, unsigned idx) + void *timestamps, unsigned idx, void *flush_data) { struct fd_context *ctx = container_of(utctx, struct fd_context, trace_context); - struct fd_bo *ts_bo = fd_resource(timestamps)->bo; + struct pipe_resource *buffer = timestamps; + struct fd_bo *ts_bo = fd_resource(buffer)->bo; /* Only need to stall on results for the first entry: */ if (idx == 0) { @@ -500,6 +493,12 @@ fd_trace_read_ts(struct u_trace_context *utctx, return ctx->ts_to_ns(ts[idx]); } +static void +fd_trace_delete_flush_data(struct u_trace_context *utctx, void *flush_data) +{ + /* We don't use flush_data at the moment. 
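+ * We pass NULL as the flush-data argument to u_trace_flush() (see fd_gmem_render_tiles()), so there is nothing to free here.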
*/ +} + /* TODO we could combine a few of these small buffers (solid_vbuf, * blit_texcoord_vbuf, and vsc_size_mem, into a single buffer and * save a tiny bit of memory @@ -583,13 +582,12 @@ fd_context_cleanup_common_vbos(struct fd_context *ctx) struct pipe_context * fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, - const uint8_t *primtypes, void *priv, - unsigned flags) disable_thread_safety_analysis + void *priv, unsigned flags) + disable_thread_safety_analysis { struct fd_screen *screen = fd_screen(pscreen); struct pipe_context *pctx; unsigned prio = 1; - int i; /* lower numerical value == higher priority: */ if (FD_DBG(HIPRIO)) @@ -605,6 +603,7 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, if (FD_DBG(BSTAT) || FD_DBG(MSGS)) ctx->stats_users++; + ctx->flags = flags; ctx->screen = screen; ctx->pipe = fd_pipe_new2(screen->dev, FD_PIPE_3D, prio); @@ -615,12 +614,6 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, ctx->global_reset_count = fd_get_reset_count(ctx, false); } - ctx->primtypes = primtypes; - ctx->primtype_mask = 0; - for (i = 0; i <= PIPE_PRIM_MAX; i++) - if (primtypes[i]) - ctx->primtype_mask |= (1 << i); - simple_mtx_init(&ctx->gmem_lock, mtx_plain); /* need some sane default in case gallium frontends don't @@ -660,10 +653,6 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, if (!ctx->blitter) goto fail; - ctx->primconvert = util_primconvert_create(pctx, ctx->primtype_mask); - if (!ctx->primconvert) - goto fail; - list_inithead(&ctx->hw_active_queries); list_inithead(&ctx->acc_active_queries); @@ -674,8 +663,10 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, ctx->current_scissor = &ctx->disabled_scissor; - u_trace_context_init(&ctx->trace_context, pctx, fd_trace_record_ts, - fd_trace_read_ts); + u_trace_pipe_context_init(&ctx->trace_context, pctx, + fd_trace_record_ts, + fd_trace_read_ts, + fd_trace_delete_flush_data); fd_autotune_init(&ctx->autotune, screen->dev); @@ -701,9 +692,11 @@ fd_context_init_tc(struct pipe_context *pctx, unsigned flags) struct pipe_context *tc = threaded_context_create( pctx, &ctx->screen->transfer_pool, fd_replace_buffer_storage, - fd_fence_create_unflushed, - fd_resource_busy, - false, + &(struct threaded_context_options){ + .create_fence = fd_fence_create_unflushed, + .is_resource_busy = fd_resource_busy, + .unsynchronized_get_device_reset_status = true, + }, &ctx->tc); if (tc && tc != pctx) diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.h b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.h index ac0fce4429..35135b4e65 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_context.h @@ -27,7 +27,6 @@ #ifndef FREEDRENO_CONTEXT_H_ #define FREEDRENO_CONTEXT_H_ -#include "indices/u_primconvert.h" #include "pipe/p_context.h" #include "util/libsync.h" #include "util/list.h" @@ -35,7 +34,7 @@ #include "util/u_blitter.h" #include "util/u_string.h" #include "util/u_threaded_context.h" -#include "util/u_trace.h" +#include "util/perf/u_trace.h" #include "freedreno_autotune.h" #include "freedreno_gmem.h" @@ -199,6 +198,8 @@ struct ir3_shader_key; struct fd_context { struct pipe_context base; + unsigned flags; /* PIPE_CONTEXT_x */ + struct threaded_context *tc; struct list_head node; /* node in screen->context_list */ @@ -219,7 +220,6 @@ struct fd_context { struct blitter_context *blitter dt; void *clear_rs_state[2] dt; - 
struct primconvert_context *primconvert dt; /* slab for pipe_transfer allocations: */ struct slab_child_pool transfer_pool dt; @@ -250,6 +250,8 @@ struct fd_context { struct list_head acc_active_queries dt; /*@}*/ + uint8_t patch_vertices; + /* Whether we need to recheck the active_queries list next * fd_batch_update_queries(). */ @@ -260,13 +262,6 @@ struct fd_context { */ bool active_queries dt; - /* table with PIPE_PRIM_MAX entries mapping PIPE_PRIM_x to - * DI_PT_x value to use for draw initiator. There are some - * slight differences between generation: - */ - const uint8_t *primtypes; - uint32_t primtype_mask; - /* shaders used by clear, and gmem->mem blits: */ struct fd_program_stateobj solid_prog; // TODO move to screen? struct fd_program_stateobj solid_layered_prog; @@ -327,9 +322,12 @@ struct fd_context { * count increases, it means some other context crashed. If * per-context reset count increases, it means we crashed the * gpu. + * + * Only accessed by front-end thread, never accessed by TC driver + * thread. */ - uint32_t context_reset_count dt; - uint32_t global_reset_count dt; + uint32_t context_reset_count; + uint32_t global_reset_count; /* Context sequence #, used for batch-cache key: */ uint16_t seqno; @@ -712,12 +710,6 @@ fd_context_get_scissor(struct fd_context *ctx) assert_dt return ctx->current_scissor; } -static inline bool -fd_supported_prim(struct fd_context *ctx, unsigned prim) -{ - return (1 << prim) & ctx->primtype_mask; -} - void fd_context_switch_from(struct fd_context *ctx) assert_dt; void fd_context_switch_to(struct fd_context *ctx, struct fd_batch *batch) assert_dt; @@ -731,8 +723,7 @@ void fd_emit_string5(struct fd_ringbuffer *ring, const char *string, int len); struct pipe_context *fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, - const uint8_t *primtypes, void *priv, - unsigned flags); + void *priv, unsigned flags); struct pipe_context *fd_context_init_tc(struct pipe_context *pctx, unsigned flags); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_draw.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_draw.c index 88b4ce2c00..baa5f2e553 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_draw.c @@ -286,16 +286,6 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, if (!fd_render_condition_check(pctx)) return; - /* emulate unsupported primitives: */ - if (!fd_supported_prim(ctx, info->mode)) { - if (ctx->streamout.num_targets > 0) - mesa_loge("stream-out with emulated prims"); - util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer); - util_primconvert_draw_vbo(ctx->primconvert, info, drawid_offset, indirect, draws, - num_draws); - return; - } - /* Upload a user index buffer. 
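* User index buffers live in CPU memory; util_upload_index_buffer() below copies the data into a BO the GPU can read.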
*/ struct pipe_resource *indexbuf = NULL; unsigned index_offset = 0; @@ -303,7 +293,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, if (info->index_size) { if (info->has_user_indices) { if (num_draws > 1) { - util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws); + util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws); return; } if (!util_upload_index_buffer(pctx, info, &draws[0], &indexbuf, @@ -319,7 +309,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, } if ((ctx->streamout.num_targets > 0) && (num_draws > 1)) { - util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws); + util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws); return; } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_gmem.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_gmem.c index 47f4e6f675..dbdc63faec 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -33,6 +33,7 @@ #include "util/u_memory.h" #include "util/u_string.h" #include "u_tracepoints.h" +#include "util/u_trace_gallium.h" #include "freedreno_context.h" #include "freedreno_fence.h" @@ -591,7 +592,7 @@ render_tiles(struct fd_batch *batch, struct fd_gmem_stateobj *gmem) assert_dt for (i = 0; i < (gmem->nbins_x * gmem->nbins_y); i++) { struct fd_tile *tile = &gmem->tile[i]; - trace_start_tile(&batch->trace, tile->bin_h, tile->yoff, tile->bin_w, + trace_start_tile(&batch->trace, batch->gmem, tile->bin_h, tile->yoff, tile->bin_w, tile->xoff); ctx->emit_tile_prep(batch, tile); @@ -606,13 +607,13 @@ render_tiles(struct fd_batch *batch, struct fd_gmem_stateobj *gmem) assert_dt ctx->query_prepare_tile(batch, i, batch->gmem); /* emit IB to drawcmds: */ - trace_start_draw_ib(&batch->trace); + trace_start_draw_ib(&batch->trace, batch->gmem); if (ctx->emit_tile) { ctx->emit_tile(batch, tile); } else { ctx->screen->emit_ib(batch->gmem, batch->draw); } - trace_end_draw_ib(&batch->trace); + trace_end_draw_ib(&batch->trace, batch->gmem); fd_reset_wfi(batch); /* emit gmem2mem to transfer tile back to system memory: */ @@ -636,13 +637,13 @@ render_sysmem(struct fd_batch *batch) assert_dt ctx->query_prepare_tile(batch, 0, batch->gmem); if (!batch->nondraw) { - trace_start_draw_ib(&batch->trace); + trace_start_draw_ib(&batch->trace, batch->gmem); } /* emit IB to drawcmds: */ ctx->screen->emit_ib(batch->gmem, batch->draw); if (!batch->nondraw) { - trace_end_draw_ib(&batch->trace); + trace_end_draw_ib(&batch->trace, batch->gmem); } fd_reset_wfi(batch); @@ -681,9 +682,9 @@ fd_gmem_render_tiles(struct fd_batch *batch) */ fd_perfetto_submit(ctx); #endif - trace_flush_batch(&batch->trace, batch, batch->cleared, + trace_flush_batch(&batch->trace, batch->gmem, batch, batch->cleared, batch->gmem_reason, batch->num_draws); - trace_framebuffer_state(&batch->trace, pfb); + trace_framebuffer_state(&batch->trace, batch->gmem, pfb); } if (ctx->emit_sysmem_prep && !batch->nondraw) { @@ -723,33 +724,34 @@ fd_gmem_render_tiles(struct fd_batch *batch) if (batch->nondraw) { DBG("%p: rendering non-draw", batch); - render_sysmem(batch); + if (!fd_ringbuffer_empty(batch->draw)) + render_sysmem(batch); ctx->stats.batch_nondraw++; } else if (sysmem) { - trace_render_sysmem(&batch->trace); - trace_start_render_pass( - &batch->trace, ctx->submit_count, pipe_surface_format(pfb->cbufs[0]), + trace_render_sysmem(&batch->trace, batch->gmem); + 
trace_start_render_pass(&batch->trace, batch->gmem, + ctx->submit_count, pipe_surface_format(pfb->cbufs[0]), pipe_surface_format(pfb->zsbuf), pfb->width, pfb->height, pfb->nr_cbufs, pfb->samples, 0, 0, 0); if (ctx->query_prepare) ctx->query_prepare(batch, 1); render_sysmem(batch); - trace_end_render_pass(&batch->trace); + trace_end_render_pass(&batch->trace, batch->gmem); ctx->stats.batch_sysmem++; } else { struct fd_gmem_stateobj *gmem = lookup_gmem_state(batch, false, false); batch->gmem_state = gmem; - trace_render_gmem(&batch->trace, gmem->nbins_x, gmem->nbins_y, + trace_render_gmem(&batch->trace, batch->gmem, gmem->nbins_x, gmem->nbins_y, gmem->bin_w, gmem->bin_h); - trace_start_render_pass( - &batch->trace, ctx->submit_count, pipe_surface_format(pfb->cbufs[0]), + trace_start_render_pass(&batch->trace, batch->gmem, + ctx->submit_count, pipe_surface_format(pfb->cbufs[0]), pipe_surface_format(pfb->zsbuf), pfb->width, pfb->height, pfb->nr_cbufs, pfb->samples, gmem->nbins_x * gmem->nbins_y, gmem->bin_w, gmem->bin_h); if (ctx->query_prepare) ctx->query_prepare(batch, gmem->nbins_x * gmem->nbins_y); render_tiles(batch, gmem); - trace_end_render_pass(&batch->trace); + trace_end_render_pass(&batch->trace, batch->gmem); batch->gmem_state = NULL; fd_screen_lock(ctx->screen); @@ -761,7 +763,7 @@ fd_gmem_render_tiles(struct fd_batch *batch) flush_ring(batch); - u_trace_flush(&batch->trace); + u_trace_flush(&batch->trace, NULL, false); } /* Determine a worst-case estimate (ie. assuming we don't eliminate an diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_perfetto.cc b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_perfetto.cc index bc10a510b0..20b7c513ed 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_perfetto.cc +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_perfetto.cc @@ -322,6 +322,7 @@ fd_perfetto_submit(struct fd_context *ctx) void fd_start_render_pass(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_start_render_pass *payload) { stage_start(pctx, ts_ns, SURFACE_STAGE_ID); @@ -342,6 +343,7 @@ fd_start_render_pass(struct pipe_context *pctx, uint64_t ts_ns, void fd_end_render_pass(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_end_render_pass *payload) { stage_end(pctx, ts_ns, SURFACE_STAGE_ID); @@ -349,6 +351,7 @@ fd_end_render_pass(struct pipe_context *pctx, uint64_t ts_ns, void fd_start_binning_ib(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_start_binning_ib *payload) { stage_start(pctx, ts_ns, BINNING_STAGE_ID); @@ -356,6 +359,7 @@ fd_start_binning_ib(struct pipe_context *pctx, uint64_t ts_ns, void fd_end_binning_ib(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_end_binning_ib *payload) { stage_end(pctx, ts_ns, BINNING_STAGE_ID); @@ -363,6 +367,7 @@ fd_end_binning_ib(struct pipe_context *pctx, uint64_t ts_ns, void fd_start_draw_ib(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_start_draw_ib *payload) { stage_start( @@ -372,6 +377,7 @@ fd_start_draw_ib(struct pipe_context *pctx, uint64_t ts_ns, void fd_end_draw_ib(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_end_draw_ib *payload) { stage_end( @@ -381,6 +387,7 @@ fd_end_draw_ib(struct pipe_context *pctx, uint64_t ts_ns, void fd_start_blit(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_start_blit 
*payload) { stage_start(pctx, ts_ns, BLIT_STAGE_ID); @@ -388,6 +395,7 @@ fd_start_blit(struct pipe_context *pctx, uint64_t ts_ns, void fd_end_blit(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_end_blit *payload) { stage_end(pctx, ts_ns, BLIT_STAGE_ID); @@ -395,6 +403,7 @@ fd_end_blit(struct pipe_context *pctx, uint64_t ts_ns, void fd_start_compute(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_start_compute *payload) { stage_start(pctx, ts_ns, COMPUTE_STAGE_ID); @@ -402,6 +411,7 @@ fd_start_compute(struct pipe_context *pctx, uint64_t ts_ns, void fd_end_compute(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_end_compute *payload) { stage_end(pctx, ts_ns, COMPUTE_STAGE_ID); @@ -409,6 +419,7 @@ fd_end_compute(struct pipe_context *pctx, uint64_t ts_ns, void fd_start_clear_restore(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_start_clear_restore *payload) { stage_start(pctx, ts_ns, CLEAR_RESTORE_STAGE_ID); @@ -416,6 +427,7 @@ fd_start_clear_restore(struct pipe_context *pctx, uint64_t ts_ns, void fd_end_clear_restore(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_end_clear_restore *payload) { stage_end(pctx, ts_ns, CLEAR_RESTORE_STAGE_ID); @@ -423,6 +435,7 @@ fd_end_clear_restore(struct pipe_context *pctx, uint64_t ts_ns, void fd_start_resolve(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_start_resolve *payload) { stage_start(pctx, ts_ns, RESOLVE_STAGE_ID); @@ -430,6 +443,7 @@ fd_start_resolve(struct pipe_context *pctx, uint64_t ts_ns, void fd_end_resolve(struct pipe_context *pctx, uint64_t ts_ns, + const void *flush_data, const struct trace_end_resolve *payload) { stage_end(pctx, ts_ns, RESOLVE_STAGE_ID); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_program.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_program.c index 4068edbc7a..5b1532b445 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_program.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_program.c @@ -43,6 +43,14 @@ update_bound_stage(struct fd_context *ctx, enum pipe_shader_type shader, } } +static void +fd_set_patch_vertices(struct pipe_context *pctx, uint8_t patch_vertices) in_dt +{ + struct fd_context *ctx = fd_context(pctx); + + ctx->patch_vertices = patch_vertices; +} + static void fd_vs_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { @@ -194,6 +202,10 @@ fd_prog_init(struct pipe_context *pctx) pctx->bind_tes_state = fd_tes_state_bind; pctx->bind_gs_state = fd_gs_state_bind; pctx->bind_fs_state = fd_fs_state_bind; + pctx->set_patch_vertices = fd_set_patch_vertices; + + if (ctx->flags & PIPE_CONTEXT_COMPUTE_ONLY) + return; ctx->solid_prog.fs = assemble_tgsi(pctx, solid_fs, true); ctx->solid_prog.vs = assemble_tgsi(pctx, solid_vs, false); @@ -229,6 +241,9 @@ fd_prog_fini(struct pipe_context *pctx) struct fd_context *ctx = fd_context(pctx); int i; + if (ctx->flags & PIPE_CONTEXT_COMPUTE_ONLY) + return; + pctx->delete_vs_state(pctx, ctx->solid_prog.vs); pctx->delete_fs_state(pctx, ctx->solid_prog.fs); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query.h b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query.h index 2b641a7d4a..6001797190 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query.h @@ 
-131,4 +131,11 @@ pidx(unsigned query_type) } } +/** Returns true if get_query_result is being called from the driver thread. */ +static inline bool +fd_get_query_result_in_driver_thread(struct fd_query *q) +{ + return !q->base.flushed; +} + #endif /* FREEDRENO_QUERY_H_ */ diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_acc.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_acc.c index d579a54759..74da4ce8ab 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_acc.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_acc.c @@ -155,10 +155,10 @@ fd_acc_get_query_result(struct fd_context *ctx, struct fd_query *q, bool wait, * So, regardless of whether we are supposed to wait or not, we do need to * flush now. */ - if (rsc->track->write_batch) { + if (fd_get_query_result_in_driver_thread(q)) { tc_assert_driver_thread(ctx->tc); fd_context_access_begin(ctx); - fd_batch_flush(rsc->track->write_batch); + fd_bc_flush_writer(ctx, rsc); fd_context_access_end(ctx); } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_hw.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_hw.c index 2dbf2b2a30..560cf1bba3 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_hw.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_query_hw.c @@ -219,10 +219,10 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q, bool wait, * So, regardless of whether we are supposed to wait or not, we do need to * flush now. */ - if (rsc->track->write_batch) { + if (fd_get_query_result_in_driver_thread(q)) { tc_assert_driver_thread(ctx->tc); fd_context_access_begin(ctx); - fd_batch_flush(rsc->track->write_batch); + fd_bc_flush_writer(ctx, rsc); fd_context_access_end(ctx); } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.c index eec22c7ac1..f8c8eb9d0c 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.c @@ -196,6 +196,7 @@ realloc_bo(struct fd_resource *rsc, uint32_t size) struct pipe_resource *prsc = &rsc->b.b; struct fd_screen *screen = fd_screen(rsc->b.b.screen); uint32_t flags = + COND(prsc->usage & PIPE_USAGE_STAGING, FD_BO_CACHED_COHERENT) | COND(prsc->bind & PIPE_BIND_SCANOUT, FD_BO_SCANOUT); /* TODO other flags? 
*/ @@ -452,9 +453,9 @@ fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, */ debug_assert(shadow->track->batch_mask == 0); foreach_batch (batch, &ctx->screen->batch_cache, rsc->track->batch_mask) { - struct set_entry *entry = _mesa_set_search(batch->resources, rsc); + struct set_entry *entry = _mesa_set_search_pre_hashed(batch->resources, rsc->hash, rsc); _mesa_set_remove(batch->resources, entry); - _mesa_set_add(batch->resources, shadow); + _mesa_set_add_pre_hashed(batch->resources, shadow->hash, shadow); } swap(rsc->track, shadow->track); @@ -818,6 +819,9 @@ resource_transfer_map(struct pipe_context *pctx, struct pipe_resource *prsc, return buf; } + } else if ((usage & PIPE_MAP_READ) && !fd_bo_is_cached(rsc->bo)) { + perf_debug_ctx(ctx, "wc readback: prsc=%p, level=%u, usage=%x, box=%dx%d+%d,%d", + prsc, level, usage, box->width, box->height, box->x, box->y); } if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) { @@ -1092,6 +1096,7 @@ alloc_resource_struct(struct pipe_screen *pscreen, pipe_reference_init(&prsc->reference, 1); prsc->screen = pscreen; + rsc->hash = _mesa_hash_pointer(rsc); util_range_init(&rsc->valid_buffer_range); simple_mtx_init(&rsc->lock, mtx_plain); @@ -1104,7 +1109,7 @@ alloc_resource_struct(struct pipe_screen *pscreen, pipe_reference_init(&rsc->track->reference, 1); - threaded_resource_init(prsc); + threaded_resource_init(prsc, false, 0); if (tmpl->target == PIPE_BUFFER) rsc->b.buffer_id_unique = util_idalloc_mt_alloc(&screen->buffer_ids); @@ -1112,6 +1117,87 @@ alloc_resource_struct(struct pipe_screen *pscreen, return rsc; } +enum fd_layout_type { + ERROR, + LINEAR, + TILED, + UBWC, +}; + +static enum fd_layout_type +get_best_layout(struct fd_screen *screen, struct pipe_resource *prsc, + const struct pipe_resource *tmpl, const uint64_t *modifiers, + int count) +{ + bool implicit_modifiers = + (count == 0 || + drm_find_modifier(DRM_FORMAT_MOD_INVALID, modifiers, count)); + + /* First, find all the conditions which would force us to linear */ + if (!screen->tile_mode) + return LINEAR; + + if (!screen->tile_mode(prsc)) + return LINEAR; + + if (tmpl->target == PIPE_BUFFER) + return LINEAR; + + if (tmpl->bind & PIPE_BIND_LINEAR) { + if (tmpl->usage != PIPE_USAGE_STAGING) + perf_debug("%" PRSC_FMT ": forcing linear: bind flags", + PRSC_ARGS(prsc)); + return LINEAR; + } + + if (FD_DBG(NOTILE)) + return LINEAR; + + /* Shared resources with implicit modifiers must always be linear */ + if (implicit_modifiers && (tmpl->bind & PIPE_BIND_SHARED)) { + perf_debug("%" PRSC_FMT + ": forcing linear: shared resource + implicit modifiers", + PRSC_ARGS(prsc)); + return LINEAR; + } + + bool ubwc_ok = is_a6xx(screen); + if (FD_DBG(NOUBWC)) + ubwc_ok = false; + + if (ubwc_ok && !implicit_modifiers && + !drm_find_modifier(DRM_FORMAT_MOD_QCOM_COMPRESSED, modifiers, count)) { + perf_debug("%" PRSC_FMT + ": not using UBWC: not in acceptable modifier set", + PRSC_ARGS(prsc)); + ubwc_ok = false; + } + + if (ubwc_ok) + return UBWC; + + /* We can't use tiled with explicit modifiers, as there is no modifier token + * defined for it. But we might internally force tiled allocation using a + * private modifier token. + * + * TODO we should probably also limit TILED in a similar way to UBWC above, + * once we have a public modifier token defined. 
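+ * (FD_FORMAT_MOD_QCOM_TILED below serves as that private token in the meantime.)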
+ */ + if (implicit_modifiers || + drm_find_modifier(FD_FORMAT_MOD_QCOM_TILED, modifiers, count)) + return TILED; + + if (!drm_find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count)) { + perf_debug("%" PRSC_FMT ": need linear but not in modifier set", + PRSC_ARGS(prsc)); + return ERROR; + } + + perf_debug("%" PRSC_FMT ": not using tiling: explicit modifiers and no UBWC", + PRSC_ARGS(prsc)); + return LINEAR; +} + /** * Helper that allocates a resource and resolves its layout (but doesn't * allocate its bo). @@ -1137,6 +1223,10 @@ fd_resource_allocate_and_resolve(struct pipe_screen *pscreen, prsc = &rsc->b.b; + /* Clover creates buffers with PIPE_FORMAT_NONE: */ + if ((prsc->target == PIPE_BUFFER) && (format == PIPE_FORMAT_NONE)) + format = prsc->format = PIPE_FORMAT_R8_UNORM; + DBG("%" PRSC_FMT, PRSC_ARGS(prsc)); if (tmpl->bind & PIPE_BIND_SHARED) @@ -1144,61 +1234,20 @@ fd_resource_allocate_and_resolve(struct pipe_screen *pscreen, fd_resource_layout_init(prsc); -#define LINEAR (PIPE_BIND_SCANOUT | PIPE_BIND_LINEAR | PIPE_BIND_DISPLAY_TARGET) - - bool linear = drm_find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count); - if (linear) { - perf_debug("%" PRSC_FMT ": linear: DRM_FORMAT_MOD_LINEAR requested!", - PRSC_ARGS(prsc)); - } else if (tmpl->bind & LINEAR) { - if (tmpl->usage != PIPE_USAGE_STAGING) - perf_debug("%" PRSC_FMT ": linear: LINEAR bind requested!", - PRSC_ARGS(prsc)); - linear = true; + enum fd_layout_type layout = + get_best_layout(screen, prsc, tmpl, modifiers, count); + if (layout == ERROR) { + free(prsc); + return NULL; } - if (FD_DBG(NOTILE)) - linear = true; - - /* Normally, for non-shared buffers, allow buffer compression if - * not shared, otherwise only allow if QCOM_COMPRESSED modifier - * is requested: - * - * TODO we should probably also limit tiled in a similar way, - * except we don't have a format modifier for tiled. (We probably - * should.) - */ - bool allow_ubwc = false; - if (!linear) { - allow_ubwc = drm_find_modifier(DRM_FORMAT_MOD_INVALID, modifiers, count); - if (!allow_ubwc) { - perf_debug("%" PRSC_FMT - ": not UBWC: DRM_FORMAT_MOD_INVALID not requested!", - PRSC_ARGS(prsc)); - } - if (tmpl->bind & PIPE_BIND_SHARED) { - allow_ubwc = - drm_find_modifier(DRM_FORMAT_MOD_QCOM_COMPRESSED, modifiers, count); - if (!allow_ubwc) { - perf_debug("%" PRSC_FMT - ": not UBWC: shared and DRM_FORMAT_MOD_QCOM_COMPRESSED " - "not requested!", - PRSC_ARGS(prsc)); - linear = true; - } - } - } - - allow_ubwc &= !FD_DBG(NOUBWC); - - if (screen->tile_mode && (tmpl->target != PIPE_BUFFER) && !linear) { + if (layout >= TILED) rsc->layout.tile_mode = screen->tile_mode(prsc); - } + if (layout == UBWC) + rsc->layout.ubwc = true; rsc->internal_format = format; - rsc->layout.ubwc = rsc->layout.tile_mode && is_a6xx(screen) && allow_ubwc; - if (prsc->target == PIPE_BUFFER) { assert(prsc->format == PIPE_FORMAT_R8_UNORM); size = prsc->width0; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.h b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.h index b7cc41d126..9305717274 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_resource.h @@ -118,6 +118,7 @@ struct fd_resource { struct threaded_resource b; struct fd_bo *bo; /* use fd_resource_set_bo() to write */ enum pipe_format internal_format; + uint32_t hash; /* _mesa_hash_pointer() on this resource's address. 
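+ * Precomputed once at allocation time so the batch->resources set bookkeeping can use the _mesa_set_*_pre_hashed() helpers.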
*/ struct fdl_layout layout; /* buffer range that has been initialized */ diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.c index 0f3efc2010..15fb442ef8 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.c @@ -239,6 +239,10 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PCI_FUNCTION: return 0; + case PIPE_CAP_SUPPORTED_PRIM_MODES: + case PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART: + return screen->primtypes_mask; + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: case PIPE_CAP_VERTEX_SHADER_SATURATE: @@ -1031,6 +1035,8 @@ fd_screen_create(struct fd_device *dev, struct renderonly *ro, goto fail; } + screen->info = info; + /* explicitly checking for GPU revisions that are known to work. This * may be overly conservative for a3xx, where spoofing the gpu_id with * the blob driver seems to generate identical cmdstream dumps. But @@ -1063,13 +1069,12 @@ fd_screen_create(struct fd_device *dev, struct renderonly *ro, goto fail; } - screen->info = info; - - if (is_a6xx(screen)) { - screen->ccu_offset_bypass = screen->info->num_ccu * A6XX_CCU_DEPTH_SIZE; - screen->ccu_offset_gmem = (screen->gmemsize_bytes - - screen->info->num_ccu * A6XX_CCU_GMEM_COLOR_SIZE); - } + /* fdN_screen_init() should set this: */ + assert(screen->primtypes); + screen->primtypes_mask = 0; + for (unsigned i = 0; i <= PIPE_PRIM_MAX; i++) + if (screen->primtypes[i]) + screen->primtypes_mask |= (1 << i); if (FD_DBG(PERFC)) { screen->perfcntr_groups = diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.h b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.h index ca396e0c16..ca15d3e4a5 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_screen.h @@ -148,6 +148,17 @@ struct fd_screen { const uint64_t *supported_modifiers; struct renderonly *ro; + + /* table with PIPE_PRIM_MAX+1 entries mapping PIPE_PRIM_x to + * DI_PT_x value to use for draw initiator. There are some + * slight differences between generations. + * + * Note that primtypes[PIPE_PRIM_MAX] is used to map to the + * internal RECTLIST primtype, if available, used for blits/ + * clears. + */ + const uint8_t *primtypes; + uint32_t primtypes_mask; }; static inline struct fd_screen * diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_state.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_state.c index 5d5b942be3..90c29894e0 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_state.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_state.c @@ -606,11 +606,36 @@ fd_bind_compute_state(struct pipe_context *pctx, void *state) in_dt ctx->dirty_shader[PIPE_SHADER_COMPUTE] |= FD_DIRTY_SHADER_PROG; } +/* TODO pipe_context::set_compute_resources() should DIAF and clover + * should be updated to use pipe_context::set_constant_buffer() and + * pipe_context::set_shader_images(). Until then just directly frob + * the UBO/image state to avoid the rest of the driver needing to + * know about this bastard api..
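+ * (FORMAT_NONE surfaces are bound as UBOs, shifted up by one because cb0 is the ordinary constant buffer; actual images are still an unreachable finishme below.)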
+ */ static void fd_set_compute_resources(struct pipe_context *pctx, unsigned start, unsigned count, struct pipe_surface **prscs) in_dt { - // TODO + struct fd_context *ctx = fd_context(pctx); + struct fd_constbuf_stateobj *so = &ctx->constbuf[PIPE_SHADER_COMPUTE]; + + for (unsigned i = 0; i < count; i++) { + const uint32_t index = i + start + 1; /* UBOs start at index 1 */ + + if (!prscs) { + util_copy_constant_buffer(&so->cb[index], NULL, false); + so->enabled_mask &= ~(1 << index); + } else if (prscs[i]->format == PIPE_FORMAT_NONE) { + struct pipe_constant_buffer cb = { + .buffer = prscs[i]->texture, + }; + util_copy_constant_buffer(&so->cb[index], &cb, false); + so->enabled_mask |= (1 << index); + } else { + // TODO images + unreachable("finishme"); + } + } } /* used by clover to bind global objects, returning the bo address @@ -634,9 +659,11 @@ fd_set_global_binding(struct pipe_context *pctx, unsigned first, unsigned count, if (so->buf[n]) { struct fd_resource *rsc = fd_resource(so->buf[n]); - uint64_t iova = fd_bo_get_iova(rsc->bo); - // TODO need to scream if iova > 32b or fix gallium API.. - *handles[i] += iova; + uint32_t offset = *handles[i]; + uint64_t iova = fd_bo_get_iova(rsc->bo) + offset; + + /* Yes, really, despite what the type implies: */ + memcpy(handles[i], &iova, sizeof(iova)); } if (prscs[i]) diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.c b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.c index 2ed42cb3f0..0c2edd2f85 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.c @@ -56,7 +56,7 @@ bind_sampler_states(struct fd_texture_stateobj *tex, unsigned start, for (i = 0; i < nr; i++) { unsigned p = i + start; - tex->samplers[p] = hwcso[i]; + tex->samplers[p] = hwcso ? hwcso[i] : NULL; if (tex->samplers[p]) tex->valid_samplers |= (1 << p); else @@ -68,7 +68,7 @@ bind_sampler_states(struct fd_texture_stateobj *tex, unsigned start, static void set_sampler_views(struct fd_texture_stateobj *tex, unsigned start, unsigned nr, - unsigned unbind_num_trailing_slots, + unsigned unbind_num_trailing_slots, bool take_ownership, struct pipe_sampler_view **views) { unsigned i; @@ -76,7 +76,14 @@ set_sampler_views(struct fd_texture_stateobj *tex, unsigned start, unsigned nr, for (i = 0; i < nr; i++) { struct pipe_sampler_view *view = views ? 
views[i] : NULL; unsigned p = i + start; - pipe_sampler_view_reference(&tex->textures[p], view); + + if (take_ownership) { + pipe_sampler_view_reference(&tex->textures[p], NULL); + tex->textures[p] = view; + } else { + pipe_sampler_view_reference(&tex->textures[p], view); + } + if (tex->textures[p]) { fd_resource_set_usage(tex->textures[p]->texture, FD_DIRTY_TEX); tex->valid_textures |= (1 << p); @@ -107,12 +114,13 @@ void fd_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) in_dt { struct fd_context *ctx = fd_context(pctx); set_sampler_views(&ctx->tex[shader], start, nr, unbind_num_trailing_slots, - views); + take_ownership, views); fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_TEX); } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.h b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.h index 7cb523c907..bc89454c77 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_texture.h @@ -36,6 +36,7 @@ void fd_sampler_states_bind(struct pipe_context *pctx, void fd_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views); void fd_texture_init(struct pipe_context *pctx); diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_tracepoints.py b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_tracepoints.py index ea7f01120c..095009b6b8 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_tracepoints.py +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_tracepoints.py @@ -37,6 +37,7 @@ sys.path.insert(0, args.import_path) from u_trace import Header from u_trace import Tracepoint +from u_trace import TracepointArg from u_trace import utrace_generate # @@ -50,19 +51,19 @@ Tracepoint('start_state_restore') Tracepoint('end_state_restore') Tracepoint('flush_batch', - args=[['struct fd_batch *', 'batch'], - ['uint16_t', 'cleared'], - ['uint16_t', 'gmem_reason'], - ['uint16_t', 'num_draws']], + args=[TracepointArg(type='struct fd_batch *', var='batch', c_format='%x'), + TracepointArg(type='uint16_t', var='cleared', c_format='%x'), + TracepointArg(type='uint16_t', var='gmem_reason', c_format='%x'), + TracepointArg(type='uint16_t', var='num_draws', c_format='%u')], tp_print=['%p: cleared=%x, gmem_reason=%x, num_draws=%u', '__entry->batch', '__entry->cleared', '__entry->gmem_reason', '__entry->num_draws'], ) Tracepoint('render_gmem', - args=[['uint16_t', 'nbins_x'], - ['uint16_t', 'nbins_y'], - ['uint16_t', 'bin_w'], - ['uint16_t', 'bin_h']], + args=[TracepointArg(type='uint16_t', var='nbins_x', c_format='%u'), + TracepointArg(type='uint16_t', var='nbins_y', c_format='%u'), + TracepointArg(type='uint16_t', var='bin_w', c_format='%u'), + TracepointArg(type='uint16_t', var='bin_h', c_format='%u')], tp_print=['%ux%u bins of %ux%u', '__entry->nbins_x', '__entry->nbins_y', '__entry->bin_w', '__entry->bin_h'], ) @@ -72,16 +73,16 @@ Tracepoint('render_sysmem') # Note that this doesn't include full information about all of the MRTs # but seems to roughly match what I see with a blob trace Tracepoint('start_render_pass', - args=[['uint32_t', 'submit_id'], - ['enum pipe_format', 'cbuf0_format'], - ['enum pipe_format', 'zs_format'], - ['uint16_t', 'width'], - 
['uint16_t', 'height'], - ['uint8_t', 'mrts'], - ['uint8_t', 'samples'], - ['uint16_t', 'nbins'], - ['uint16_t', 'binw'], - ['uint16_t', 'binh']], + args=[TracepointArg(type='uint32_t', var='submit_id', c_format='%u'), + TracepointArg(type='enum pipe_format', var='cbuf0_format', c_format='%s', to_prim_type='util_format_description({})->short_name'), + TracepointArg(type='enum pipe_format', var='zs_format', c_format='%s', to_prim_type='util_format_description({})->short_name'), + TracepointArg(type='uint16_t', var='width', c_format='%u'), + TracepointArg(type='uint16_t', var='height', c_format='%u'), + TracepointArg(type='uint8_t', var='mrts', c_format='%u'), + TracepointArg(type='uint8_t', var='samples', c_format='%u'), + TracepointArg(type='uint16_t', var='nbins', c_format='%u'), + TracepointArg(type='uint16_t', var='binw', c_format='%u'), + TracepointArg(type='uint16_t', var='binh', c_format='%u')], tp_perfetto='fd_start_render_pass' ) Tracepoint('end_render_pass', @@ -100,7 +101,7 @@ Tracepoint('end_prologue') # For GMEM pass, where this could either be a clear or resolve Tracepoint('start_clear_restore', - args=[['uint16_t', 'fast_cleared']], + args=[TracepointArg(type='uint16_t', var='fast_cleared', c_format='0x%x')], tp_print=['fast_cleared: 0x%x', '__entry->fast_cleared'], tp_perfetto='fd_start_clear_restore', ) @@ -113,10 +114,10 @@ Tracepoint('end_resolve', tp_perfetto='fd_end_resolve') Tracepoint('start_tile', - args=[['uint16_t', 'bin_h'], - ['uint16_t', 'yoff'], - ['uint16_t', 'bin_w'], - ['uint16_t', 'xoff']], + args=[TracepointArg(type='uint16_t', var='bin_h', c_format='%u'), + TracepointArg(type='uint16_t', var='yoff', c_format='%u'), + TracepointArg(type='uint16_t', var='bin_w', c_format='%u'), + TracepointArg(type='uint16_t', var='xoff', c_format='%u')], tp_print=['bin_h=%d, yoff=%d, bin_w=%d, xoff=%d', '__entry->bin_h', '__entry->yoff', '__entry->bin_w', '__entry->xoff'], ) @@ -127,8 +128,8 @@ Tracepoint('end_draw_ib', tp_perfetto='fd_end_draw_ib') Tracepoint('start_blit', - args=[['enum pipe_texture_target', 'src_target'], - ['enum pipe_texture_target', 'dst_target']], + args=[TracepointArg(type='enum pipe_texture_target', var='src_target', c_format='%s', to_prim_type="util_str_tex_target({}, true)"), + TracepointArg(type='enum pipe_texture_target', var='dst_target', c_format='%s', to_prim_type="util_str_tex_target({}, true)")], tp_print=['%s -> %s', 'util_str_tex_target(__entry->src_target, true)', 'util_str_tex_target(__entry->dst_target, true)'], tp_perfetto='fd_start_blit', @@ -141,4 +142,4 @@ Tracepoint('start_compute', Tracepoint('end_compute', tp_perfetto='fd_end_compute') -utrace_generate(cpath=args.src, hpath=args.hdr) +utrace_generate(cpath=args.src, hpath=args.hdr, ctx_param='struct pipe_context *pctx') diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_util.h b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_util.h index f8cf9b6cb1..5e0065717e 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_util.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/freedreno_util.h @@ -114,13 +114,19 @@ extern bool fd_binning_enabled; ##__VA_ARGS__); \ } while (0) -#define perf_debug_ctx(ctx, ...) \ +#define perf_debug_message(debug, type, ...) \ do { \ if (FD_DBG(PERF)) \ mesa_logw(__VA_ARGS__); \ + struct pipe_debug_callback *__d = (debug); \ + if (__d) \ + pipe_debug_message(__d, type, __VA_ARGS__); \ + } while (0) + +#define perf_debug_ctx(ctx, ...) 
\ + do { \ struct fd_context *__c = (ctx); \ - if (__c) \ - pipe_debug_message(&__c->debug, PERF_INFO, __VA_ARGS__); \ + perf_debug_message(__c ? &__c->debug : NULL, PERF_INFO, __VA_ARGS__); \ } while (0) #define perf_debug(...) perf_debug_ctx(NULL, __VA_ARGS__) @@ -398,7 +404,7 @@ emit_marker(struct fd_ringbuffer *ring, int scratch_idx) if (reg == HW_QUERY_BASE_REG) return; if (__EMIT_MARKER) { - OUT_WFI5(ring); + OUT_WFI(ring); OUT_PKT0(ring, reg, 1); OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); } diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index 3a33f1535e..88ff83421d 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -251,6 +251,12 @@ load_spirv(const char *filename, const char *entry, gl_shader_stage stage) stage, entry, &spirv_options, ir3_get_compiler_options(compiler)); + const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = { + .frag_coord = true, + .point_coord = true, + }; + NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings); + nir_print_shader(nir, stdout); return nir; diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_const.h b/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_const.h index 85f5aefafa..c2c239e970 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -433,6 +433,22 @@ emit_common_consts(const struct ir3_shader_variant *v, } } +/* emit kernel params */ +static inline void +emit_kernel_params(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, const struct pipe_grid_info *info) + assert_dt +{ + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.kernel_params; + if (v->constlen > offset) { + ring_wfi(ctx->batch, ring); + emit_const_user(ring, v, offset * 4, + align(v->shader->cs.req_input_mem, 4), + info->input); + } +} + static inline void ir3_emit_vs_driver_params(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_context *ctx, @@ -479,9 +495,10 @@ ir3_emit_vs_driver_params(const struct ir3_shader_variant *v, * stream so need to copy them to bo. 
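* (vertex_params_area below is that copy rounded up to 16 dwords, presumably to keep the const upload aligned.)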
*/ if (indirect && needs_vtxid_base) { + uint32_t vertex_params_area = align(vertex_params_size, 16); struct pipe_resource *vertex_params_rsc = pipe_buffer_create(&ctx->screen->base, PIPE_BIND_CONSTANT_BUFFER, - PIPE_USAGE_STREAM, vertex_params_size * 4); + PIPE_USAGE_STREAM, vertex_params_area * 4); unsigned src_off = indirect->offset; ; void *ptr; @@ -501,7 +518,7 @@ ir3_emit_vs_driver_params(const struct ir3_shader_variant *v, ctx->screen->mem_to_mem(ring, vertex_params_rsc, 0, indirect->buffer, src_off, 1); - emit_const_prsc(ring, v, offset * 4, 0, vertex_params_size, + emit_const_prsc(ring, v, offset * 4, 0, vertex_params_area, vertex_params_rsc); pipe_resource_reference(&vertex_params_rsc, NULL); @@ -551,6 +568,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, debug_assert(gl_shader_stage_is_compute(v->type)); emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE); + emit_kernel_params(ctx, v, ring, info); /* emit compute-shader driver-params: */ const struct ir3_const_state *const_state = ir3_const_state(v); @@ -591,6 +609,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0], [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1], [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2], + [IR3_DP_WORK_DIM] = info->work_dim, [IR3_DP_LOCAL_GROUP_SIZE_X] = info->block[0], [IR3_DP_LOCAL_GROUP_SIZE_Y] = info->block[1], [IR3_DP_LOCAL_GROUP_SIZE_Z] = info->block[2], diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index ee24f1192e..64b95da5d0 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/mesa 3D driver/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -84,7 +84,8 @@ dump_shader_info(struct ir3_shader_variant *v, "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, " "%u dwords, %u last-baryf, %u half, %u full, %u constlen, " "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, " - "%u sstall, %u (ss), %u (sy), %d waves, %d max_sun, %d loops\n", + "%u stp, %u ldp, %u sstall, %u (ss), %u (sy), %d waves, %d max_sun, " + "%d loops\n", ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count, v->info.instrs_count - v->info.nops_count, v->info.mov_count, v->info.cov_count, v->info.sizedwords, v->info.last_baryf, @@ -92,7 +93,8 @@ dump_shader_info(struct ir3_shader_variant *v, v->info.instrs_per_cat[0], v->info.instrs_per_cat[1], v->info.instrs_per_cat[2], v->info.instrs_per_cat[3], v->info.instrs_per_cat[4], v->info.instrs_per_cat[5], - v->info.instrs_per_cat[6], v->info.instrs_per_cat[7], v->info.sstall, + v->info.instrs_per_cat[6], v->info.instrs_per_cat[7], + v->info.stp_count, v->info.ldp_count, v->info.sstall, v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops); } @@ -131,7 +133,7 @@ ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key, if (created) { if (shader->initial_variants_done) { - pipe_debug_message(debug, SHADER_INFO, + perf_debug_message(debug, SHADER_INFO, "%s shader: recompiling at draw time: global " "0x%08x, vfsamples %x/%x, astc %x/%x\n", ir3_shader_stage(v), key.global, key.vsamples, @@ -295,6 +297,9 @@ ir3_shader_compute_state_create(struct pipe_context *pctx, } struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL); + shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */ + shader->cs.req_local_mem = cso->req_local_mem; + struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso)); 
util_queue_fence_init(&hwcso->ready); @@ -460,13 +465,15 @@ ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key) } } -static void +static char * ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir) { struct fd_screen *screen = fd_screen(pscreen); ir3_nir_lower_io_to_temporaries(nir); ir3_finalize_nir(screen->compiler, nir); + + return NULL; } static void diff --git a/mesa 3D driver/src/gallium/drivers/freedreno/meson.build b/mesa 3D driver/src/gallium/drivers/freedreno/meson.build index dac51b1c3c..ed23138450 100644 --- a/mesa 3D driver/src/gallium/drivers/freedreno/meson.build +++ b/mesa 3D driver/src/gallium/drivers/freedreno/meson.build @@ -223,7 +223,7 @@ freedreno_tracepoints = custom_target( output: ['freedreno_tracepoints.c', 'freedreno_tracepoints.h'], command: [ prog_python, '@INPUT@', - '-p', join_paths(meson.source_root(), 'src/gallium/auxiliary/util/'), + '-p', join_paths(meson.source_root(), 'src/util/perf/'), '-C', '@OUTPUT0@', '-H', '@OUTPUT1@', ], @@ -273,6 +273,7 @@ libfreedreno = static_library( cpp_args : [freedreno_cpp_args], gnu_symbol_visibility : 'hidden', dependencies : libfreedreno_dependencies, + override_options : ['cpp_std=c++17'], ) driver_freedreno = declare_dependency( diff --git a/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915-g33-fails.txt b/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915-g33-fails.txt index b4c82622de..f21887e678 100644 --- a/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915-g33-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915-g33-fails.txt @@ -1,522 +1,51 @@ -dEQP-GLES2.functional.buffer.write.random.0,Crash -dEQP-GLES2.functional.buffer.write.random.1,Crash -dEQP-GLES2.functional.buffer.write.random.2,Crash -dEQP-GLES2.functional.buffer.write.random.3,Crash -dEQP-GLES2.functional.buffer.write.random.4,Crash -dEQP-GLES2.functional.buffer.write.random.5,Crash -dEQP-GLES2.functional.buffer.write.random.8,Crash -dEQP-GLES2.functional.buffer.write.random.9,Crash -dEQP-GLES2.functional.buffer.write.recreate_store.random_1,Crash -dEQP-GLES2.functional.buffer.write.recreate_store.random_2,Crash -dEQP-GLES2.functional.buffer.write.recreate_store.random_3,Crash -dEQP-GLES2.functional.buffer.write.recreate_store.random_4,Crash +# Lines with the center outside the viewport clipped out when they shouldn't be? dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail + +# Points with center outside the viewport clipped out when they shouldn't be? dEQP-GLES2.functional.clipping.point.wide_point_clip,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_center,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail -dEQP-GLES2.functional.fbo.completeness.renderable.texture.color0.rgb10_a2,Fail -dEQP-GLES2.functional.negative_api.shader.uniform_matrixfv_invalid_transpose,Fail -dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level_array_compressed,Fail -dEQP-GLES2.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES2.functional.polygon_offset.fixed16_displacement_with_units,Fail + +# Maybe doesn't like the color interpolation in wide lines? 
dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail + +# "Invalid fragment count in result image." dEQP-GLES2.functional.rasterization.primitives.line_loop_wide,Fail dEQP-GLES2.functional.rasterization.primitives.line_strip_wide,Fail dEQP-GLES2.functional.rasterization.primitives.lines_wide,Fail + +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4980 dEQP-GLES2.functional.shaders.builtin_variable.frontfacing,Fail -dEQP-GLES2.functional.shaders.discard.dynamic_loop_always,Fail -dEQP-GLES2.functional.shaders.discard.dynamic_loop_dynamic,Fail -dEQP-GLES2.functional.shaders.discard.dynamic_loop_texture,Fail -dEQP-GLES2.functional.shaders.discard.dynamic_loop_uniform,Fail -dEQP-GLES2.functional.shaders.discard.function_static_loop_always,Fail -dEQP-GLES2.functional.shaders.discard.function_static_loop_dynamic,Fail -dEQP-GLES2.functional.shaders.discard.function_static_loop_texture,Fail -dEQP-GLES2.functional.shaders.discard.function_static_loop_uniform,Fail -dEQP-GLES2.functional.shaders.discard.static_loop_always,Fail -dEQP-GLES2.functional.shaders.discard.static_loop_dynamic,Fail -dEQP-GLES2.functional.shaders.discard.static_loop_texture,Fail -dEQP-GLES2.functional.shaders.discard.static_loop_uniform,Fail + dEQP-GLES2.functional.shaders.functions.control_flow.mixed_return_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.functions.control_flow.return_after_break_fragment,Fail dEQP-GLES2.functional.shaders.functions.control_flow.return_after_continue_fragment,Fail -dEQP-GLES2.functional.shaders.functions.control_flow.return_after_loop_fragment,Fail -dEQP-GLES2.functional.shaders.functions.control_flow.return_after_loop_sequence_fragment,Fail dEQP-GLES2.functional.shaders.functions.control_flow.return_in_loop_if_fragment,Fail dEQP-GLES2.functional.shaders.functions.control_flow.return_in_nested_loop_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_dynamic_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_dynamic_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_dynamic_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_dynamic_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_dynamic_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_dynamic_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_static_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_static_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_static_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_static_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_static_write_dynamic_loop_read_fragment,Fail 
-dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat2_static_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_dynamic_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_dynamic_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_dynamic_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_dynamic_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_dynamic_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_dynamic_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_static_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_static_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_static_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_static_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_static_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat3_static_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_write_dynamic_loop_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_static_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_static_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_static_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_static_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_static_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_static_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_const_write_dynamic_loop_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.float_const_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_const_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_dynamic_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_dynamic_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_dynamic_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_dynamic_loop_write_static_read_fragment,Fail 
-dEQP-GLES2.functional.shaders.indexing.tmp_array.float_dynamic_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_dynamic_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_dynamic_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_static_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_static_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_static_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_static_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_static_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_static_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_const_write_dynamic_loop_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_const_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_const_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_dynamic_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_dynamic_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_dynamic_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_dynamic_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_dynamic_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_dynamic_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_dynamic_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_static_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_static_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_static_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_static_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_static_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_static_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_dynamic_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_dynamic_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_dynamic_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_dynamic_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_dynamic_write_dynamic_loop_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_dynamic_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_dynamic_write_static_loop_read_fragment,Fail 
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_static_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_static_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_static_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_static_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_static_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_static_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_write_dynamic_loop_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_static_loop_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_static_loop_write_dynamic_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_static_loop_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_static_loop_write_static_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_static_write_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_static_write_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.float_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.float_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.vec2_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.vec2_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.vec3_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.vec3_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.vec4_dynamic_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.uniform_array.vec4_static_loop_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.float_dynamic_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.float_dynamic_loop_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.float_dynamic_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.float_dynamic_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.float_static_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.float_static_loop_write_static_loop_read,Fail 
-dEQP-GLES2.functional.shaders.indexing.varying_array.float_static_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.float_static_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_dynamic_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_dynamic_loop_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_dynamic_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_dynamic_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_static_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_static_loop_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_static_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec2_static_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_dynamic_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_dynamic_loop_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_dynamic_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_dynamic_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_static_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_static_loop_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_static_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec3_static_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_dynamic_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_dynamic_loop_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_dynamic_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_dynamic_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_static_loop_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_static_loop_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_static_write_dynamic_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.varying_array.vec4_static_write_static_loop_read,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_component_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_component_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_direct_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_direct_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_loop_subscript_write_component_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_loop_subscript_write_direct_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_loop_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_loop_subscript_write_dynamic_subscript_read_fragment,Fail 
-dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_loop_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_loop_subscript_write_static_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_dynamic_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_loop_subscript_write_component_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_loop_subscript_write_direct_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_loop_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_loop_subscript_write_dynamic_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_loop_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_loop_subscript_write_static_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec2_static_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_component_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_component_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_direct_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_direct_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_loop_subscript_write_component_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_loop_subscript_write_direct_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_loop_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_loop_subscript_write_dynamic_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_loop_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_loop_subscript_write_static_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_subscript_write_dynamic_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_dynamic_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_loop_subscript_write_component_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_loop_subscript_write_direct_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_loop_subscript_write_dynamic_loop_subscript_read_fragment,Fail 
-dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_loop_subscript_write_dynamic_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_loop_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_loop_subscript_write_static_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec3_static_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_component_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_component_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_direct_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_direct_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_loop_subscript_write_component_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_loop_subscript_write_direct_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_loop_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_loop_subscript_write_dynamic_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_loop_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_loop_subscript_write_static_subscript_read_fragment,Fail + +dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_subscript_write_static_loop_subscript_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_subscript_write_component_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_subscript_write_direct_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_subscript_write_dynamic_loop_subscript_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_subscript_write_dynamic_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_subscript_write_static_loop_subscript_read_fragment,Fail dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_dynamic_subscript_write_static_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_loop_subscript_write_component_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_loop_subscript_write_direct_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_loop_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_loop_subscript_write_dynamic_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_loop_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_loop_subscript_write_static_subscript_read_fragment,Fail 
-dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_subscript_write_dynamic_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.indexing.vector_subscript.vec4_static_subscript_write_static_loop_subscript_read_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.conditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.sequence_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_constant_iterations.vector_counter_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_lowp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.conditional_continue_fragment,Fail 
-dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.sequence_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_dynamic_iterations.vector_counter_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.basic_lowp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.conditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.nested_sequence_fragment,Fail 
-dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.sequence_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.do_while_uniform_iterations.vector_counter_fragment,Fail + dEQP-GLES2.functional.shaders.loops.for_constant_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.basic_lowp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.conditional_break_fragment,Fail dEQP-GLES2.functional.shaders.loops.for_constant_iterations.conditional_continue_fragment,Fail dEQP-GLES2.functional.shaders.loops.for_constant_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.infinite_with_conditional_break_fragment,Fail dEQP-GLES2.functional.shaders.loops.for_constant_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.no_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.sequence_statement_fragment,Fail 
-dEQP-GLES2.functional.shaders.loops.for_constant_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_constant_iterations.vector_counter_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.basic_lowp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.conditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.no_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.sequence_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_dynamic_iterations.vector_counter_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.basic_lowp_float_fragment,Fail 
-dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.conditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.no_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.sequence_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.for_uniform_iterations.vector_counter_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.basic_lowp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.conditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.double_continue_fragment,Fail 
-dEQP-GLES2.functional.shaders.loops.while_constant_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.no_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.sequence_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_constant_iterations.vector_counter_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.basic_lowp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.conditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.nested_tricky_dataflow_1_fragment,Fail 
-dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.no_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.sequence_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_dynamic_iterations.vector_counter_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.101_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.basic_highp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.basic_highp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.basic_lowp_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.basic_lowp_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.basic_mediump_float_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.basic_mediump_int_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.compound_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.conditional_body_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.conditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.double_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.function_call_inout_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.function_call_return_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.infinite_with_conditional_break_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.mixed_break_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.nested_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.nested_sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.nested_tricky_dataflow_1_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.nested_tricky_dataflow_2_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.no_iterations_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.post_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.pre_increment_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.select_iteration_count_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.sequence_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.sequence_statement_fragment,Fail 
-dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.single_iteration_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.single_statement_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.unconditional_continue_fragment,Fail -dEQP-GLES2.functional.shaders.loops.while_uniform_iterations.vector_counter_fragment,Fail + dEQP-GLES2.functional.shaders.random.all_features.fragment.22,Fail dEQP-GLES2.functional.shaders.random.all_features.fragment.32,Fail dEQP-GLES2.functional.shaders.random.all_features.fragment.34,Fail @@ -535,29 +64,16 @@ dEQP-GLES2.functional.shaders.random.trigonometric.fragment.42,Fail dEQP-GLES2.functional.shaders.random.trigonometric.fragment.45,Fail dEQP-GLES2.functional.shaders.random.trigonometric.fragment.52,Fail dEQP-GLES2.functional.shaders.random.trigonometric.fragment.81,Fail -dEQP-GLES2.functional.shaders.return.return_in_dynamic_loop_dynamic_fragment,Fail -dEQP-GLES2.functional.shaders.return.return_in_dynamic_loop_never_fragment,Fail -dEQP-GLES2.functional.shaders.return.return_in_static_loop_dynamic_fragment,Fail -dEQP-GLES2.functional.shaders.return.return_in_static_loop_never_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.dynamic_loop_assignment_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.dynamic_loop_nested_struct_array_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.dynamic_loop_struct_array_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.loop_assignment_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.loop_nested_struct_array_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.loop_struct_array_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.nested_dynamic_loop_assignment_fragment,Fail -dEQP-GLES2.functional.shaders.struct.local.nested_loop_assignment_fragment,Fail + dEQP-GLES2.functional.shaders.struct.local.nested_struct_array_dynamic_index_fragment,Fail -dEQP-GLES2.functional.shaders.struct.uniform.dynamic_loop_nested_struct_array_fragment,Fail -dEQP-GLES2.functional.shaders.struct.uniform.dynamic_loop_struct_array_fragment,Fail -dEQP-GLES2.functional.shaders.struct.uniform.loop_nested_struct_array_fragment,Fail -dEQP-GLES2.functional.shaders.struct.uniform.loop_struct_array_fragment,Fail dEQP-GLES2.functional.shaders.struct.uniform.nested_struct_array_dynamic_index_fragment,Fail # Need to port the DP3 trick for all_equal from st_glsl_to_tgsi. dEQP-GLES2.functional.shaders.struct.uniform.equal_fragment,Fail dEQP-GLES2.functional.shaders.struct.uniform.not_equal_fragment,Fail +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4982 +# -Y face renders incorrectly (black instead of texture contents) dEQP-GLES2.functional.texture.filtering.cube.linear_linear_clamp_l8_pot,Fail dEQP-GLES2.functional.texture.filtering.cube.linear_linear_mirror_l8_pot,Fail dEQP-GLES2.functional.texture.filtering.cube.linear_linear_repeat_l8_pot,Fail @@ -575,17 +91,21 @@ dEQP-GLES2.functional.texture.filtering.cube.nearest_nearest_clamp_l8_pot,Fail dEQP-GLES2.functional.texture.filtering.cube.nearest_nearest_mirror_l8_pot,Fail dEQP-GLES2.functional.texture.filtering.cube.nearest_nearest_repeat_l8_pot,Fail dEQP-GLES2.functional.texture.size.cube.128x128_l8_mipmap,Fail +dEQP-GLES2.functional.texture.size.cube.64x64_l8_mipmap,Fail + +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4982 +# Minor errors on all faces, perhaps around filtering's texel selection? 
dEQP-GLES2.functional.texture.size.cube.15x15_l8,Fail
dEQP-GLES2.functional.texture.size.cube.15x15_rgb888,Fail
dEQP-GLES2.functional.texture.size.cube.15x15_rgba4444,Fail
dEQP-GLES2.functional.texture.size.cube.15x15_rgba8888,Fail
dEQP-GLES2.functional.texture.size.cube.16x16_l8_mipmap,Fail
-dEQP-GLES2.functional.texture.size.cube.64x64_l8_mipmap,Fail
+
+# Texel differences in the middle of the texture?
dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail
+
dEQP-GLES2.functional.uniform_api.random.13,Fail
dEQP-GLES2.functional.uniform_api.random.20,Fail
-dEQP-GLES2.functional.uniform_api.random.21,Fail
-dEQP-GLES2.functional.uniform_api.random.24,Fail
dEQP-GLES2.functional.uniform_api.random.54,Fail
dEQP-GLES2.functional.uniform_api.random.71,Fail
dEQP-GLES2.functional.uniform_api.random.74,Fail
@@ -593,7 +113,6 @@ dEQP-GLES2.functional.uniform_api.random.80,Fail
dEQP-GLES2.functional.uniform_api.random.81,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.array_in_struct.mat4_mat2_both,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.array_in_struct.mat4_mat2_fragment,Fail
-dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.array_in_struct.sampler2D_samplerCube_both,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.array_in_struct.sampler2D_samplerCube_fragment,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.basic_array.mat4_both,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.basic_array.mat4_fragment,Fail
@@ -602,7 +121,6 @@ dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.multiple_nest
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.nested_structs_arrays.mat4_mat2_fragment,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.struct_in_array.mat4_mat2_both,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_pointer.render.struct_in_array.mat4_mat2_fragment,Fail
-dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.array_in_struct.sampler2D_samplerCube_both,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.array_in_struct.sampler2D_samplerCube_fragment,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.multiple_nested_structs_arrays.both,Fail
dEQP-GLES2.functional.uniform_api.value.assigned.by_value.render.multiple_nested_structs_arrays.fragment,Fail
@@ -615,22 +133,20 @@ dEQP-GLES2.functional.uniform_api.value.initial.render.multiple_nested_structs_a
dEQP-GLES2.functional.uniform_api.value.initial.render.nested_structs_arrays.mat4_mat2_fragment,Fail
dEQP-GLES2.functional.uniform_api.value.initial.render.struct_in_array.mat4_mat2_both,Fail
dEQP-GLES2.functional.uniform_api.value.initial.render.struct_in_array.mat4_mat2_fragment,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.alpha8_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.luminance4_alpha4_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.luminance8_alpha8_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.luminance8_oes,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.rgb10_a2,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.rgb565,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.rgb5_a1,Fail
-KHR-GLES2.core.internalformat.copy_tex_image.rgba4,Fail
+
+# depth texture samples as white instead of red.
KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component16,Fail
KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_int_depth_component24,Fail
KHR-GLES2.core.internalformat.texture2d.depth_component_unsigned_short_depth_component16,Fail
-KHR-GLES2.shaders.aggressive_optimizations.cos_float_frag,Fail
-KHR-GLES2.shaders.aggressive_optimizations.cos_vec2_frag,Fail
-KHR-GLES2.shaders.aggressive_optimizations.cos_vec3_frag,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4979
+# Non-unrolled loops, but they should be unrollable.
KHR-GLES2.shaders.aggressive_optimizations.sin_vec2_frag,Fail
KHR-GLES2.shaders.aggressive_optimizations.sin_vec3_frag,Fail
+
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3133
KHR-GLES2.texture_3d.copy_sub_image.negative,Fail
KHR-GLES2.texture_3d.copy_sub_image.rgba,Fail
+
+# https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3134
KHR-GLES2.texture_3d.filtering.combinations.negative,Fail
diff --git a/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915g.toml b/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915g.toml
index 5dee0a0a59..dec2e8628d 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915g.toml
+++ b/mesa 3D driver/src/gallium/drivers/i915/ci/deqp-i915g.toml
@@ -8,6 +8,8 @@ deqp_args = [
"--deqp-gl-config-name=rgba8888d24s8ms0",
"--deqp-visibility=hidden"
]
+version_check = "GL ES 2.0.*git"
+renderer_check = "i915"
[[deqp]]
deqp = "/deqp/external/openglcts/modules/glcts"
diff --git a/mesa 3D driver/src/gallium/drivers/i915/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/drivers/i915/ci/gitlab-ci.yml
index b6f7f9a6a6..efbeae8d20 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/ci/gitlab-ci.yml
+++ b/mesa 3D driver/src/gallium/drivers/i915/ci/gitlab-ci.yml
@@ -7,7 +7,6 @@ tags:
- anholt-g33
variables:
- DEQP_EXPECTED_RENDERER: i915
GPU_VERSION: i915-g33
i915-g33-deqp:
@@ -15,7 +14,6 @@ i915-g33-deqp:
- .anholt-g33-test
- .deqp-test
variables:
- DEQP_VER: gles2
DEQP_SUITE: i915g
i915-g33-piglit:
diff --git a/mesa 3D driver/src/gallium/drivers/i915/ci/piglit-i915-g33-fails.txt b/mesa 3D driver/src/gallium/drivers/i915/ci/piglit-i915-g33-fails.txt
index e8a9dc8883..616b56405a 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/ci/piglit-i915-g33-fails.txt
+++ b/mesa 3D driver/src/gallium/drivers/i915/ci/piglit-i915-g33-fails.txt
@@ -1,49 +1,63 @@
shaders@glsl-bug-110796,Fail
+
+# No derivatives support in the HW.
shaders@glsl-derivs,Fail
+
shaders@glsl-fs-loop,Fail
shaders@glsl-fs-loop-nested,Fail
shaders@glsl-fs-raytrace-bug27060,Fail
+
+# No derivatives support in the HW.
shaders@glsl-fwidth,Fail
+
+# "GL_MAX_VARYING_FLOATS = 40
+# Probe color at (34,34)
+# Expected: 0 255 0
+# Observed: 255 0 0
+# Failure with 9 vec4 varyings used in varying index 8
+# Probe color at (34,38)
+# Expected: 0 255 0
+# Observed: 255 0 0
+# Failure with 10 vec4 varyings used in varying index 8"
shaders@glsl-max-varyings,Fail
+
shaders@glsl-max-varyings >max_varying_components,Fail
+
+# Failed to link: error: looping not supported in i915 fragment shaders, all loops must be statically unrollable.
+# and then the test assertion fails setting up its uniforms (if piglit is built with assertions, which the CI build isn't).
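+#
+# A hedged sketch of that distinction (hypothetical GLSL, not the actual
+# piglit shader source): a loop with a compile-time bound, e.g.
+#
+#   for (int i = 0; i < 4; i++)
+#      sum += 0.25;
+#
+# can be statically unrolled and compiled, while a loop whose bound comes
+# from a uniform, e.g.
+#
+#   uniform int count;
+#   ...
+#   for (int i = 0; i < count; i++)
+#      sum += 0.25;
+#
+# cannot, so the i915 compiler has to reject the link.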
shaders@glsl-uniform-interstage-limits@520 vs- 1 fs,Fail
shaders@glsl-uniform-interstage-limits@subdivide 5,Fail
shaders@glsl-uniform-interstage-limits@subdivide 5- statechanges,Fail
+
+shaders@glsl-predication-on-large-array,Fail
+
shaders@ssa@fs-if-def-else-break,Fail
shaders@ssa@fs-lost-copy-problem,Fail
shaders@ssa@fs-swap-problem,Fail
shaders@ssa@fs-while-loop-rotate-value,Fail
-spec@!opengl 1.0@gl-1.0-bitmap-heart-dance,Fail
+
+# "MESA: error: Empty fragment shader"
spec@!opengl 1.0@gl-1.0-drawbuffer-modes,Fail
-spec@!opengl 1.0@gl-1.0-long-line-loop,Crash
+
+# "Probe color at (74,4)
+# Expected: 1.000000 1.000000 1.000000
+# Observed: 0.000000 0.000000 0.000000
+# Failure with path Polygon Stipple set to always pass.
+# Probe color at (4,14)
+# Expected: 1.000000 1.000000 1.000000
+# Observed: 0.000000 0.000000 0.000000
+# Failure with always-pass paths enabled.
+# Probe color at (54,24)
+# Expected: 0.000000 0.000000 0.000000
+# Observed: 1.000000 1.000000 1.000000
+# Failure with Scissor Test set to fail mode."
spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
-spec@!opengl 1.0@gl-1.0-ortho-pos,Crash
+
spec@!opengl 1.0@gl-1.0-scissor-offscreen,Fail
spec@!opengl 1.0@gl-1.0-swapbuffers-behavior,Fail
-spec@!opengl 1.1@clipflat,Fail
-spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glBegin/End(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glBegin/End(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glBegin/End(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawArrays(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawArrays(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawElements(GL_TRIANGLE_FAN)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawElements(GL_QUADS)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CCW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
-spec@!opengl 1.1@clipflat@glDrawElements(GL_QUAD_STRIP)- glFrontFace(GL_CW)- glPolygonMode(GL_FILL)- quadrant: center middle PV: FIRST,Fail
spec@!opengl 1.1@depthstencil-default_fb-clear,Fail
spec@!opengl 1.1@getteximage-formats,Fail
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 varray gl_quad_strip,Crash
-spec@!opengl 1.1@gl-1.1-drawarrays-vertex-count 100000 vbo gl_quad_strip,Crash
spec@!opengl 1.1@gl-1.2-texture-base-level,Fail
spec@!opengl 1.1@line-flat-clip-color,Fail
spec@!opengl 1.1@linestipple,Fail
@@ -51,13 +65,11 @@ spec@!opengl 1.1@linestipple@Factor 2x,Fail
spec@!opengl 1.1@linestipple@Factor 3x,Fail
spec@!opengl 1.1@linestipple@Line loop,Fail
spec@!opengl 1.1@linestipple@Line strip,Fail
-spec@!opengl 1.1@longprim,Crash
+
+# "../src/compiler/nir/nir_lower_int_to_float.c:102: lower_alu_instr: Assertion `nir_alu_type_get_base_type(info->output_type) != nir_type_int && nir_alu_type_get_base_type(info->output_type) != nir_type_uint' failed."
spec@!opengl 1.1@point-line-no-cull,Crash
+
spec@!opengl 1.1@polygon-mode-offset,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on bottom edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on left edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on right edge,Fail
-spec@!opengl 1.1@polygon-mode-offset@config 0: Expected white pixel on top edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on bottom edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on left edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 3: Expected white pixel on right edge,Fail
@@ -66,10 +78,10 @@ spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on bottom ed
spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on left edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on right edge,Fail
spec@!opengl 1.1@polygon-mode-offset@config 4: Expected white pixel on top edge,Fail
-spec@!opengl 1.1@polygon-offset,Fail
spec@!opengl 1.1@read-front,Fail
spec@!opengl 1.1@read-front clear-front-first,Fail
-spec@!opengl 1.1@tex-upside-down-miptree,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@!opengl 1.1@texwrap 1d bordercolor,Fail
spec@!opengl 1.1@texwrap 1d bordercolor@GL_RGBA8- border color only,Fail
spec@!opengl 1.1@texwrap 1d proj bordercolor,Fail
@@ -111,25 +123,29 @@ spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA16- border color only,Fail
spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA2- border color only,Fail
spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA4- border color only,Fail
spec@!opengl 1.1@texwrap formats bordercolor@GL_RGBA8- border color only,Fail
+
spec@!opengl 1.1@windowoverlap,Fail
spec@!opengl 1.2@levelclamp,Fail
-spec@!opengl 1.2@teximage-errors,Fail
+
spec@!opengl 1.2@texwrap 3d,Fail
spec@!opengl 1.2@texwrap 3d@GL_RGBA8- NPOT,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@!opengl 1.2@texwrap 3d bordercolor,Fail
spec@!opengl 1.2@texwrap 3d bordercolor@GL_RGBA8- border color only,Fail
spec@!opengl 1.2@texwrap 3d proj,Fail
spec@!opengl 1.2@texwrap 3d proj@GL_RGBA8- NPOT- projected,Fail
spec@!opengl 1.2@texwrap 3d proj bordercolor,Fail
spec@!opengl 1.2@texwrap 3d proj bordercolor@GL_RGBA8- projected- border color only,Fail
-spec@!opengl 1.4@gl-1.4-polygon-offset,Fail
+
spec@!opengl 1.4@tex-miplevel-selection,Fail
spec@!opengl 1.4@tex-miplevel-selection-lod,Fail
spec@!opengl 1.4@tex-miplevel-selection-lod-bias,Fail
spec@!opengl 1.5@depth-tex-compare,Fail
-spec@!opengl 2.0@max-samplers,Fail
-spec@!opengl 2.0@max-samplers border,Fail
+
+# Need to be able to report 0 OQ bits, since there are no HW OQs.
spec@!opengl 2.0@occlusion-query-discard,Fail
+
spec@!opengl 2.0@tex3d-npot,Fail
spec@!opengl 2.0@vertex-program-two-side back front2,Fail
spec@!opengl 2.0@vertex-program-two-side back front2@vs and fs,Fail
@@ -176,22 +192,8 @@ spec@!opengl 2.0@vertex-program-two-side front2 back2@vs and fs,Fail
spec@!opengl 2.1@pbo,Fail
spec@!opengl 2.1@pbo@test_polygon_stip,Fail
spec@!opengl 2.1@polygon-stipple-fs,Fail
-spec@!opengl es 2.0@draw_buffers_gles2,Fail
+
spec@arb_arrays_of_arrays@execution@glsl-arrays-copy-size-mismatch,Fail
-spec@arb_color_buffer_float@gl_rgba16f-render,Crash
-spec@arb_color_buffer_float@gl_rgba16f-render-fog,Crash
-spec@arb_color_buffer_float@gl_rgba16f-render-sanity,Crash
-spec@arb_color_buffer_float@gl_rgba16f-render-sanity-fog,Crash
-spec@arb_color_buffer_float@gl_rgba32f-render,Crash
-spec@arb_color_buffer_float@gl_rgba32f-render-fog,Crash
-spec@arb_color_buffer_float@gl_rgba32f-render-sanity,Crash
-spec@arb_color_buffer_float@gl_rgba32f-render-sanity-fog,Crash
-spec@arb_color_buffer_float@gl_rgba8-render,Crash
-spec@arb_color_buffer_float@gl_rgba8-render-fog,Crash
-spec@arb_color_buffer_float@gl_rgba8_snorm-render,Crash
-spec@arb_color_buffer_float@gl_rgba8_snorm-render-fog,Crash
-spec@arb_color_buffer_float@gl_rgba8_snorm-render-sanity,Crash
-spec@arb_color_buffer_float@gl_rgba8_snorm-render-sanity-fog,Crash
spec@arb_depth_texture@depth-level-clamp,Fail
spec@arb_depth_texture@fbo-clear-formats,Fail
spec@arb_depth_texture@fbo-clear-formats@GL_DEPTH_COMPONENT,Fail
@@ -208,15 +210,21 @@ spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT24 NPOT,Fail
spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32,Fail
spec@arb_depth_texture@fbo-generatemipmap-formats@GL_DEPTH_COMPONENT32 NPOT,Fail
spec@arb_depth_texture@texdepth,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@arb_depth_texture@texwrap formats bordercolor,Fail
spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT16- border color only,Fail
spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT24- border color only,Fail
spec@arb_depth_texture@texwrap formats bordercolor@GL_DEPTH_COMPONENT32- border color only,Fail
-spec@arb_es2_compatibility@arb_es2_compatibility-drawbuffers,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@arb_es2_compatibility@texwrap formats bordercolor,Fail
spec@arb_es2_compatibility@texwrap formats bordercolor@GL_RGB565- border color only,Fail
+
+# fails at 4 samples (for our 4 supported indirections) because the 1D coordinate workaround
+# causes an extra texture indirection phase.
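+#
+# (Hedged illustration, not the test's source: in GLSL terms a dependent read
+# like texture2D(t1, texture2D(t0, uv).xy) starts a new indirection phase,
+# and this HW only has 4 phases to spend.)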
spec@arb_fragment_program@fp-indirections2,Fail
-spec@arb_fragment_program@minmax,Fail
+
spec@arb_fragment_program_shadow@tex-shadow1d,Fail
spec@arb_fragment_program_shadow@tex-shadow2d,Fail
spec@arb_fragment_program_shadow@tex-shadow2drect,Fail
@@ -229,20 +237,25 @@ spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl
spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index4,Fail
spec@arb_framebuffer_object@arb_framebuffer_object-depth-stencil-blit stencil gl_stencil_index8,Fail
spec@arb_framebuffer_object@fbo-attachments-blit-scaled-linear,Fail
+
+# No derivatives support in the HW.
spec@arb_framebuffer_object@fbo-deriv,Fail
+
spec@arb_framebuffer_object@fbo-generatemipmap-1d,Fail
spec@arb_framebuffer_object@fbo-generatemipmap-cubemap s3tc_dxt1,Fail
spec@arb_framebuffer_object@framebuffer-blit-levels draw stencil,Fail
spec@arb_framebuffer_object@framebuffer-blit-levels read stencil,Fail
spec@arb_framebuffer_object@same-attachment-glframebuffertexture2d-gl_depth_stencil_attachment,Fail
spec@arb_internalformat_query2@all internalformat__type pname checks,Timeout
+
+# Need to be able to report 0 OQ bits, since there are no HW OQs.
spec@arb_occlusion_query2@render,Fail
spec@arb_occlusion_query@occlusion_query,Fail
spec@arb_occlusion_query@occlusion_query_conform,Fail
spec@arb_occlusion_query@occlusion_query_meta_fragments,Fail
spec@arb_occlusion_query@occlusion_query_meta_no_fragments,Fail
spec@arb_occlusion_query@occlusion_query_meta_save,Fail
-spec@arb_occlusion_query@occlusion_query_order,Crash
+
spec@arb_pixel_buffer_object@cubemap npot pbo,Fail
spec@arb_pixel_buffer_object@fbo-pbo-readpixels-small,Fail
spec@arb_pixel_buffer_object@pbo-getteximage,Fail
@@ -254,6 +267,8 @@ spec@arb_point_parameters@arb_point_parameters-point-attenuation@Aliased combina
spec@arb_provoking_vertex@arb-provoking-vertex-render,Fail
spec@arb_sampler_objects@gl_ext_texture_srgb_decode,Fail
spec@arb_shader_texture_lod@execution@glsl-fs-texturelod-01,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@arb_texture_compression@texwrap formats bordercolor,Fail
spec@arb_texture_compression@texwrap formats bordercolor@GL_COMPRESSED_ALPHA- border color only,Fail
spec@arb_texture_compression@texwrap formats bordercolor@GL_COMPRESSED_INTENSITY- border color only,Fail
@@ -261,6 +276,7 @@ spec@arb_texture_compression@texwrap formats bordercolor@GL_COMPRESSED_LUMINANCE
spec@arb_texture_compression@texwrap formats bordercolor@GL_COMPRESSED_LUMINANCE_ALPHA- border color only,Fail
spec@arb_texture_compression@texwrap formats bordercolor@GL_COMPRESSED_RGB- border color only,Fail
spec@arb_texture_compression@texwrap formats bordercolor@GL_COMPRESSED_RGBA- border color only,Fail
+
spec@arb_texture_cube_map@copyteximage cube,Fail
spec@arb_texture_cube_map@cubemap npot,Fail
spec@arb_texture_rectangle@glsl-fs-shadow2drect,Fail
@@ -276,45 +292,41 @@ spec@arb_texture_rectangle@tex-miplevel-selection gl2:textureproj 2drectshadow,F
# IN[x].xyxw setting up the TXP coords.
spec@arb_texture_rectangle@texrect-many,Fail
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@arb_texture_rectangle@texwrap rect bordercolor,Fail
spec@arb_texture_rectangle@texwrap rect bordercolor@GL_RGBA8- border color only,Fail
spec@arb_texture_rectangle@texwrap rect proj bordercolor,Fail
spec@arb_texture_rectangle@texwrap rect proj bordercolor@GL_RGBA8- projected- border color only,Fail
+
+# "Mesa: User error: GL_INVALID_OPERATION in glTexSubImage2D(invalid texture level 8)
+# Probe color at (80,80)
+# Expected: 0.000000 1.000000 0.000000
+# Observed: 1.000000 0.000000 0.000000"
+# texture-storage: wrong color for mipmap level 1
+# PIGLIT: {"subtest": {"2D mipmap rendering " : "fail"}}"
spec@arb_texture_storage@texture-storage,Fail
spec@arb_texture_storage@texture-storage@2D mipmap rendering ,Fail
+
+# test checks for required ext (which we don't have) after compiling the shader
+# that uses dynamic loops.
+spec@arb_timer_query@query gl_timestamp,Fail
+
+# "Probe color at (68,68)
+# Left: 0.784314 1.000000 1.000000 1.000000
+# Right: 0.800000 1.000000 1.000000 1.000000
+# PIGLIT: {"subtest": {"mov 8*c0" : "fail"}}"
spec@ati_fragment_shader@ati_fragment_shader-render-ops,Fail
spec@ati_fragment_shader@ati_fragment_shader-render-ops@mov 8*c0,Fail
+
spec@egl 1.4@eglterminate then unbind context,Fail
spec@egl_ext_protected_content@conformance,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_component24,Fail
spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_rgba,Fail
spec@egl_khr_surfaceless_context@viewport,Fail
-spec@ext_direct_state_access@indexed-state-queries 12,Fail
-spec@ext_direct_state_access@indexed-state-queries 12@GetIntegerIndexedvEXT,Fail
-spec@ext_direct_state_access@multi-texture,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameterfEXT,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameterfEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameterfEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameteriEXT,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameteriEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameteriEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameterivEXT,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameterivEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@multi-texture@MultiTexParameterivEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureImage2DEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterfEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameteriEXT + display list GL_COMPILE_AND_EXECUTE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE,Fail
-spec@ext_direct_state_access@textures@TextureParameterivEXT + display list GL_COMPILE_AND_EXECUTE,Fail
+
+# "Mesa: User error: GL_INVALID_OPERATION in unsupported function called (unsupported extension or deprecated function?)"
spec@ext_draw_instanced@ext_draw_instanced-drawarrays,Fail
+
spec@ext_external_objects@vk-buf-exchange,Fail
spec@ext_external_objects@vk-depth-display,Fail
spec@ext_external_objects@vk-image-display,Fail
@@ -325,7 +337,13 @@ spec@ext_external_objects@vk-semaphores,Fail
spec@ext_external_objects@vk-semaphores-2,Fail
spec@ext_external_objects@vk-vert-buf-reuse,Fail
spec@ext_external_objects@vk-vert-buf-update-errors,Fail
+
+# "Failed blit src(0,0;1,1) - dst(0,0;2147483647,2147483647)
+# Failed blit src(0,0;40,40) - dst(0,0;134217727,134217727)
+# ..."
spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail
+
+# Unsupported non-unrolled loop
spec@ext_framebuffer_multisample@accuracy all_samples color depthstencil,Fail
spec@ext_framebuffer_multisample@accuracy all_samples color depthstencil linear,Fail
spec@ext_framebuffer_multisample@accuracy all_samples color small depthstencil,Fail
@@ -342,21 +360,36 @@ spec@ext_framebuffer_multisample@accuracy all_samples stencil_draw depthstencil,
spec@ext_framebuffer_multisample@accuracy all_samples stencil_draw small depthstencil,Fail
spec@ext_framebuffer_multisample@accuracy all_samples stencil_resolve depthstencil,Fail
spec@ext_framebuffer_multisample@accuracy all_samples stencil_resolve small depthstencil,Fail
-spec@ext_framebuffer_object@fbo-blending-format-quirks,Fail
+
spec@ext_framebuffer_object@fbo-blending-formats,Fail
+# "Probe color at (192,0)
+# Expected: 0.810000 0.000000 0.000000 1.000000
+# Observed: 0.901961 0.000000 0.000000 1.000000
+# when testing FBO result, blending with DST_ALPHA."
spec@ext_framebuffer_object@fbo-blending-formats@GL_INTENSITY,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_INTENSITY12,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_INTENSITY16,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_INTENSITY4,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_INTENSITY8,Fail
+# "Testing GL_LUMINANCE4
+# Probe color at (192,0)
+# Expected: 0.800000 0.000000 0.000000 1.000000
+# Observed: 0.901961 0.000000 0.000000 1.000000
+# when testing FBO result, blending with DST_ALPHA."
spec@ext_framebuffer_object@fbo-blending-formats@GL_LUMINANCE,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_LUMINANCE12,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_LUMINANCE16,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_LUMINANCE4,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_LUMINANCE8,Fail
+
spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB10,Fail
+# "Probe color at (64,0)
+# Expected: 0.480000 0.800000 0.680000 1.000000
+# Observed: 0.533333 0.800000 0.666667 0.466667
+# when testing window result, blending with CONSTANT_COLOR."
spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB4,Fail
spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB5,Fail
+
spec@ext_framebuffer_object@fbo-clear-formats,Fail
spec@ext_framebuffer_object@fbo-clear-formats@GL_INTENSITY,Fail
spec@ext_framebuffer_object@fbo-clear-formats@GL_INTENSITY12,Fail
@@ -385,7 +418,10 @@ spec@ext_framebuffer_object@fbo-colormask-formats@GL_LUMINANCE16,Fail
spec@ext_framebuffer_object@fbo-colormask-formats@GL_LUMINANCE4,Fail
spec@ext_framebuffer_object@fbo-colormask-formats@GL_LUMINANCE8,Fail
spec@ext_framebuffer_object@fbo-depth-sample-compare,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4980
spec@ext_framebuffer_object@fbo-fragcoord2,Fail
+
spec@ext_framebuffer_object@fbo-generatemipmap-filtering,Fail
spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index1-blit,Fail
spec@ext_framebuffer_object@fbo-stencil-gl_stencil_index16-blit,Fail
@@ -410,7 +446,10 @@ spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuv420,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuyv,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvu420,Fail
spec@ext_image_dma_buf_import@ext_image_dma_buf_import-transcode-nv12-as-r8-gr88,Fail
+
+# Need to be able to report 0 OQ bits, since there are no HW OQs.
spec@ext_occlusion_query_boolean@any-samples,Fail
+
spec@ext_packed_depth_stencil@depth_stencil texture,Fail
spec@ext_packed_depth_stencil@fbo-clear-formats,Fail
spec@ext_packed_depth_stencil@fbo-clear-formats@GL_DEPTH24_STENCIL8,Fail
@@ -420,6 +459,8 @@ spec@ext_packed_depth_stencil@fbo-clear-formats stencil@GL_DEPTH24_STENCIL8,Fail
spec@ext_packed_depth_stencil@fbo-clear-formats stencil@GL_DEPTH_STENCIL,Fail
spec@ext_packed_depth_stencil@fbo-depthstencil-gl_depth24_stencil8-clear,Fail
spec@ext_packed_depth_stencil@fbo-stencil-gl_depth24_stencil8-blit,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@ext_packed_depth_stencil@texwrap formats bordercolor,Fail
spec@ext_packed_depth_stencil@texwrap formats bordercolor@GL_DEPTH24_STENCIL8- border color only,Fail
spec@ext_texture_compression_s3tc@texwrap formats bordercolor,Fail
@@ -427,8 +468,27 @@ spec@ext_texture_compression_s3tc@texwrap formats bordercolor@GL_COMPRESSED_RGBA
spec@ext_texture_compression_s3tc@texwrap formats bordercolor@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT- border color only,Fail
spec@ext_texture_compression_s3tc@texwrap formats bordercolor@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT- border color only,Fail
spec@ext_texture_compression_s3tc@texwrap formats bordercolor@GL_COMPRESSED_RGB_S3TC_DXT1_EXT- border color only,Fail
+
+# "Unexpected GL error: GL_NO_ERROR 0x0
+# (Error at tests/spec/ext_texture_format_bgra8888/api-errors.c:63)
+# Expected GL error: GL_INVALID_OPERATION 0x502
+# Unexpected GL error: GL_NO_ERROR 0x0
+# (Error at tests/spec/ext_texture_format_bgra8888/api-errors.c:69)
+# Expected GL error: GL_INVALID_OPERATION 0x502
+# Mesa: User error: GL_INVALID_OPERATION in glTexImage2D(format = GL_BGRA, type = GL_FLOAT, internalformat = GL_BGRA)
+# Unexpected GL error: GL_NO_ERROR 0x0
+# (Error at tests/spec/ext_texture_format_bgra8888/api-errors.c:103)
+# Expected GL error: GL_INVALID_OPERATION 0x502
+# Mesa: User error: GL_INVALID_OPERATION in glTexSubImage2D(format = GL_BGRA, type = GL_FLOAT, internalformat = GL_BGRA)
+# PIGLIT: {"result": "fail" }"
spec@ext_texture_format_bgra8888@api-errors,Fail
+
+# "Probe color at (20,0)
+# Expected: 0 76 0
+# Observed: 0 18 0"
spec@ext_texture_srgb@tex-srgb,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4985
spec@ext_texture_srgb@texwrap formats bordercolor,Fail
spec@ext_texture_srgb@texwrap formats bordercolor@GL_SLUMINANCE8- border color only,Fail
spec@ext_texture_srgb@texwrap formats bordercolor@GL_SLUMINANCE8_ALPHA8- border color only,Fail
@@ -443,8 +503,7 @@ spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_
spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT- border color only,Fail
spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT- border color only,Fail
spec@ext_texture_srgb@texwrap formats-s3tc bordercolor@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- border color only,Fail
-spec@glsl-1.10@execution@built-in-functions@fs-atan-vec3-vec3,Fail
-spec@glsl-1.10@execution@built-in-functions@fs-atan-vec4-vec4,Fail
+
spec@glsl-1.10@execution@built-in-functions@fs-degrees-float,Fail
spec@glsl-1.10@execution@built-in-functions@fs-degrees-vec2,Fail
spec@glsl-1.10@execution@built-in-functions@fs-degrees-vec3,Fail
@@ -615,17 +674,22 @@ spec@glsl-1.10@execution@built-in-functions@fs-smoothstep-vec3-vec3-vec3,Fail
spec@glsl-1.10@execution@built-in-functions@fs-smoothstep-vec4-vec4-vec4,Fail
spec@glsl-1.10@execution@builtins@glsl-fs-atan-3,Fail
spec@glsl-1.10@execution@copy-propagation@glsl-copy-propagation-loop-1,Fail
-spec@glsl-1.10@execution@copy-propagation@glsl-copy-propagation-loop-2,Fail
+
+# No derivatives support in the HW.
spec@glsl-1.10@execution@derivatives@glsl-derivs-abs,Fail
spec@glsl-1.10@execution@derivatives@glsl-derivs-abs-sign,Fail
spec@glsl-1.10@execution@derivatives@glsl-derivs-sign,Fail
spec@glsl-1.10@execution@derivatives@glsl-derivs-swizzle,Fail
spec@glsl-1.10@execution@derivatives@glsl-derivs-varyings,Fail
+
spec@glsl-1.10@execution@discard@glsl-fs-discard-04,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4980
spec@glsl-1.10@execution@fs-frontfacing-ternary-0.0-neg-1.0,Fail
spec@glsl-1.10@execution@fs-frontfacing-ternary-1-neg-1,Fail
spec@glsl-1.10@execution@fs-frontfacing-ternary-1.0-neg-1.0,Fail
spec@glsl-1.10@execution@fs-frontfacing-ternary-neg-1.0-1.0,Fail
+
spec@glsl-1.10@execution@fs-loop-bounds-unrolled,Fail
spec@glsl-1.10@execution@fs-notequal-of-expression,Fail
spec@glsl-1.10@execution@fs-sign-times-abs,Fail
@@ -638,11 +702,8 @@ spec@glsl-1.10@execution@glsl-1.10-built-in-uniform-state,Fail
spec@glsl-1.10@execution@glsl-clamp-vertex-color,Fail
spec@glsl-1.10@execution@glsl-fs-convolution-1,Fail
spec@glsl-1.10@execution@glsl-fs-convolution-2,Fail
-spec@glsl-1.10@execution@glsl-fs-functions-5,Fail
-spec@glsl-1.10@execution@glsl-fs-functions-6,Fail
spec@glsl-1.10@execution@glsl-fs-if-nested-loop,Fail
spec@glsl-1.10@execution@glsl-fs-loop-while-false-03,Fail
-spec@glsl-1.10@execution@glsl-texcoord-array-2,Fail
spec@glsl-1.10@execution@interpolation@interpolation-none-gl_backcolor-flat-fixed,Fail
spec@glsl-1.10@execution@interpolation@interpolation-none-gl_backcolor-flat-none,Fail
spec@glsl-1.10@execution@interpolation@interpolation-none-gl_backcolor-flat-vertex,Fail
@@ -663,19 +724,10 @@ spec@glsl-1.10@execution@interpolation@interpolation-none-gl_frontsecondarycolor
spec@glsl-1.10@execution@interpolation@interpolation-none-gl_frontsecondarycolor-smooth-vertex,Fail
spec@glsl-1.10@execution@loops@glsl-fs-continue-inside-do-while,Fail
spec@glsl-1.10@execution@loops@glsl-fs-loop-300,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-break,Fail
+
spec@glsl-1.10@execution@loops@glsl-fs-loop-continue,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-ge,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-gt,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-le,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-lt,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-nested-if,Fail
spec@glsl-1.10@execution@loops@glsl-fs-loop-redundant-condition,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-two-counter-01,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-two-counter-02,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-two-counter-03,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-two-counter-04,Fail
-spec@glsl-1.10@execution@loops@glsl-fs-loop-zero-iter,Fail
+spec@glsl-1.10@execution@loops@glsl-fs-loop-shadow-variables,Fail
spec@glsl-1.10@execution@loops@glsl-fs-unroll-out-param,Fail
spec@glsl-1.10@execution@loops@glsl-fs-unroll-side-effect,Fail
spec@glsl-1.10@execution@samplers@glsl-fs-shadow1d,Fail
@@ -712,11 +764,9 @@ spec@glsl-1.10@execution@variable-indexing@fs-temp-array-mat3-index-row-wr,Fail
spec@glsl-1.10@execution@variable-indexing@fs-temp-array-mat4-index-col-row-wr,Fail
spec@glsl-1.10@execution@variable-indexing@fs-temp-array-mat4-index-col-wr,Fail
spec@glsl-1.10@execution@variable-indexing@fs-temp-array-mat4-index-row-wr,Fail
-spec@glsl-1.10@execution@variable-indexing@fs-temp-array-mat4-index-wr,Fail
spec@glsl-1.10@execution@variable-indexing@fs-uniform-array-mat4-index-col-row-rd,Fail
spec@glsl-1.10@execution@variable-indexing@fs-varying-array-mat3-index-col-rd,Fail
spec@glsl-1.10@execution@variable-indexing@fs-varying-array-mat3-index-col-row-rd,Fail
-spec@glsl-1.10@execution@variable-indexing@vs-output-array-float-index-wr,Fail
spec@glsl-1.10@execution@variable-indexing@vs-output-array-vec2-index-wr,Fail
spec@glsl-1.10@execution@variable-indexing@vs-output-array-vec3-index-wr,Fail
spec@glsl-1.10@execution@variable-indexing@vs-output-array-vec4-index-wr,Fail
@@ -753,6 +803,7 @@ spec@glsl-1.10@execution@varying-packing@simple vec3 separate,Fail
spec@glsl-1.10@execution@varying-packing@simple vec4 array,Fail
spec@glsl-1.10@execution@varying-packing@simple vec4 separate,Fail
spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail
+
spec@glsl-1.20@execution@built-in-functions@fs-matrixcompmult-mat2x3-mat2x3,Fail
spec@glsl-1.20@execution@built-in-functions@fs-matrixcompmult-mat2x4-mat2x4,Fail
spec@glsl-1.20@execution@built-in-functions@fs-matrixcompmult-mat3x2-mat3x2,Fail
@@ -877,7 +928,6 @@ spec@glsl-1.20@execution@built-in-functions@fs-outerproduct-vec4-vec4,Fail
spec@glsl-1.20@execution@fs-const-array-of-struct,Fail
spec@glsl-1.20@execution@fs-const-array-of-struct-of-array,Fail
spec@glsl-1.20@execution@fs-function-inout-array-of-structs,Fail
-spec@glsl-1.20@execution@fs-mix-1.0,Fail
spec@glsl-1.20@execution@fs-nan-builtin-max,Fail
spec@glsl-1.20@execution@fs-nan-builtin-min,Fail
spec@glsl-1.20@execution@fs-vec4-const-array-indirect-access-032-elements,Fail
@@ -885,8 +935,11 @@ spec@glsl-1.20@execution@fs-vec4-const-array-indirect-access-048-elements,Fail
spec@glsl-1.20@execution@fs-vec4-const-array-indirect-access-064-elements,Fail
spec@glsl-1.20@execution@fs-vec4-const-array-indirect-access-128-elements,Fail
spec@glsl-1.20@execution@fs-vec4-const-array-indirect-access-256-elements,Fail
+
+# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4980
spec@glsl-1.20@execution@glsl-fs-frontfacing,Fail
spec@glsl-1.20@execution@glsl-fs-frontfacing-not,Fail
+
spec@glsl-1.20@execution@out-parameter-indexing@fs-inout-index-two-level,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1d,Fail
spec@glsl-1.20@execution@tex-miplevel-selection gl2:texture() 1dshadow,Fail
@@ -923,8 +976,6 @@ spec@glsl-1.20@execution@uniform-initializer@fs-mat4,Fail
spec@glsl-1.20@execution@uniform-initializer@fs-mat4-array,Fail
spec@glsl-1.20@execution@uniform-initializer@fs-mat4-from-const,Fail
spec@glsl-1.20@execution@uniform-initializer@fs-mat4-set-by-other-stage,Fail
-spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat2-index-col-wr,Fail
-spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat2-index-row-wr,Fail
spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat2-index-col-row-wr,Fail
spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat3-index-col-row-wr,Fail
spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat3-index-col-wr,Fail
@@ -932,7 +983,6 @@ spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat3-index-row-wr,Fail
spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-col-row-wr,Fail
spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-col-wr,Fail
spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-row-wr,Fail
-spec@glsl-1.20@execution@variable-indexing@fs-temp-array-mat4-index-wr,Fail
spec@glsl-1.20@execution@variable-indexing@fs-uniform-array-mat4-index-col-row-rd,Fail
spec@glsl-1.20@execution@variable-indexing@fs-varying-array-mat3-index-col-rd,Fail
spec@glsl-1.20@execution@variable-indexing@fs-varying-array-mat3-index-col-row-rd,Fail
@@ -942,7 +992,6 @@ spec@glsl-1.20@execution@variable-indexing@vs-varying-array-mat3-index-col-row-w
spec@glsl-1.20@execution@variable-indexing@vs-varying-array-mat3-index-col-wr,Fail
spec@glsl-1.20@execution@variable-indexing@vs-varying-array-mat3-index-row-wr,Fail
spec@glsl-1.20@execution@variable-indexing@vs-varying-array-mat3-index-wr,Fail
-spec@glsl-es-1.00@execution@unroll-do-while-false-loop-only-once,Fail
spec@intel_performance_query@intel_performance_query-issue_2235,Fail
spec@nv_primitive_restart@primitive-restart-draw-mode-lines,Fail
spec@nv_primitive_restart@primitive-restart-draw-mode-quad_strip,Fail
diff --git a/mesa 3D driver/src/gallium/drivers/i915/ci/traces-i915.yml b/mesa 3D driver/src/gallium/drivers/i915/ci/traces-i915.yml
index f35d482f75..85b8c4b4b3 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/ci/traces-i915.yml
+++ b/mesa 3D driver/src/gallium/drivers/i915/ci/traces-i915.yml
@@ -8,11 +8,11 @@ traces:
- path: glxgears/glxgears-2.trace
expectations:
- device: i915-g33
- checksum: 83b1a41392bec71f7aeea6f1170cc23d
+ checksum: ee8dcdb3b9eaef2b32a2914b89373419
- path: gputest/plot3d.trace
expectations:
- device: i915-g33
- checksum: dc187485584bc45b9050e88163e72c34
+ checksum: 4731890d1f782c106dd9c12af77b3607
- path: gputest/triangle.trace
expectations:
# Weird white bar behind Tux's head.
@@ -27,7 +27,7 @@ traces:
# The cel shading is pretty chunky, but maybe that's just precision stuff.
expectations:
- device: i915-g33
- checksum: 2258848deb4052a3c19145fe524e2805
+ checksum: 7518414e4f1a4f1e07c04ec1500f53e7
# Requires GLSL 1.30
#- path: humus/DynamicBranching3.trace
#  expectations:
@@ -41,7 +41,7 @@ traces:
- path: humus/Portals.trace
expectations:
- device: i915-g33
- checksum: 9306a8d9f37d83a327759319c46f7cae
+ checksum: f782469019417923831d5d53dbe6a507
# The shadow raytracing shader fails to compile
#- path: humus/RaytracedShadows.trace
#  expectations:
@@ -55,9 +55,9 @@ traces:
- path: neverball/neverball.trace
expectations:
- device: i915-g33
- checksum: 344479e929ad4bc7f7316b5e574b0131
+ checksum: f925aa92da47ebc757285f826113f4db
- path: valve/counterstrike-v2.trace
# Tree foliage rendering is extra aliased.
expectations:
- device: i915-g33
- checksum: 66a18e91b95674a3a1d227ed5681257d
+ checksum: d17298ad3bb44b43b6c17e017f7c1e6a
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_context.c b/mesa 3D driver/src/gallium/drivers/i915/i915_context.c
index d86e036b73..71e9f5defa 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_context.c
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_context.c
@@ -27,6 +27,7 @@
#include "i915_context.h"
#include "i915_batch.h"
+#include "i915_debug.h"
#include "i915_query.h"
#include "i915_resource.h"
#include "i915_screen.h"
@@ -42,8 +43,6 @@
#include "util/u_prim.h"
#include "util/u_upload_mgr.h"
-DEBUG_GET_ONCE_BOOL_OPTION(i915_no_vbuf, "I915_NO_VBUF", false)
/*
 * Draw functions
 */
@@ -113,7 +112,7 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
   /*
    * Do the drawing
    */
-   draw_vbo(i915->draw, info, drawid_offset, NULL, draws, num_draws);
+   draw_vbo(i915->draw, info, drawid_offset, NULL, draws, num_draws, 0);
   /*
    * unmap vertex/index buffers
@@ -216,7 +215,7 @@ i915_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
    */
   i915->draw = draw_create(&i915->base);
   assert(i915->draw);
-   if (!debug_get_option_i915_no_vbuf()) {
+   if (i915_debug & DBG_VBUF) {
      draw_set_rasterize_stage(i915->draw, i915_draw_vbuf_stage(i915));
   } else {
      draw_set_rasterize_stage(i915->draw, i915_draw_render_stage(i915));
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_context.h b/mesa 3D driver/src/gallium/drivers/i915/i915_context.h
index 00f892861a..44e7a9bc1d 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_context.h
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_context.h
@@ -36,6 +36,7 @@
#include "tgsi/tgsi_scan.h"
+#include "util/log.h"
#include "util/slab.h"
#include "util/u_blitter.h"
#include "i915_reg.h"
@@ -122,10 +123,15 @@ struct i915_fragment_shader {
   ubyte constant_flags[I915_MAX_CONSTANT];
   /**
-    * The mapping between generics and hw texture coords.
+    * The mapping between TGSI inputs and hw texture coords.
    * We need to share this between the vertex and fragment stages.
    **/
-   int generic_mapping[I915_TEX_UNITS];
+   struct {
+      enum tgsi_semantic semantic;
+      int index;
+   } texcoords[I915_TEX_UNITS];
+
+   bool reads_pntc;
};
struct i915_cache_context;
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_debug.c b/mesa 3D driver/src/gallium/drivers/i915/i915_debug.c
index fb258f0c85..e64c9b6245 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_debug.c
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_debug.c
@@ -43,6 +43,7 @@ static const struct debug_named_value i915_debug_options[] = {
   {"texture", DBG_TEXTURE, "Texture information"},
   {"constants", DBG_CONSTANTS, "Constant buffers"},
   {"fs", DBG_FS, "Dump fragment shaders"},
+   {"vbuf", DBG_VBUF, "Use the WIP vbuf code path"},
   DEBUG_NAMED_VALUE_END};
unsigned i915_debug = 0;
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_debug.h b/mesa 3D driver/src/gallium/drivers/i915/i915_debug.h
index 8195832aed..56db7cc88b 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_debug.h
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_debug.h
@@ -45,6 +45,7 @@ struct i915_winsys_batchbuffer;
#define DBG_TEXTURE 0x10
#define DBG_CONSTANTS 0x20
#define DBG_FS 0x40
+#define DBG_VBUF 0x80
extern unsigned i915_debug;
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_fpc.h b/mesa 3D driver/src/gallium/drivers/i915/i915_fpc.h
index 4c8c77e299..c223599400 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_fpc.h
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_fpc.h
@@ -39,10 +39,6 @@ struct nir_shader;
#define I915_PROGRAM_SIZE 192
-/* Use those indices for pos/face routing, must be >= num of inputs */
-#define I915_SEMANTIC_POS 100
-#define I915_SEMANTIC_FACE 101
-
/**
 * Program translation state
 */
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_fpc_translate.c b/mesa 3D driver/src/gallium/drivers/i915/i915_fpc_translate.c
index 940bfa5302..dc1832aa76 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -108,15 +108,18 @@ i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
}
static uint32_t
-get_mapping(struct i915_fragment_shader *fs, int unit)
+get_mapping(struct i915_fragment_shader *fs, enum tgsi_semantic semantic,
+            int index)
{
   int i;
   for (i = 0; i < I915_TEX_UNITS; i++) {
-      if (fs->generic_mapping[i] == -1) {
-         fs->generic_mapping[i] = unit;
+      if (fs->texcoords[i].semantic == -1) {
+         fs->texcoords[i].semantic = semantic;
+         fs->texcoords[i].index = index;
         return i;
      }
-      if (fs->generic_mapping[i] == unit)
+      if (fs->texcoords[i].semantic == semantic &&
+          fs->texcoords[i].index == index)
         return i;
   }
   debug_printf("Exceeded max generics\n");
@@ -158,9 +161,14 @@ src_vector(struct i915_fp_compile *p,
   sem_ind = p->shader->info.input_semantic_index[index];
   switch (sem_name) {
+   case TGSI_SEMANTIC_GENERIC:
+   case TGSI_SEMANTIC_TEXCOORD:
+   case TGSI_SEMANTIC_PCOORD:
   case TGSI_SEMANTIC_POSITION: {
-      /* for fragcoord */
-      int real_tex_unit = get_mapping(fs, I915_SEMANTIC_POS);
+      if (sem_name == TGSI_SEMANTIC_PCOORD)
+         fs->reads_pntc = true;
+
+      int real_tex_unit = get_mapping(fs, sem_name, sem_ind);
      src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit,
                           D0_CHANNEL_ALL);
      break;
@@ -179,15 +187,9 @@ src_vector(struct i915_fp_compile *p,
      src = i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W);
      src = swizzle(src, W, W, W, W);
      break;
-   case TGSI_SEMANTIC_GENERIC: {
-      int real_tex_unit = get_mapping(fs, sem_ind);
-      src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit,
-                           D0_CHANNEL_ALL);
-      break;
-   }
   case TGSI_SEMANTIC_FACE: {
      /* for back/front faces */
-      int real_tex_unit = get_mapping(fs, I915_SEMANTIC_FACE);
+      int real_tex_unit = get_mapping(fs, sem_name, sem_ind);
      src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + real_tex_unit,
                           D0_CHANNEL_X);
      break;
@@ -875,7 +877,7 @@ i915_init_compile(struct i915_context *i915, struct i915_fragment_shader *ifs)
   memset(&p->register_phases, 0, sizeof(p->register_phases));
   for (i = 0; i < I915_TEX_UNITS; i++)
-      ifs->generic_mapping[i] = -1;
+      ifs->texcoords[i].semantic = -1;
   p->log_program_errors = !i915->no_log_program_errors;
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_prim_vbuf.c b/mesa 3D driver/src/gallium/drivers/i915/i915_prim_vbuf.c
index 9dba054f78..fb192264d6 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_prim_vbuf.c
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_prim_vbuf.c
@@ -50,8 +50,6 @@
#include "i915_reg.h"
#include "i915_state.h"
-#define VBUF_MAP_BUFFER
-
/**
 * Primitive renderer for i915.
 */
@@ -82,12 +80,6 @@ struct i915_vbuf_render {
   void *vbo_ptr;
   size_t vbo_max_used;
   size_t vbo_max_index; /**< index offset to be added to all indices */
-
-#ifndef VBUF_MAP_BUFFER
-   size_t map_used_start;
-   size_t map_used_end;
-   size_t map_size;
-#endif
};
/**
@@ -196,14 +188,6 @@ i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
   i915_render->vbo_sw_offset = 0;
   i915_render->vbo_index = 0;
-#ifndef VBUF_MAP_BUFFER
-   if (i915_render->vbo_size > i915_render->map_size) {
-      i915_render->map_size = i915_render->vbo_size;
-      FREE(i915_render->vbo_ptr);
-      i915_render->vbo_ptr = MALLOC(i915_render->map_size);
-   }
-#endif
-
   i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, I915_NEW_VERTEX);
   i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, true);
@@ -264,11 +248,7 @@ i915_vbuf_render_map_vertices(struct vbuf_render *render)
   if (i915->vbo_flushed)
      debug_printf("%s bad vbo flush occurred stalling on hw\n", __FUNCTION__);
-#ifdef VBUF_MAP_BUFFER
   return (unsigned char *)i915_render->vbo_ptr + i915_render->vbo_sw_offset;
-#else
-   return (unsigned char *)i915_render->vbo_ptr;
-#endif
}
static void
@@ -276,24 +256,10 @@ i915_vbuf_render_unmap_vertices(struct vbuf_render *render, ushort min_index,
                                ushort max_index)
{
   struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
-   struct i915_context *i915 = i915_render->i915;
-   struct i915_winsys *iws = i915->iws;
   i915_render->vbo_max_index = max_index;
   i915_render->vbo_max_used = MAX2(i915_render->vbo_max_used,
                                    i915_render->vertex_size * (max_index + 1));
-#ifdef VBUF_MAP_BUFFER
-   (void)iws;
-#else
-   i915_render->map_used_start = i915_render->vertex_size * min_index;
-   i915_render->map_used_end = i915_render->vertex_size * (max_index + 1);
-   iws->buffer_write(
-      iws, i915_render->vbo,
-      i915_render->map_used_start + i915_render->vbo_sw_offset,
-      i915_render->map_used_end - i915_render->map_used_start,
-      (unsigned char *)i915_render->vbo_ptr + i915_render->map_used_start);
-
-#endif
}
/**
@@ -467,6 +433,9 @@ draw_arrays_fallback(struct vbuf_render *render, unsigned start, uint32_t nr)
      i915->vbo_flushed = 1;
   if (!BEGIN_BATCH(1 + (nr_indices + 1) / 2)) {
+      mesa_loge("i915: Failed to allocate space for %d indices in fresh "
+                "batch with %d bytes left\n",
+                nr_indices, (int)i915_winsys_batchbuffer_space(i915->batch));
      assert(0);
      goto out;
   }
@@ -625,6 +594,9 @@ i915_vbuf_render_draw_elements(struct vbuf_render *render,
      i915->vbo_flushed = 1;
   if (!BEGIN_BATCH(1 + (nr_indices + 1) / 2)) {
+      mesa_loge("i915: Failed to allocate space for %d indices in fresh "
+                "batch with %d bytes left\n",
+                nr_indices, (int)i915_winsys_batchbuffer_space(i915->batch));
      assert(0);
      goto out;
   }
@@ -677,8 +649,6 @@ static struct vbuf_render *
i915_vbuf_render_create(struct i915_context *i915)
{
   struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
-   struct i915_winsys *iws = i915->iws;
-   int i;
   i915_render->i915 = i915;
@@ -701,12 +671,6 @@ i915_vbuf_render_create(struct i915_context *i915)
   i915_render->base.release_vertices = i915_vbuf_render_release_vertices;
   i915_render->base.destroy = i915_vbuf_render_destroy;
-#ifndef VBUF_MAP_BUFFER
-   i915_render->map_size = 0;
-   i915_render->map_used_start = 0;
-   i915_render->map_used_end = 0;
-#endif
-
   i915_render->vbo = NULL;
   i915_render->vbo_ptr = NULL;
   i915_render->vbo_size = 0;
@@ -714,19 +678,6 @@ i915_vbuf_render_create(struct i915_context *i915)
   i915_render->vbo_sw_offset = 0;
   i915_render->vbo_alloc_size = i915_render->base.max_vertex_buffer_bytes * 4;
-#ifdef VBUF_USE_POOL
-   i915_render->pool_used = false;
-   i915_render->pool_buffer_size = i915_render->vbo_alloc_size;
-   i915_render->pool_fifo = u_fifo_create(6);
-   for (i = 0; i < 6; i++)
-      u_fifo_add(i915_render->pool_fifo,
-                 iws->buffer_create(iws, i915_render->pool_buffer_size,
-                                    I915_NEW_VERTEX));
-#else
-   (void)i;
-   (void)iws;
-#endif
-
   return &i915_render->base;
}
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_resource_texture.c b/mesa 3D driver/src/gallium/drivers/i915/i915_resource_texture.c
index a88444a589..75a1ce5bf1 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_resource_texture.c
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_resource_texture.c
@@ -193,9 +193,6 @@ i9x5_scanout_layout(struct i915_texture *tex)
   if (pt->last_level > 0 || util_format_get_blocksize(pt->format) != 4)
      return false;
-   i915_texture_set_level_info(tex, 0, 1);
-   i915_texture_set_image_offset(tex, 0, 0, 0, 0);
-
   if (pt->width0 >= 240) {
      tex->stride = align(util_format_get_stride(pt->format, pt->width0), 64);
      tex->total_nblocksy = align_nblocksy(pt->format, pt->height0, 8);
@@ -208,6 +205,10 @@ i9x5_scanout_layout(struct i915_texture *tex)
      return false;
   }
+   i915_texture_set_level_info(tex, 0, 1);
+   i915_texture_set_image_offset(tex, 0, 0, 0, 0);
+
+
#if DEBUG_TEXTURE
   debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
                pt->width0, pt->height0, util_format_get_blocksize(pt->format),
@@ -647,7 +648,7 @@ i945_texture_layout_cube(struct i915_texture *tex)
   unsigned level;
   unsigned face;
-   assert(pt->width0 == pt->height0); /* cubemap images are square */
+   assert(pt->width0 == pt->height0);          /* cubemap images are square */
   assert(util_format_is_compressed(pt->format)); /* compressed only */
   /*
diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_screen.c b/mesa 3D driver/src/gallium/drivers/i915/i915_screen.c
index 417f96bb4b..e08c6d056a 100644
--- a/mesa 3D driver/src/gallium/drivers/i915/i915_screen.c
+++ b/mesa 3D driver/src/gallium/drivers/i915/i915_screen.c
@@ -118,6 +118,7 @@ static const nir_shader_compiler_options i915_compiler_options = {
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
   .use_interpolated_input_intrinsics = true,
+   .force_indirect_unrolling = ~0,
};
static const struct nir_shader_compiler_options gallivm_nir_options = {
@@ -134,7 +135,8 @@ static const struct nir_shader_compiler_options gallivm_nir_options = {
   .lower_ffma64 = true,
   .lower_fmod = true,
   .lower_hadd = true,
-   .lower_add_sat = true,
+   .lower_uadd_sat = true,
+   .lower_iadd_sat = true,
   .lower_ldexp = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_snorm_4x8 = true,
@@ -174,6 +176,97 @@ i915_get_compiler_options(struct pipe_screen *pscreen, enum pipe_shader_ir ir,
   return &gallivm_nir_options;
}
+static void
+i915_optimize_nir(struct nir_shader *s)
+{
+   bool progress;
+
+   do {
+      progress = false;
+
+      NIR_PASS_V(s, nir_lower_vars_to_ssa);
+
+      NIR_PASS(progress, s, nir_copy_prop);
+      NIR_PASS(progress, s, nir_opt_algebraic);
+      NIR_PASS(progress, s, nir_opt_constant_folding);
+      NIR_PASS(progress, s, nir_opt_remove_phis);
+      NIR_PASS(progress, s, nir_opt_conditional_discard);
+      NIR_PASS(progress, s, nir_opt_dce);
+      NIR_PASS(progress, s, nir_opt_dead_cf);
+      NIR_PASS(progress, s, nir_opt_cse);
+      NIR_PASS(progress, s, nir_opt_find_array_copies);
+      NIR_PASS(progress, s, nir_opt_if, true);
+      NIR_PASS(progress, s, nir_opt_peephole_select, ~0 /* flatten all IFs. */,
+               true, true);
+      NIR_PASS(progress, s, nir_opt_algebraic);
+      NIR_PASS(progress, s, nir_opt_constant_folding);
+      NIR_PASS(progress, s, nir_opt_shrink_vectors, true);
+      NIR_PASS(progress, s, nir_opt_trivial_continues);
+      NIR_PASS(progress, s, nir_opt_undef);
+      NIR_PASS(progress, s, nir_opt_loop_unroll);
+
+   } while (progress);
+
+   NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp,
+            NULL);
+}
+
+static char *
+i915_check_control_flow(nir_shader *s)
+{
+   if (s->info.stage == MESA_SHADER_FRAGMENT) {
+      nir_function_impl *impl = nir_shader_get_entrypoint(s);
+      nir_block *first = nir_start_block(impl);
+      nir_cf_node *next = nir_cf_node_next(&first->cf_node);
+
+      if (next) {
+         switch (next->type) {
+         case nir_cf_node_if:
+            return "if/then statements not supported by i915 fragment shaders, should have been flattened by peephole_select.";
+         case nir_cf_node_loop:
+            return "looping not supported in i915 fragment shaders, all loops must be statically unrollable.";
+         default:
+            return "Unknown control flow type";
+         }
+      }
+   }
+
+   return NULL;
+}
+
+static char *
+i915_finalize_nir(struct pipe_screen *pscreen, void *nir)
+{
+   nir_shader *s = nir;
+
+   if (s->info.stage == MESA_SHADER_FRAGMENT)
+      i915_optimize_nir(s);
+
+   /* st_program.c's parameter list optimization requires that future nir
+    * variants don't reallocate the uniform storage, so we have to remove
+    * uniforms that occupy storage. But we don't want to remove samplers,
+    * because they're needed for YUV variant lowering.
+    */
+   nir_remove_dead_derefs(s);
+   nir_foreach_uniform_variable_safe(var, s)
+   {
+      if (var->data.mode == nir_var_uniform &&
+          (glsl_type_get_image_count(var->type) ||
+           glsl_type_get_sampler_count(var->type)))
+         continue;
+
+      exec_node_remove(&var->node);
+   }
+   nir_validate_shader(s, "after uniform var removal");
+
+   nir_sweep(s);
+
+   char *msg = i915_check_control_flow(s);
+   if (msg)
+      return strdup(msg);
+
+   return NULL;
+}
+
static int
i915_get_shader_param(struct pipe_screen *screen, enum pipe_shader_type shader,
                      enum pipe_shader_cap cap)
@@ -226,7 +319,7 @@ i915_get_shader_param(struct pipe_screen *screen, enum pipe_shader_type shader,
   case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
      return I915_MAX_TEX_INSN;
   case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
-      return 8;
+      return 4;
   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
      return 0;
   case PIPE_SHADER_CAP_MAX_INPUTS:
@@ -238,7 +331,8 @@ i915_get_shader_param(struct pipe_screen *screen, enum pipe_shader_type shader,
   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
      return 1;
   case PIPE_SHADER_CAP_MAX_TEMPS:
-      return 12; /* XXX: 12 -> 32 ? */
+      /* 16 inter-phase temps, 3 intra-phase temps. i915c reported 16, too. */
*/ + return 16; case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: return 0; @@ -302,6 +396,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: + case PIPE_CAP_TGSI_TEXCOORD: return 1; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: @@ -596,6 +691,7 @@ i915_screen_create(struct i915_winsys *iws) is->base.get_shader_param = i915_get_shader_param; is->base.get_paramf = i915_get_paramf; is->base.get_compiler_options = i915_get_compiler_options; + is->base.finalize_nir = i915_finalize_nir; is->base.is_format_supported = i915_is_format_supported; is->base.context_create = i915_create_context; diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_state.c b/mesa 3D driver/src/gallium/drivers/i915/i915_state.c index 947a67c149..c0c5ce5e64 100644 --- a/mesa 3D driver/src/gallium/drivers/i915/i915_state.c +++ b/mesa 3D driver/src/gallium/drivers/i915/i915_state.c @@ -580,6 +580,10 @@ i915_bind_fs_state(struct pipe_context *pipe, void *shader) draw_bind_fragment_shader(i915->draw, (i915->fs ? i915->fs->draw_data : NULL)); + /* Tell draw if we need to do point sprites so we can get PNTC. */ + if (i915->fs) + draw_wide_point_sprites(i915->draw, i915->fs->reads_pntc); + i915->dirty |= I915_NEW_FS; } @@ -714,6 +718,7 @@ static void i915_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { if (shader != PIPE_SHADER_FRAGMENT) { @@ -732,11 +737,23 @@ i915_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, /* Check for no-op */ if (views && num == i915->num_fragment_sampler_views && !memcmp(i915->fragment_sampler_views, views, - num * sizeof(struct pipe_sampler_view *))) + num * sizeof(struct pipe_sampler_view *))) { + if (take_ownership) { + for (unsigned i = 0; i < num; i++) { + struct pipe_sampler_view *view = views[i]; + pipe_sampler_view_reference(&view, NULL); + } + } return; + } for (i = 0; i < num; i++) { - pipe_sampler_view_reference(&i915->fragment_sampler_views[i], views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&i915->fragment_sampler_views[i], NULL); + i915->fragment_sampler_views[i] = views[i]; + } else { + pipe_sampler_view_reference(&i915->fragment_sampler_views[i], views[i]); + } } for (i = num; i < i915->num_fragment_sampler_views; i++) @@ -815,6 +832,8 @@ i915_set_framebuffer_state(struct pipe_context *pipe, pipe_surface_reference(&i915->framebuffer.cbufs[0], NULL); } pipe_surface_reference(&i915->framebuffer.zsbuf, fb->zsbuf); + if (fb->zsbuf) + draw_set_zs_format(i915->draw, fb->zsbuf->format); i915->dirty |= I915_NEW_FRAMEBUFFER; } diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_state_derived.c b/mesa 3D driver/src/gallium/drivers/i915/i915_state_derived.c index db565e80ff..37a0d16181 100644 --- a/mesa 3D driver/src/gallium/drivers/i915/i915_state_derived.c +++ b/mesa 3D driver/src/gallium/drivers/i915/i915_state_derived.c @@ -28,6 +28,7 @@ #include "draw/draw_context.h" #include "draw/draw_vertex.h" #include "pipe/p_shader_tokens.h" +#include "util/log.h" #include "util/u_memory.h" #include "i915_context.h" #include "i915_debug.h" @@ -35,17 +36,6 @@ #include "i915_reg.h" #include "i915_state.h" -static uint32_t -find_mapping(const struct i915_fragment_shader *fs, int unit) -{ - int i; - for (i = 0; i < I915_TEX_UNITS; i++) { - if 
(fs->generic_mapping[i] == unit) - return i; - } - debug_printf("Mapping not found\n"); - return 0; -} /*********************************************************************** * Determine the hardware vertex layout. @@ -56,11 +46,10 @@ calculate_vertex_layout(struct i915_context *i915) { const struct i915_fragment_shader *fs = i915->fs; struct vertex_info vinfo; - bool texCoords[I915_TEX_UNITS], colors[2], fog, needW, face; + bool colors[2], fog, needW, face; uint32_t i; int src; - memset(texCoords, 0, sizeof(texCoords)); colors[0] = colors[1] = fog = needW = face = false; memset(&vinfo, 0, sizeof(vinfo)); @@ -69,27 +58,22 @@ calculate_vertex_layout(struct i915_context *i915) */ for (i = 0; i < fs->info.num_inputs; i++) { switch (fs->info.input_semantic_name[i]) { - case TGSI_SEMANTIC_POSITION: { - uint32_t unit = I915_SEMANTIC_POS; - texCoords[find_mapping(fs, unit)] = true; - } break; + case TGSI_SEMANTIC_POSITION: + case TGSI_SEMANTIC_PCOORD: + case TGSI_SEMANTIC_FACE: + /* Handled as texcoord inputs below */ + break; case TGSI_SEMANTIC_COLOR: assert(fs->info.input_semantic_index[i] < 2); colors[fs->info.input_semantic_index[i]] = true; break; - case TGSI_SEMANTIC_GENERIC: { - /* texcoords/varyings/other generic */ - uint32_t unit = fs->info.input_semantic_index[i]; - - texCoords[find_mapping(fs, unit)] = true; + case TGSI_SEMANTIC_TEXCOORD: + case TGSI_SEMANTIC_GENERIC: needW = true; - } break; + break; case TGSI_SEMANTIC_FOG: fog = true; break; - case TGSI_SEMANTIC_FACE: - face = true; - break; default: debug_printf("Unknown input type %d\n", fs->info.input_semantic_name[i]); @@ -142,36 +126,27 @@ calculate_vertex_layout(struct i915_context *i915) /* texcoords/varyings */ for (i = 0; i < I915_TEX_UNITS; i++) { uint32_t hwtc; - if (texCoords[i]) { - hwtc = TEXCOORDFMT_4D; - if (fs->generic_mapping[i] == I915_SEMANTIC_POS) { - src = - draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0); + if (fs->texcoords[i].semantic != -1) { + src = draw_find_shader_output(i915->draw, fs->texcoords[i].semantic, + fs->texcoords[i].index); + if (fs->texcoords[i].semantic == TGSI_SEMANTIC_FACE) { + /* XXX Because of limitations in the draw module, currently src will + * be 0 for SEMANTIC_FACE, so this aliases to POS. We need to fix in + * the draw module by adding an extra shader output. + */ + mesa_loge("Front/back face is broken\n"); + draw_emit_vertex_attr(&vinfo, EMIT_1F, src); + hwtc = TEXCOORDFMT_1D; } else { - src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, - fs->generic_mapping[i]); + hwtc = TEXCOORDFMT_4D; + draw_emit_vertex_attr(&vinfo, EMIT_4F, src); } - draw_emit_vertex_attr(&vinfo, EMIT_4F, src); } else { hwtc = TEXCOORDFMT_NOT_PRESENT; } vinfo.hwfmt[1] |= hwtc << (i * 4); } - /* front/back face */ - if (face) { - uint32_t slot = find_mapping(fs, I915_SEMANTIC_FACE); - debug_printf("Front/back face is broken\n"); - /* XXX Because of limitations in the draw module, currently src will be 0 - * for SEMANTIC_FACE, so this aliases to POS. We need to fix in the draw - * module by adding an extra shader output. 
- */ - src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FACE, 0); - draw_emit_vertex_attr(&vinfo, EMIT_1F, src); - vinfo.hwfmt[1] &= ~(TEXCOORDFMT_NOT_PRESENT << (slot * 4)); - vinfo.hwfmt[1] |= TEXCOORDFMT_1D << (slot * 4); - } - draw_compute_vertex_size(&vinfo); if (memcmp(&i915->current.vertex_info, &vinfo, sizeof(vinfo))) { diff --git a/mesa 3D driver/src/gallium/drivers/i915/i915_surface.c b/mesa 3D driver/src/gallium/drivers/i915/i915_surface.c index 79e2d657e6..38059b9749 100644 --- a/mesa 3D driver/src/gallium/drivers/i915/i915_surface.c +++ b/mesa 3D driver/src/gallium/drivers/i915/i915_surface.c @@ -118,7 +118,7 @@ i915_surface_copy_render(struct pipe_context *pipe, struct pipe_resource *dst, util_blitter_blit_generic(i915->blitter, dst_view, &dstbox, src_view, src_box, src_width0, src_height0, PIPE_MASK_RGBAZS, - PIPE_TEX_FILTER_NEAREST, NULL, false); + PIPE_TEX_FILTER_NEAREST, NULL, false, false); return; fallback: diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly-fails.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly-fails.txt index 4acd392174..b586aa5e9c 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly-fails.txt @@ -13,6 +13,12 @@ dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail # https://gitlab.freedesktop.org/mesa/mesa/-/issues/4167 dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x10,Fail @@ -31,8 +37,6 @@ dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x8,Fail KHR-GL46.get_texture_sub_image.errors_test,Fail KHR-GL46.get_texture_sub_image.functional_test,Fail -KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail -KHR-GL46.shader_image_load_store.basic-api-bind,Fail KHR-GL46.transform_feedback.capture_vertex_interleaved_test,Fail KHR-GL46.transform_feedback.capture_vertex_separate_test,Fail KHR-GL46.transform_feedback.discard_vertex_test,Fail diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly.toml b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly.toml new file mode 100644 index 0000000000..28688cfa32 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-amly.toml @@ -0,0 +1,96 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +version_check = "GL ES 3.2.*git" +renderer_check = "AML" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-master.txt"] +skips = 
["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# Note that KHR-GL3* test sets include all tests from the previous +# version, so we only need to run one test list (unlike dEQP-GLES, +# where the test sets are separate). +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles2-khr-master.txt", + "/deqp/mustpass/gles3-khr-master.txt", + "/deqp/mustpass/gles31-khr-master.txt", + "/deqp/mustpass/gl46-master.txt", +] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# 565-nozs +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +# multisample +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl-fails.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl-fails.txt index 7b4687eb3e..a2ae27e264 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl-fails.txt @@ -13,11 +13,15 @@ dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail 
+565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail KHR-GL46.get_texture_sub_image.errors_test,Fail KHR-GL46.get_texture_sub_image.functional_test,Fail -KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail -KHR-GL46.shader_image_load_store.basic-api-bind,Fail KHR-GL46.transform_feedback.capture_vertex_interleaved_test,Fail KHR-GL46.transform_feedback.capture_vertex_separate_test,Fail KHR-GL46.transform_feedback.discard_vertex_test,Fail diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl.toml b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl.toml new file mode 100644 index 0000000000..d945d8f59f --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-apl.toml @@ -0,0 +1,98 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +version_check = "GL ES 3.2.*git" +renderer_check = "APL" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 3 + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 3 + +# Note that KHR-GL3* test sets include all tests from the previous +# version, so we only need to run one test list (unlike dEQP-GLES, +# where the test sets are separate). 
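+#
+# As a concrete illustration of the note above (hypothetical list names, not
+# files this config actually uses): if the desktop-GL lists were separate the
+# way the dEQP-GLES ones are, each GL version would need its own block, e.g.
+#
+#   [[deqp]]
+#   deqp = "/deqp/external/openglcts/modules/glcts"
+#   caselists = ["/deqp/mustpass/gl30-master.txt"]
+#
+#   [[deqp]]
+#   deqp = "/deqp/external/openglcts/modules/glcts"
+#   caselists = ["/deqp/mustpass/gl33-master.txt"]
+#
+# Since KHR-GL46 subsumes the older lists, the single gl46-master.txt entry
+# below covers desktop GL.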
+[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles2-khr-master.txt", + "/deqp/mustpass/gles3-khr-master.txt", + "/deqp/mustpass/gles31-khr-master.txt", + "/deqp/mustpass/gl46-master.txt", +] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# 565-nozs +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +# multisample +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml-fails.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml-fails.txt new file mode 100644 index 0000000000..b586aa5e9c --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml-fails.txt @@ -0,0 +1,46 @@ +# These line rasterization failures are waived in the CTS +dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail +dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide,Fail +dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail 
+565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail + +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4167 +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x10,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x8,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.12x10,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.12x12,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.4x4,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.5x4,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.6x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.6x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x8,Fail + +KHR-GL46.get_texture_sub_image.errors_test,Fail +KHR-GL46.get_texture_sub_image.functional_test,Fail +KHR-GL46.transform_feedback.capture_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.capture_vertex_separate_test,Fail +KHR-GL46.transform_feedback.discard_vertex_test,Fail +KHR-GL46.transform_feedback.draw_xfb_instanced_test,Crash +KHR-GL46.transform_feedback.draw_xfb_stream_instanced_test,Crash +KHR-GL46.transform_feedback.query_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.query_vertex_separate_test,Fail diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml.toml b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml.toml new file mode 100644 index 0000000000..313fb6fb57 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-cml.toml @@ -0,0 +1,99 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 2 +version_check = "GL ES 3.2.*git" +renderer_check = "CML" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 2 + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 2 + +# Note that KHR-GL3* test sets include all tests from the previous +# version, so we only need to run one test list (unlike dEQP-GLES, +# where the test sets are separate). 
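+#
+# A note on two runner options used in this file (our reading of how
+# deqp-runner consumes them, so treat it as informal rather than normative):
+# "fraction = N" runs roughly 1/N of the tests in the caselist to bound job
+# runtime, and "prefix" is prepended to result names, which is why the
+# expectation file deqp-iris-cml-fails.txt can list
+# "565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail"
+# separately from the same test in the default-config run.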
+[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles2-khr-master.txt", + "/deqp/mustpass/gles3-khr-master.txt", + "/deqp/mustpass/gles31-khr-master.txt", + "/deqp/mustpass/gl46-master.txt", +] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# 565-nozs +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +# multisample +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-fails.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-fails.txt index b0cfc82d2e..26aaa12388 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-fails.txt @@ -13,3 +13,24 @@ dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail + +KHR-GL46.get_texture_sub_image.errors_test,Fail +KHR-GL46.get_texture_sub_image.functional_test,Fail + +# "Result is different than expected at index: 33 Expected value: 664128 Result value: 658538 " +KHR-GL46.texture_buffer.texture_buffer_atomic_functions,Fail + +KHR-GL46.transform_feedback.capture_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.capture_vertex_separate_test,Fail 
+KHR-GL46.transform_feedback.discard_vertex_test,Fail +KHR-GL46.transform_feedback.draw_xfb_instanced_test,Crash +KHR-GL46.transform_feedback.draw_xfb_stream_instanced_test,Crash +KHR-GL46.transform_feedback.query_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.query_vertex_separate_test,Fail +KHR-GLES31.core.texture_buffer.texture_buffer_atomic_functions,Fail diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-flakes.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-flakes.txt new file mode 100644 index 0000000000..05dc1fd4fb --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk-flakes.txt @@ -0,0 +1,2 @@ +KHR-GL46.texture_buffer.texture_buffer_atomic_functions +KHR-GLES31.core.texture_buffer.texture_buffer_atomic_functions diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk.toml b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk.toml new file mode 100644 index 0000000000..34f02b4ac5 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-glk.toml @@ -0,0 +1,99 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 3 +version_check = "GL ES 3.2.*git" +renderer_check = "GLK" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 4 + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 4 + +# Note that KHR-GL3* test sets include all tests from the previous +# version, so we only need to run one test list (unlike dEQP-GLES, +# where the test sets are separate). 
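+#
+# "version_check" and "renderer_check" in the first block above are, as far
+# as we can tell, patterns matched against the GL version and renderer
+# strings the driver reports, so a job fails fast if it lands on a machine
+# that does not identify as GLK rather than silently producing results for
+# the wrong hardware.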
+[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles2-khr-master.txt", + "/deqp/mustpass/gles3-khr-master.txt", + "/deqp/mustpass/gles31-khr-master.txt", + "/deqp/mustpass/gl46-master.txt", +] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# 565-nozs +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +# multisample +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl-fails.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl-fails.txt new file mode 100644 index 0000000000..b586aa5e9c --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl-fails.txt @@ -0,0 +1,46 @@ +# These line rasterization failures are waived in the CTS +dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail +dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide,Fail +dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail 
+565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail + +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4167 +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x10,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x8,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.12x10,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.12x12,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.4x4,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.5x4,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.6x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.6x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x8,Fail + +KHR-GL46.get_texture_sub_image.errors_test,Fail +KHR-GL46.get_texture_sub_image.functional_test,Fail +KHR-GL46.transform_feedback.capture_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.capture_vertex_separate_test,Fail +KHR-GL46.transform_feedback.discard_vertex_test,Fail +KHR-GL46.transform_feedback.draw_xfb_instanced_test,Crash +KHR-GL46.transform_feedback.draw_xfb_stream_instanced_test,Crash +KHR-GL46.transform_feedback.query_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.query_vertex_separate_test,Fail diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl.toml b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl.toml new file mode 100644 index 0000000000..e78ddf4374 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-kbl.toml @@ -0,0 +1,96 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +version_check = "GL ES 3.2.*git" +renderer_check = "KBL" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# Note that KHR-GL3* test sets include all tests from the previous +# version, so we only need to run one test list (unlike dEQP-GLES, +# where the test sets are separate). 
+[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles2-khr-master.txt", + "/deqp/mustpass/gles3-khr-master.txt", + "/deqp/mustpass/gles31-khr-master.txt", + "/deqp/mustpass/gl46-master.txt", +] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# 565-nozs +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +# multisample +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-skips.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-skips.txt new file mode 100644 index 0000000000..4c45b67696 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-skips.txt @@ -0,0 +1,14 @@ +# 60s timeout in CI +dEQP-GLES31.functional.ubo.random.all_per_block_buffers.20 +KHR-GL46.arrays_of_arrays_gl.ConstructorsAndUnsizedDeclConstructorSizing1 +KHR-GL46.arrays_of_arrays_gl.ConstructorsAndUnsizedDeclConstructors1 +KHR-GL46.arrays_of_arrays_gl.SizedDeclarationsPrimitive +KHR-GL46.arrays_of_arrays_gl.SubroutineFunctionCalls2 +KHR-GL46.copy_image.functional +KHR-GL46.enhanced_layouts.uniform_block_member_align_non_power_of_2 +KHR-GL46.enhanced_layouts.uniform_block_member_invalid_offset_alignment +KHR-GL46.enhanced_layouts.ssb_member_align_non_power_of_2 +KHR-GL46.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_MaxPatchVertices_Position_PointSize +KHR-GL46.texture_swizzle.functional +KHR-GL46.texture_swizzle.smoke +KHR-GLES31.core.tessellation_shader.tessellation_control_to_tessellation_evaluation.gl_MaxPatchVertices_Position_PointSize diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl-fails.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl-fails.txt new file mode 100644 index 0000000000..c38856e4fb --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl-fails.txt @@ -0,0 +1,57 @@ +# These line rasterization failures are waived in the CTS +dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail 
+dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail +dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail +dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide,Fail +dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail +565-nozs-dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail + +# https://gitlab.freedesktop.org/mesa/mesa/-/issues/4167 +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x10,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.10x8,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.12x10,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.12x12,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.4x4,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.5x4,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.6x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.6x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x5,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x6,Fail +dEQP-GLES3.functional.texture.compressed.astc.void_extent_ldr.8x8,Fail + +KHR-GL46.direct_state_access.textures_storage_1d_rg8ui,Fail + +KHR-GL46.get_texture_sub_image.errors_test,Fail +KHR-GL46.get_texture_sub_image.functional_test,Fail + +# "Result is different than expected at index: 0 Expected value: 1606528 Result value: 0" +KHR-GL46.texture_buffer.texture_buffer_atomic_functions,Fail + +KHR-GL46.transform_feedback.capture_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.capture_vertex_separate_test,Fail +KHR-GL46.transform_feedback.discard_vertex_test,Fail +KHR-GL46.transform_feedback.draw_xfb_instanced_test,Crash +KHR-GL46.transform_feedback.draw_xfb_stream_instanced_test,Crash +KHR-GL46.transform_feedback.query_vertex_interleaved_test,Fail +KHR-GL46.transform_feedback.query_vertex_separate_test,Fail + +KHR-GLES31.core.pixelstoragemodes.teximage2d.r8snorm.16_0,Fail + +# "Result is different than expected at index: 0 Expected value: 1606528 Result value: 0" +KHR-GLES31.core.texture_buffer.texture_buffer_atomic_functions,Fail diff --git 
a/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl.toml b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl.toml new file mode 100644 index 0000000000..4304b0094c --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/deqp-iris-whl.toml @@ -0,0 +1,99 @@ +# Basic test set +[[deqp]] +deqp = "/deqp/modules/gles2/deqp-gles2" +caselists = ["/deqp/mustpass/gles2-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 5 +version_check = "GL ES 3.2.*git" +renderer_check = "WHL" + +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 8 + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-master.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] +fraction = 8 + +# Note that KHR-GL3* test sets include all tests from the previous +# version, so we only need to run one test list (unlike dEQP-GLES, +# where the test sets are separate). +[[deqp]] +deqp = "/deqp/external/openglcts/modules/glcts" +caselists = [ + "/deqp/mustpass/gles2-khr-master.txt", + "/deqp/mustpass/gles3-khr-master.txt", + "/deqp/mustpass/gles31-khr-master.txt", + "/deqp/mustpass/gl46-master.txt", +] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms0", +] + +# 565-nozs +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-565-no-depth-no-stencil.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgb565d0s0ms0", +] +prefix = "565-nozs-" + +# multisample +[[deqp]] +deqp = "/deqp/modules/gles3/deqp-gles3" +caselists = ["/deqp/mustpass/gles3-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-multisample.txt"] +skips = ["/install/deqp-iris-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-visibility=hidden", + "--deqp-gl-config-name=rgba8888d24s8ms4", +] +prefix = "multisample-" diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/gitlab-ci.yml 
b/mesa 3D driver/src/gallium/drivers/iris/ci/gitlab-ci.yml index 069a4515ff..ddcba20989 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/gitlab-ci.yml @@ -10,76 +10,77 @@ FLAKES_CHANNEL: "#intel-ci" HWCI_FREQ_MAX: "true" +# 5 boards .iris-apl-test: extends: - .iris-test variables: DEVICE_TYPE: asus-C523NA-A20057-coral GPU_VERSION: iris-apl - DEQP_EXPECTED_RENDERER: APL tags: - mesa-ci-x86-64-lava-asus-C523NA-A20057-coral +# 4 boards .iris-glk-test: extends: - .iris-test - .test-manual-mr variables: - DEVICE_TYPE: hp-x360-12b-n4000-octopus + DEVICE_TYPE: hp-x360-12b-ca0010nr-n4020-octopus GPU_VERSION: iris-glk - DEQP_EXPECTED_RENDERER: GLK tags: - - mesa-ci-x86-64-lava-hp-x360-12b-n4000-octopus + - mesa-ci-x86-64-lava-hp-x360-12b-ca0010nr-n4020-octopus +# 5 boards .iris-amly-test: extends: - .iris-test + - .test-manual-mr variables: DEVICE_TYPE: asus-C433TA-AJ0005-rammus GPU_VERSION: iris-amly - DEQP_EXPECTED_RENDERER: AML tags: - mesa-ci-x86-64-lava-asus-C433TA-AJ0005-rammus +# 6 boards .iris-kbl-test: extends: - .iris-test + - .test-manual-mr variables: DEVICE_TYPE: hp-x360-14-G1-sona GPU_VERSION: iris-kbl - DEQP_EXPECTED_RENDERER: KBL tags: - mesa-ci-x86-64-lava-hp-x360-14-G1-sona -iris-apl-gles2: +# 1 board +.iris-whl-test: extends: - - .iris-apl-test - variables: - DEQP_VER: gles2 - -iris-apl-gles3: - extends: - - .iris-apl-test - variables: - DEQP_VER: gles3 - DEQP_EXPECTED_RENDERER: APL - DEQP_FRACTION: 4 - -iris-apl-gles31: - extends: - - .iris-apl-test - variables: - DEQP_VER: gles31 - DEQP_EXPECTED_RENDERER: APL - DEQP_FRACTION: 6 - -# this appears to silently kill the board sometimes. Leave it as a manual job for debugging. -iris-apl-gl: - extends: - - .iris-apl-test + - .iris-test - .test-manual-mr variables: - DEQP_VER: gl46 + DEVICE_TYPE: dell-latitude-5400-4305U-sarien + GPU_VERSION: iris-whl + tags: + - mesa-ci-x86-64-lava-dell-latitude-5400-4305U-sarien + +# 4 boards +.iris-cml-test: + extends: + - .iris-test + - .test-manual-mr + variables: + DEVICE_TYPE: asus-C436FA-Flip-hatch + GPU_VERSION: iris-cml + tags: + - mesa-ci-x86-64-lava-asus-C436FA-flip-hatch + +iris-apl-deqp: + extends: + - .iris-apl-test + variables: + DEQP_SUITE: iris-apl + parallel: 3 iris-apl-egl: extends: @@ -89,36 +90,12 @@ iris-apl-egl: HWCI_START_XORG: 1 DEQP_VER: egl -iris-glk-gles2: +iris-glk-deqp: extends: - .iris-glk-test variables: - DEQP_VER: gles2 - -iris-glk-gles3: - extends: - - .iris-glk-test - variables: - DEQP_VER: gles3 - DEQP_EXPECTED_RENDERER: GLK - DEQP_FRACTION: 8 - -iris-glk-gles31: - extends: - - .iris-glk-test - variables: - DEQP_VER: gles31 - DEQP_EXPECTED_RENDERER: GLK - DEQP_FRACTION: 8 - -# this appears to be just unusably slow, timed out at an hour. Needs debugging, -# so leave it as a manual job. 
-iris-glk-gl: - extends: - - .iris-glk-test - - .test-manual-mr - variables: - DEQP_VER: gl46 + DEQP_SUITE: iris-glk + parallel: 2 iris-glk-egl: extends: @@ -128,37 +105,34 @@ iris-glk-egl: HWCI_START_XORG: 1 DEQP_VER: egl -.iris-amly-gles2: +iris-amly-deqp: extends: - .iris-amly-test variables: - DEQP_VER: gles2 + DEQP_SUITE: iris-amly + parallel: 3 -.iris-amly-gles3: +iris-kbl-deqp: extends: - - .iris-amly-test - - .test-manual-mr + - .iris-kbl-test variables: - DEQP_VER: gles3 - DEQP_EXPECTED_RENDERER: AML - DEQP_FRACTION: 2 + DEQP_SUITE: iris-kbl + parallel: 3 -.iris-amly-gles31: +iris-whl-deqp: extends: - - .iris-amly-test - - .test-manual-mr + - .iris-whl-test variables: - DEQP_VER: gles31 - DEQP_EXPECTED_RENDERER: AML - DEQP_FRACTION: 2 + DEQP_SUITE: iris-whl -.iris-amly-gl: +iris-cml-deqp: extends: - - .iris-amly-test + - .iris-cml-test variables: - DEQP_VER: gl46 + DEQP_SUITE: iris-cml + parallel: 2 -.iris-amly-egl: +iris-amly-egl: extends: - .iris-amly-test variables: @@ -166,7 +140,7 @@ iris-glk-egl: HWCI_START_XORG: 1 DEQP_VER: egl -.iris-amly-piglit: +iris-amly-piglit: extends: - .iris-amly-test - .lava-piglit @@ -204,7 +178,7 @@ iris-glk-traces: - .iris-glk-test - .iris-traces -.iris-amly-traces: +iris-amly-traces: variables: GPU_VERSION: intel-amly extends: @@ -217,4 +191,77 @@ iris-kbl-traces: extends: - .iris-kbl-test - .iris-traces - - .test-manual-mr + +iris-whl-traces: + variables: + GPU_VERSION: intel-whl + extends: + - .iris-whl-test + - .iris-traces + +iris-cml-traces: + variables: + GPU_VERSION: intel-cml + extends: + - .iris-cml-test + - .iris-traces + +.profile-traces: + extends: + - .iris-traces + - .iris-rules-performance + variables: + PIGLIT_REPLAY_SUBCOMMAND: "profile" + PIGLIT_REPLAY_EXTRA_ARGS: "--db-path ${CI_PROJECT_DIR}/replayer-db/" + # More than this can hit OOM due to BOs leaked during the replay of the last frame + PIGLIT_REPLAY_LOOP_TIMES: 150 + # We don't want more than one workload to be submitted to the GPU at a time + FDO_CI_CONCURRENT: 1 + # So we aren't capped by VSync from the X server + EGL_PLATFORM: surfaceless + GIT_STRATEGY: none + HWCI_FREQ_MAX: "true" + LAVA_TAGS: "cbg-0" + allow_failure: true + +iris-apl-traces-performance: + extends: + - .iris-apl-test + - .profile-traces + variables: + GPU_VERSION: intel-apl + +iris-glk-traces-performance: + extends: + - .iris-glk-test + - .profile-traces + variables: + GPU_VERSION: intel-glk + +iris-amly-traces-performance: + extends: + - .iris-amly-test + - .profile-traces + variables: + GPU_VERSION: intel-amly + +iris-kbl-traces-performance: + extends: + - .iris-kbl-test + - .profile-traces + variables: + GPU_VERSION: intel-kbl + +iris-whl-traces-performance: + extends: + - .iris-whl-test + - .profile-traces + variables: + GPU_VERSION: intel-whl + +iris-cml-traces-performance: + extends: + - .iris-cml-test + - .profile-traces + variables: + GPU_VERSION: intel-cml diff --git a/mesa 3D driver/src/gallium/drivers/iris/ci/piglit-iris-amly-fails.txt b/mesa 3D driver/src/gallium/drivers/iris/ci/piglit-iris-amly-fails.txt index 73cd4a4f96..8f1ff05953 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/ci/piglit-iris-amly-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/piglit-iris-amly-fails.txt @@ -21,6 +21,9 @@ glx@glx-swap-pixmap-bad,Fail glx@glx-visuals-depth -pixmap,Crash glx@glx-visuals-stencil -pixmap,Crash +# X error 167 (GLXBadFBConfig (9)) was generated, but X error 8 was expected. 
+glx@glx_arb_create_context_no_error@no error,Fail + # "X error 2 (non-GLX error (-1)) was generated, but X error 8 was expected." glx@glx_arb_create_context_robustness@invalid reset notification strategy,Fail @@ -43,7 +46,10 @@ glx@glx_ext_import_context@make current- multi process,Fail glx@glx_ext_import_context@make current- single process,Fail glx@glx_ext_import_context@query context info,Fail -spec@!opengl 1.0@rasterpos,Fail +# "Unsupported intrinsic: vec4 32 ssa_56 = intrinsic image_load (ssa_44, ssa_55, ssa_40, ssa_44) (5, 0, 12, 80, 160) /* image_dim=Buf */ /* image_array=false */ /* format=r32g32b32a32_float */ /* access=80 */ /* dest_type=float32 */ +# rasterpos: ../src/gallium/auxiliary/gallivm/lp_bld_nir.c:1930: visit_intrinsic: Assertion `0' failed." +spec@!opengl 1.0@rasterpos,Crash + spec@!opengl 1.0@rasterpos@glsl_vs_gs_linked,Fail spec@!opengl 1.0@rasterpos@glsl_vs_tes_linked,Fail @@ -197,38 +203,6 @@ spec@egl_khr_gl_image@egl_khr_gl_renderbuffer_image-clear-shared-image gl_depth_ spec@egl_khr_surfaceless_context@viewport,Fail spec@egl_mesa_configless_context@basic,Fail -# "Failed to spawn ext_image_dma_buf_import-intel_external_sampler_only" -# Are we not building these on x86 for some reason? -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export-tex,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-intel_external_sampler_only,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-invalid_attributes,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-ownership_transfer,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_argb8888,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_nv12,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p012,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_xyuv,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y210,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y216,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y412,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuyv,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yvu420,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-transcode-nv12-as-r8-gr88,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-invalid_hints,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-missing_attributes,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-reimport-bug,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_ayuv,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p010,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_p016,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_uyvy,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_xrgb8888,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y212,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y410,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_y416,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuv420,Fail -spec@ext_image_dma_buf_import@ext_image_dma_buf_import-unsupported_format,Fail - # texture-integer: error 0x502 at tests/fbo/fbo-integer.c:300 spec@ext_texture_integer@fbo-integer,Fail diff --git a/mesa 3D 
driver/src/gallium/drivers/iris/ci/traces-iris.yml b/mesa 3D driver/src/gallium/drivers/iris/ci/traces-iris.yml index cb6f47086c..60cf8a7b53 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/ci/traces-iris.yml +++ b/mesa 3D driver/src/gallium/drivers/iris/ci/traces-iris.yml @@ -13,17 +13,25 @@ traces: # - device: gl-intel-amly # checksum: becaa29fae8a988b8b5f4694f59383db # - device: gl-intel-kbl +# checksum: becaa29fae8a988b8b5f4694f59383db +# - device: gl-intel-whl +# checksum: becaa29fae8a988b8b5f4694f59383db +# - device: gl-intel-cml # checksum: becaa29fae8a988b8b5f4694f59383db - path: behdad-glyphy/glyphy.trace expectations: - device: gl-intel-apl checksum: 5c7e2fc5961b43a18ea65523fe025b96 - device: gl-intel-glk - checksum: e749faf799fe11f95208a01ca6e32fef + checksum: 5c7e2fc5961b43a18ea65523fe025b96 - device: gl-intel-amly checksum: e749faf799fe11f95208a01ca6e32fef - device: gl-intel-kbl - checksum: e749faf799fe11f95208a01ca6e32fef + checksum: 5c7e2fc5961b43a18ea65523fe025b96 + - device: gl-intel-whl + checksum: 5c7e2fc5961b43a18ea65523fe025b96 + - device: gl-intel-cml + checksum: 5c7e2fc5961b43a18ea65523fe025b96 - path: glmark2/desktop:windows=4:effect=blur:blur-radius=5:passes=1:separable=true.trace expectations: - device: gl-intel-apl @@ -34,16 +42,24 @@ traces: checksum: c8c96a8cc167e0d07f889d7310316922 - device: gl-intel-kbl checksum: c8c96a8cc167e0d07f889d7310316922 + - device: gl-intel-whl + checksum: c8c96a8cc167e0d07f889d7310316922 + - device: gl-intel-cml + checksum: c8c96a8cc167e0d07f889d7310316922 - path: glmark2/jellyfish.trace expectations: - device: gl-intel-apl checksum: c31236c496768159903aa48f5a47bcdc - device: gl-intel-glk - checksum: 3b7d64de0a60d527f63695518897f367 + checksum: c31236c496768159903aa48f5a47bcdc - device: gl-intel-amly checksum: 3b7d64de0a60d527f63695518897f367 - device: gl-intel-kbl - checksum: 3b7d64de0a60d527f63695518897f367 + checksum: c31236c496768159903aa48f5a47bcdc + - device: gl-intel-whl + checksum: c31236c496768159903aa48f5a47bcdc + - device: gl-intel-cml + checksum: c31236c496768159903aa48f5a47bcdc - path: glxgears/glxgears-2.trace expectations: - device: gl-intel-apl @@ -54,16 +70,24 @@ traces: checksum: f53ac20e17da91c0359c31f2fa3f401e - device: gl-intel-kbl checksum: f53ac20e17da91c0359c31f2fa3f401e + - device: gl-intel-whl + checksum: f53ac20e17da91c0359c31f2fa3f401e + - device: gl-intel-cml + checksum: f53ac20e17da91c0359c31f2fa3f401e - path: 0ad/0ad.trace expectations: - device: gl-intel-apl checksum: 45739401f068971e6e1052f10afe9f99 - device: gl-intel-glk - checksum: 60d295cddd4679982993ffe60b4f5f12 + checksum: 45739401f068971e6e1052f10afe9f99 - device: gl-intel-amly checksum: 60d295cddd4679982993ffe60b4f5f12 - device: gl-intel-kbl - checksum: 60d295cddd4679982993ffe60b4f5f12 + checksum: 45739401f068971e6e1052f10afe9f99 + - device: gl-intel-whl + checksum: 45739401f068971e6e1052f10afe9f99 + - device: gl-intel-cml + checksum: 45739401f068971e6e1052f10afe9f99 - path: pathfinder/demo.trace expectations: - device: gl-intel-apl @@ -74,6 +98,10 @@ traces: checksum: d9b33f0a2efe17c21b7933242afd9ec7 - device: gl-intel-kbl checksum: d9b33f0a2efe17c21b7933242afd9ec7 + - device: gl-intel-whl + checksum: d9b33f0a2efe17c21b7933242afd9ec7 + - device: gl-intel-cml + checksum: d9b33f0a2efe17c21b7933242afd9ec7 - path: pathfinder/canvas_moire.trace expectations: - device: gl-intel-apl @@ -84,6 +112,10 @@ traces: checksum: 21bccd42f2deb6416cf6591cd6a99258 - device: gl-intel-kbl checksum: 21bccd42f2deb6416cf6591cd6a99258 + - device: gl-intel-whl + 
checksum: 21bccd42f2deb6416cf6591cd6a99258 + - device: gl-intel-cml + checksum: 21bccd42f2deb6416cf6591cd6a99258 - path: pathfinder/canvas_text_v2.trace expectations: - device: gl-intel-apl @@ -94,26 +126,38 @@ traces: checksum: 448886e3b24f6408e013ea13f7c96d28 - device: gl-intel-kbl checksum: 448886e3b24f6408e013ea13f7c96d28 + - device: gl-intel-whl + checksum: 448886e3b24f6408e013ea13f7c96d28 + - device: gl-intel-cml + checksum: 448886e3b24f6408e013ea13f7c96d28 - path: gputest/furmark.trace expectations: - device: gl-intel-apl checksum: 06d587a2b934295da6ad874b750b9c9d - device: gl-intel-glk - checksum: 34466e5a6103be730f18eea2d4d357ee + checksum: 06d587a2b934295da6ad874b750b9c9d - device: gl-intel-amly checksum: 34466e5a6103be730f18eea2d4d357ee - device: gl-intel-kbl - checksum: 34466e5a6103be730f18eea2d4d357ee + checksum: 06d587a2b934295da6ad874b750b9c9d + - device: gl-intel-whl + checksum: 06d587a2b934295da6ad874b750b9c9d + - device: gl-intel-cml + checksum: 06d587a2b934295da6ad874b750b9c9d - path: gputest/pixmark-piano.trace expectations: - device: gl-intel-apl checksum: 3899d6cd893b7c3ef6baa637cbd79690 - device: gl-intel-glk - checksum: 9534418a99a80b31251266a987fb7c07 + checksum: 3899d6cd893b7c3ef6baa637cbd79690 - device: gl-intel-amly checksum: 9534418a99a80b31251266a987fb7c07 - device: gl-intel-kbl - checksum: 9534418a99a80b31251266a987fb7c07 + checksum: 3899d6cd893b7c3ef6baa637cbd79690 + - device: gl-intel-whl + checksum: 3899d6cd893b7c3ef6baa637cbd79690 + - device: gl-intel-cml + checksum: 3899d6cd893b7c3ef6baa637cbd79690 - path: gputest/triangle.trace expectations: - device: gl-intel-apl @@ -124,46 +168,66 @@ traces: checksum: 0a1524303e0772f6d869e4875fe1f401 - device: gl-intel-kbl checksum: 0a1524303e0772f6d869e4875fe1f401 + - device: gl-intel-whl + checksum: 0a1524303e0772f6d869e4875fe1f401 + - device: gl-intel-cml + checksum: 0a1524303e0772f6d869e4875fe1f401 - path: glmark2/buffer:update-fraction=0.5:update-dispersion=0.9:columns=200:update-method=map:interleave=false.trace expectations: - device: gl-intel-apl checksum: 46052998784defb089ce20fc8f39c6e9 - device: gl-intel-glk - checksum: 7cd8249faa260ddb384b6707701370b1 + checksum: 46052998784defb089ce20fc8f39c6e9 - device: gl-intel-amly checksum: 7cd8249faa260ddb384b6707701370b1 - device: gl-intel-kbl - checksum: 7cd8249faa260ddb384b6707701370b1 + checksum: 46052998784defb089ce20fc8f39c6e9 + - device: gl-intel-whl + checksum: 46052998784defb089ce20fc8f39c6e9 + - device: gl-intel-cml + checksum: 46052998784defb089ce20fc8f39c6e9 - path: glmark2/buffer:update-fraction=0.5:update-dispersion=0.9:columns=200:update-method=map:interleave=true.trace expectations: - device: gl-intel-apl checksum: 9ef1ec1d83a0b9448b2303b3c5f78447 - device: gl-intel-glk - checksum: f3ffb9c057daf37836fa1d9edded10c8 + checksum: 9ef1ec1d83a0b9448b2303b3c5f78447 - device: gl-intel-amly checksum: f3ffb9c057daf37836fa1d9edded10c8 - device: gl-intel-kbl - checksum: f3ffb9c057daf37836fa1d9edded10c8 + checksum: 9ef1ec1d83a0b9448b2303b3c5f78447 + - device: gl-intel-whl + checksum: 9ef1ec1d83a0b9448b2303b3c5f78447 + - device: gl-intel-cml + checksum: 9ef1ec1d83a0b9448b2303b3c5f78447 - path: glmark2/buffer:update-fraction=0.5:update-dispersion=0.9:columns=200:update-method=subdata:interleave=false.trace expectations: - device: gl-intel-apl checksum: 29a7734de59f3745158596942d0fb2fe - device: gl-intel-glk - checksum: 5c59cc9445dfbfcb41464fd469af9c2b + checksum: 29a7734de59f3745158596942d0fb2fe - device: gl-intel-amly checksum: 5c59cc9445dfbfcb41464fd469af9c2b - 
device: gl-intel-kbl - checksum: 5c59cc9445dfbfcb41464fd469af9c2b + checksum: 29a7734de59f3745158596942d0fb2fe + - device: gl-intel-whl + checksum: 29a7734de59f3745158596942d0fb2fe + - device: gl-intel-cml + checksum: 29a7734de59f3745158596942d0fb2fe - path: glmark2/bump:bump-render=height.trace expectations: - device: gl-intel-apl checksum: e299189c84a7005726554f7ca74611eb - device: gl-intel-glk - checksum: 5b119c2de40f8a46708470f567952852 + checksum: e299189c84a7005726554f7ca74611eb - device: gl-intel-amly checksum: 5b119c2de40f8a46708470f567952852 - device: gl-intel-kbl - checksum: 5b119c2de40f8a46708470f567952852 + checksum: e299189c84a7005726554f7ca74611eb + - device: gl-intel-whl + checksum: e299189c84a7005726554f7ca74611eb + - device: gl-intel-cml + checksum: e299189c84a7005726554f7ca74611eb - path: glmark2/bump:bump-render=high-poly.trace expectations: - device: gl-intel-apl @@ -174,6 +238,10 @@ traces: checksum: a6e89335a4443a80eaf7b1ae75575cac - device: gl-intel-kbl checksum: a6e89335a4443a80eaf7b1ae75575cac + - device: gl-intel-whl + checksum: a6e89335a4443a80eaf7b1ae75575cac + - device: gl-intel-cml + checksum: a6e89335a4443a80eaf7b1ae75575cac - path: glmark2/bump:bump-render=normals.trace expectations: - device: gl-intel-apl @@ -184,6 +252,10 @@ traces: checksum: c8726956c5a3879b575ea244c36e372b - device: gl-intel-kbl checksum: c8726956c5a3879b575ea244c36e372b + - device: gl-intel-whl + checksum: c8726956c5a3879b575ea244c36e372b + - device: gl-intel-cml + checksum: c8726956c5a3879b575ea244c36e372b - path: glmark2/conditionals:vertex-steps=0:fragment-steps=0.trace expectations: - device: gl-intel-apl @@ -194,6 +266,10 @@ traces: checksum: 70cd4a7a8ae25177bc4a2e3044c44c2d - device: gl-intel-kbl checksum: 70cd4a7a8ae25177bc4a2e3044c44c2d + - device: gl-intel-whl + checksum: 70cd4a7a8ae25177bc4a2e3044c44c2d + - device: gl-intel-cml + checksum: 70cd4a7a8ae25177bc4a2e3044c44c2d - path: glmark2/conditionals:vertex-steps=0:fragment-steps=5.trace expectations: - device: gl-intel-apl @@ -204,6 +280,10 @@ traces: checksum: ef659484c57854a3fedb54c4f5b3983e - device: gl-intel-kbl checksum: ef659484c57854a3fedb54c4f5b3983e + - device: gl-intel-whl + checksum: ef659484c57854a3fedb54c4f5b3983e + - device: gl-intel-cml + checksum: ef659484c57854a3fedb54c4f5b3983e - path: glmark2/conditionals:vertex-steps=5:fragment-steps=0.trace expectations: - device: gl-intel-apl @@ -214,6 +294,10 @@ traces: checksum: 0ee6864cc2c68cf767ffa773a1f81b6f - device: gl-intel-kbl checksum: 0ee6864cc2c68cf767ffa773a1f81b6f + - device: gl-intel-whl + checksum: 0ee6864cc2c68cf767ffa773a1f81b6f + - device: gl-intel-cml + checksum: 0ee6864cc2c68cf767ffa773a1f81b6f - path: glmark2/desktop:windows=4:effect=shadow.trace expectations: - device: gl-intel-apl @@ -224,6 +308,10 @@ traces: checksum: 9099e32184329cecdfc6388b2a18964a - device: gl-intel-kbl checksum: 9099e32184329cecdfc6388b2a18964a + - device: gl-intel-whl + checksum: 9099e32184329cecdfc6388b2a18964a + - device: gl-intel-cml + checksum: 9099e32184329cecdfc6388b2a18964a - path: glmark2/effect2d:kernel=0,1,0;1,-4,1;0,1,0;.trace expectations: - device: gl-intel-apl @@ -234,6 +322,10 @@ traces: checksum: e3677a85bc346a9bd52a6569d17bbe6e - device: gl-intel-kbl checksum: e3677a85bc346a9bd52a6569d17bbe6e + - device: gl-intel-whl + checksum: e3677a85bc346a9bd52a6569d17bbe6e + - device: gl-intel-cml + checksum: e3677a85bc346a9bd52a6569d17bbe6e - path: glmark2/effect2d:kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;.trace expectations: - device: gl-intel-apl @@ -244,6 +336,10 @@ 
traces: checksum: b80963dae6ecf40c83bfb16943ef1011 - device: gl-intel-kbl checksum: b80963dae6ecf40c83bfb16943ef1011 + - device: gl-intel-whl + checksum: b80963dae6ecf40c83bfb16943ef1011 + - device: gl-intel-cml + checksum: b80963dae6ecf40c83bfb16943ef1011 - path: glmark2/function:fragment-steps=5:fragment-complexity=low.trace expectations: - device: gl-intel-apl @@ -254,6 +350,10 @@ traces: checksum: 8fa0f14154ac1ec8162ea8a0d5f26092 - device: gl-intel-kbl checksum: 8fa0f14154ac1ec8162ea8a0d5f26092 + - device: gl-intel-whl + checksum: 8fa0f14154ac1ec8162ea8a0d5f26092 + - device: gl-intel-cml + checksum: 8fa0f14154ac1ec8162ea8a0d5f26092 - path: glmark2/function:fragment-steps=5:fragment-complexity=medium.trace expectations: - device: gl-intel-apl @@ -264,16 +364,24 @@ traces: checksum: 2ead7a061d05a6431c0efd076cb8731a - device: gl-intel-kbl checksum: 2ead7a061d05a6431c0efd076cb8731a + - device: gl-intel-whl + checksum: 2ead7a061d05a6431c0efd076cb8731a + - device: gl-intel-cml + checksum: 2ead7a061d05a6431c0efd076cb8731a - path: glmark2/build:use-vbo=false.trace expectations: - device: gl-intel-apl checksum: cd8899ad41a62df1425e9b9c9c8d9817 - device: gl-intel-glk - checksum: cf8ee908ffab16537acf899cd31698d3 + checksum: cd8899ad41a62df1425e9b9c9c8d9817 - device: gl-intel-amly checksum: cf8ee908ffab16537acf899cd31698d3 - device: gl-intel-kbl - checksum: cf8ee908ffab16537acf899cd31698d3 + checksum: cd8899ad41a62df1425e9b9c9c8d9817 + - device: gl-intel-whl + checksum: cd8899ad41a62df1425e9b9c9c8d9817 + - device: gl-intel-cml + checksum: cd8899ad41a62df1425e9b9c9c8d9817 - path: glmark2/build:use-vbo=true.trace expectations: - device: gl-intel-apl @@ -284,16 +392,24 @@ traces: checksum: ef075cad089512504539bdb4139190f5 - device: gl-intel-kbl checksum: ef075cad089512504539bdb4139190f5 + - device: gl-intel-whl + checksum: ef075cad089512504539bdb4139190f5 + - device: gl-intel-cml + checksum: ef075cad089512504539bdb4139190f5 - path: glmark2/ideas:speed=10000.trace expectations: - device: gl-intel-apl checksum: 6af136c5ad47c86dfb48eaefae99856c - device: gl-intel-glk - checksum: b6d2684de7801307d540c1dc8d1a7bc4 + checksum: 6af136c5ad47c86dfb48eaefae99856c - device: gl-intel-amly checksum: b6d2684de7801307d540c1dc8d1a7bc4 - device: gl-intel-kbl - checksum: b6d2684de7801307d540c1dc8d1a7bc4 + checksum: 6af136c5ad47c86dfb48eaefae99856c + - device: gl-intel-whl + checksum: 6af136c5ad47c86dfb48eaefae99856c + - device: gl-intel-cml + checksum: 6af136c5ad47c86dfb48eaefae99856c - path: glmark2/loop:vertex-steps=5:fragment-steps=5:fragment-loop=false.trace expectations: - device: gl-intel-apl @@ -304,6 +420,10 @@ traces: checksum: 6c5675c503aec910095e57f0390d0311 - device: gl-intel-kbl checksum: 6c5675c503aec910095e57f0390d0311 + - device: gl-intel-whl + checksum: 6c5675c503aec910095e57f0390d0311 + - device: gl-intel-cml + checksum: 6c5675c503aec910095e57f0390d0311 - path: glmark2/loop:vertex-steps=5:fragment-steps=5:fragment-uniform=false.trace expectations: - device: gl-intel-apl @@ -314,6 +434,10 @@ traces: checksum: 35521cba43b8c537f335bf65a31b6492 - device: gl-intel-kbl checksum: 35521cba43b8c537f335bf65a31b6492 + - device: gl-intel-whl + checksum: 35521cba43b8c537f335bf65a31b6492 + - device: gl-intel-cml + checksum: 35521cba43b8c537f335bf65a31b6492 - path: glmark2/loop:vertex-steps=5:fragment-steps=5:fragment-uniform=true.trace expectations: - device: gl-intel-apl @@ -324,6 +448,10 @@ traces: checksum: 314cebc76c8eb24bc4a453ac7b85e6a7 - device: gl-intel-kbl checksum: 314cebc76c8eb24bc4a453ac7b85e6a7 + - 
device: gl-intel-whl + checksum: 314cebc76c8eb24bc4a453ac7b85e6a7 + - device: gl-intel-cml + checksum: 314cebc76c8eb24bc4a453ac7b85e6a7 - path: glmark2/pulsar:quads=5:texture=false:light=false.trace expectations: - device: gl-intel-apl @@ -334,16 +462,24 @@ traces: checksum: e5c0ce9da8cd96f9a07dfdf053683b66 - device: gl-intel-kbl checksum: e5c0ce9da8cd96f9a07dfdf053683b66 + - device: gl-intel-whl + checksum: e5c0ce9da8cd96f9a07dfdf053683b66 + - device: gl-intel-cml + checksum: e5c0ce9da8cd96f9a07dfdf053683b66 - path: glmark2/refract.trace expectations: - device: gl-intel-apl checksum: fce8611398afd2fbef4b6d4e571fe878 - device: gl-intel-glk - checksum: dd8079f362dcbdd0cd1bdaac02c86dc4 + checksum: fce8611398afd2fbef4b6d4e571fe878 - device: gl-intel-amly checksum: dd8079f362dcbdd0cd1bdaac02c86dc4 - device: gl-intel-kbl - checksum: dd8079f362dcbdd0cd1bdaac02c86dc4 + checksum: fce8611398afd2fbef4b6d4e571fe878 + - device: gl-intel-whl + checksum: fce8611398afd2fbef4b6d4e571fe878 + - device: gl-intel-cml + checksum: fce8611398afd2fbef4b6d4e571fe878 - path: glmark2/shading:shading=blinn-phong-inf.trace expectations: - device: gl-intel-apl @@ -354,6 +490,10 @@ traces: checksum: 8e29dfb2accb040d3ea40ad106e776a9 - device: gl-intel-kbl checksum: 8e29dfb2accb040d3ea40ad106e776a9 + - device: gl-intel-whl + checksum: 8e29dfb2accb040d3ea40ad106e776a9 + - device: gl-intel-cml + checksum: 8e29dfb2accb040d3ea40ad106e776a9 - path: glmark2/shading:shading=cel.trace expectations: - device: gl-intel-apl @@ -364,6 +504,10 @@ traces: checksum: adf3d3fd63b94dff23312fad6939e789 - device: gl-intel-kbl checksum: adf3d3fd63b94dff23312fad6939e789 + - device: gl-intel-whl + checksum: adf3d3fd63b94dff23312fad6939e789 + - device: gl-intel-cml + checksum: adf3d3fd63b94dff23312fad6939e789 - path: glmark2/shading:shading=gouraud.trace expectations: - device: gl-intel-apl @@ -374,16 +518,24 @@ traces: checksum: 76d7f65e289f474a7024a44152252aa8 - device: gl-intel-kbl checksum: 76d7f65e289f474a7024a44152252aa8 + - device: gl-intel-whl + checksum: 76d7f65e289f474a7024a44152252aa8 + - device: gl-intel-cml + checksum: 76d7f65e289f474a7024a44152252aa8 - path: glmark2/shading:shading=phong.trace expectations: - device: gl-intel-apl checksum: f6b5fd88e53d12d90622f3504b92e6db - device: gl-intel-glk - checksum: 3fd0b081537d54868292b148ffa5ad80 + checksum: f6b5fd88e53d12d90622f3504b92e6db - device: gl-intel-amly checksum: 3fd0b081537d54868292b148ffa5ad80 - device: gl-intel-kbl - checksum: 3fd0b081537d54868292b148ffa5ad80 + checksum: f6b5fd88e53d12d90622f3504b92e6db + - device: gl-intel-whl + checksum: f6b5fd88e53d12d90622f3504b92e6db + - device: gl-intel-cml + checksum: f6b5fd88e53d12d90622f3504b92e6db - path: glmark2/shadow.trace expectations: - device: gl-intel-apl @@ -394,16 +546,24 @@ traces: checksum: 5d152ef79b699de16dfc1e2c72951346 - device: gl-intel-kbl checksum: 5d152ef79b699de16dfc1e2c72951346 + - device: gl-intel-whl + checksum: 5d152ef79b699de16dfc1e2c72951346 + - device: gl-intel-cml + checksum: 5d152ef79b699de16dfc1e2c72951346 - path: glmark2/terrain.trace expectations: - device: gl-intel-apl checksum: 4cff4ebdf12cf46ffe3a4baee607f32c - device: gl-intel-glk - checksum: 649b4fc7926c1f062fd72c80b1ffea93 + checksum: 4cff4ebdf12cf46ffe3a4baee607f32c - device: gl-intel-amly checksum: 649b4fc7926c1f062fd72c80b1ffea93 - device: gl-intel-kbl - checksum: 649b4fc7926c1f062fd72c80b1ffea93 + checksum: 4cff4ebdf12cf46ffe3a4baee607f32c + - device: gl-intel-whl + checksum: 4cff4ebdf12cf46ffe3a4baee607f32c + - device: gl-intel-cml + 
checksum: 4cff4ebdf12cf46ffe3a4baee607f32c - path: glmark2/texture:texture-filter=linear.trace expectations: - device: gl-intel-apl @@ -414,6 +574,10 @@ traces: checksum: 9c15add29b4c783c93e1cc0d0fa0b084 - device: gl-intel-kbl checksum: 9c15add29b4c783c93e1cc0d0fa0b084 + - device: gl-intel-whl + checksum: 9c15add29b4c783c93e1cc0d0fa0b084 + - device: gl-intel-cml + checksum: 9c15add29b4c783c93e1cc0d0fa0b084 - path: glmark2/texture:texture-filter=mipmap.trace expectations: - device: gl-intel-apl @@ -424,6 +588,10 @@ traces: checksum: ae4c0e010181f4e97d37e254737238c1 - device: gl-intel-kbl checksum: ae4c0e010181f4e97d37e254737238c1 + - device: gl-intel-whl + checksum: ae4c0e010181f4e97d37e254737238c1 + - device: gl-intel-cml + checksum: ae4c0e010181f4e97d37e254737238c1 - path: glmark2/texture:texture-filter=nearest.trace expectations: - device: gl-intel-apl @@ -434,26 +602,38 @@ traces: checksum: 15d736a49c5457bdcf0abcfb9eb07890 - device: gl-intel-kbl checksum: 15d736a49c5457bdcf0abcfb9eb07890 + - device: gl-intel-whl + checksum: 15d736a49c5457bdcf0abcfb9eb07890 + - device: gl-intel-cml + checksum: 15d736a49c5457bdcf0abcfb9eb07890 - path: godot/Material Testers.x86_64_2020.04.08_13.38_frame799.rdc expectations: - device: gl-intel-apl checksum: ba5302821a4a4024ade9b98a191e80cc - device: gl-intel-glk - checksum: daa6f0258a8f25e8cc6aa242ed796f64 + checksum: ba5302821a4a4024ade9b98a191e80cc - device: gl-intel-amly checksum: daa6f0258a8f25e8cc6aa242ed796f64 - device: gl-intel-kbl - checksum: daa6f0258a8f25e8cc6aa242ed796f64 + checksum: ba5302821a4a4024ade9b98a191e80cc + - device: gl-intel-whl + checksum: ba5302821a4a4024ade9b98a191e80cc + - device: gl-intel-cml + checksum: ba5302821a4a4024ade9b98a191e80cc - path: gputest/pixmark-julia-fp32.trace expectations: - device: gl-intel-apl checksum: a5ec72a5da355dfcd689411f89164f0c - device: gl-intel-glk - checksum: 9beb523176e7c153300521679853127a + checksum: a5ec72a5da355dfcd689411f89164f0c - device: gl-intel-amly checksum: 9beb523176e7c153300521679853127a - device: gl-intel-kbl - checksum: 9beb523176e7c153300521679853127a + checksum: a5ec72a5da355dfcd689411f89164f0c + - device: gl-intel-whl + checksum: a5ec72a5da355dfcd689411f89164f0c + - device: gl-intel-cml + checksum: a5ec72a5da355dfcd689411f89164f0c - path: gputest/pixmark-julia-fp64.trace expectations: - device: gl-intel-apl @@ -464,56 +644,80 @@ traces: checksum: 95235e084d88bc41f8a1a05b79e88e33 - device: gl-intel-kbl checksum: 95235e084d88bc41f8a1a05b79e88e33 + - device: gl-intel-whl + checksum: 95235e084d88bc41f8a1a05b79e88e33 + - device: gl-intel-cml + checksum: 95235e084d88bc41f8a1a05b79e88e33 - path: gputest/pixmark-volplosion.trace expectations: - device: gl-intel-apl checksum: 9c891fd21b0cdc79f7071001d95d549c - device: gl-intel-glk - checksum: eedef23963d477408028b620badca109 + checksum: 9c891fd21b0cdc79f7071001d95d549c - device: gl-intel-amly checksum: eedef23963d477408028b620badca109 - device: gl-intel-kbl - checksum: eedef23963d477408028b620badca109 + checksum: 9c891fd21b0cdc79f7071001d95d549c + - device: gl-intel-whl + checksum: 9c891fd21b0cdc79f7071001d95d549c + - device: gl-intel-cml + checksum: 9c891fd21b0cdc79f7071001d95d549c - path: gputest/plot3d.trace expectations: - device: gl-intel-apl checksum: cea8f26227763a3c3b2a2a6af9a0c1da - device: gl-intel-glk - checksum: 777f3cb48ccfdc541d78b12104761616 + checksum: cea8f26227763a3c3b2a2a6af9a0c1da - device: gl-intel-amly checksum: 777f3cb48ccfdc541d78b12104761616 - device: gl-intel-kbl - checksum: 777f3cb48ccfdc541d78b12104761616 + 
checksum: cea8f26227763a3c3b2a2a6af9a0c1da + - device: gl-intel-whl + checksum: cea8f26227763a3c3b2a2a6af9a0c1da + - device: gl-intel-cml + checksum: cea8f26227763a3c3b2a2a6af9a0c1da - path: gputest/tessmark.trace expectations: - device: gl-intel-apl checksum: 10e49cd5a5e12d4a01f504c14b4335cc - device: gl-intel-glk - checksum: 37a451853926683b350353442a296c47 + checksum: 10e49cd5a5e12d4a01f504c14b4335cc - device: gl-intel-amly checksum: 37a451853926683b350353442a296c47 - device: gl-intel-kbl - checksum: 37a451853926683b350353442a296c47 + checksum: 10e49cd5a5e12d4a01f504c14b4335cc + - device: gl-intel-whl + checksum: 10e49cd5a5e12d4a01f504c14b4335cc + - device: gl-intel-cml + checksum: 10e49cd5a5e12d4a01f504c14b4335cc - path: humus/AmbientAperture.trace expectations: - device: gl-intel-apl checksum: 5d0d439423d38d2f0f2520e6a1c4c819 - device: gl-intel-glk - checksum: a946252f3188a979144f4c8f4ea22fea + checksum: 5d0d439423d38d2f0f2520e6a1c4c819 - device: gl-intel-amly checksum: a946252f3188a979144f4c8f4ea22fea - device: gl-intel-kbl - checksum: a946252f3188a979144f4c8f4ea22fea + checksum: 5d0d439423d38d2f0f2520e6a1c4c819 + - device: gl-intel-whl + checksum: 5d0d439423d38d2f0f2520e6a1c4c819 + - device: gl-intel-cml + checksum: 5d0d439423d38d2f0f2520e6a1c4c819 - path: humus/Portals.trace expectations: - device: gl-intel-apl checksum: 4b50340b8898687c37a908e799f9238e - device: gl-intel-glk - checksum: 84da7effee40af07b99e60d12c836c3c + checksum: 4b50340b8898687c37a908e799f9238e - device: gl-intel-amly checksum: 84da7effee40af07b99e60d12c836c3c - device: gl-intel-kbl - checksum: 84da7effee40af07b99e60d12c836c3c + checksum: 4b50340b8898687c37a908e799f9238e + - device: gl-intel-whl + checksum: 4b50340b8898687c37a908e799f9238e + - device: gl-intel-cml + checksum: 4b50340b8898687c37a908e799f9238e - path: humus/CelShading.trace expectations: - device: gl-intel-apl @@ -524,16 +728,24 @@ traces: checksum: 5476bd1ed551e20fbe118dc8b4b5d515 - device: gl-intel-kbl checksum: 5476bd1ed551e20fbe118dc8b4b5d515 + - device: gl-intel-whl + checksum: 5476bd1ed551e20fbe118dc8b4b5d515 + - device: gl-intel-cml + checksum: 5476bd1ed551e20fbe118dc8b4b5d515 - path: humus/DynamicBranching3.trace expectations: - device: gl-intel-apl checksum: 51d5ffe732076a80ffaa7f935c528c42 - device: gl-intel-glk - checksum: e4ad0f6c724ac072fe41c15d35fabc7c + checksum: 51d5ffe732076a80ffaa7f935c528c42 - device: gl-intel-amly checksum: e4ad0f6c724ac072fe41c15d35fabc7c - device: gl-intel-kbl - checksum: e4ad0f6c724ac072fe41c15d35fabc7c + checksum: 51d5ffe732076a80ffaa7f935c528c42 + - device: gl-intel-whl + checksum: 51d5ffe732076a80ffaa7f935c528c42 + - device: gl-intel-cml + checksum: 51d5ffe732076a80ffaa7f935c528c42 - path: humus/HDR.trace expectations: - device: gl-intel-apl @@ -544,6 +756,10 @@ traces: checksum: e23bfd9704f3cc6ce3fd38685f5d3c61 - device: gl-intel-kbl checksum: e23bfd9704f3cc6ce3fd38685f5d3c61 + - device: gl-intel-whl + checksum: e23bfd9704f3cc6ce3fd38685f5d3c61 + - device: gl-intel-cml + checksum: e23bfd9704f3cc6ce3fd38685f5d3c61 - path: humus/RaytracedShadows.trace expectations: - device: gl-intel-apl @@ -554,6 +770,10 @@ traces: checksum: c6ecea7b8ddb60526af4de7ec1a5be62 - device: gl-intel-kbl checksum: c6ecea7b8ddb60526af4de7ec1a5be62 + - device: gl-intel-whl + checksum: c6ecea7b8ddb60526af4de7ec1a5be62 + - device: gl-intel-cml + checksum: c6ecea7b8ddb60526af4de7ec1a5be62 - path: humus/VolumetricFogging2.trace expectations: - device: gl-intel-apl @@ -564,26 +784,38 @@ traces: checksum: 2224e04e0aa162c19a22cbeb5bf16114 
- device: gl-intel-kbl checksum: 2224e04e0aa162c19a22cbeb5bf16114 + - device: gl-intel-whl + checksum: 2224e04e0aa162c19a22cbeb5bf16114 + - device: gl-intel-cml + checksum: 2224e04e0aa162c19a22cbeb5bf16114 - path: neverball/neverball.trace expectations: - device: gl-intel-apl checksum: 75acd3160d268de2c5f286ff4546258c - device: gl-intel-glk - checksum: 4ca08b4200d9e8a5e74babcb9e14b726 + checksum: 75acd3160d268de2c5f286ff4546258c - device: gl-intel-amly checksum: 4ca08b4200d9e8a5e74babcb9e14b726 - device: gl-intel-kbl - checksum: 4ca08b4200d9e8a5e74babcb9e14b726 + checksum: 75acd3160d268de2c5f286ff4546258c + - device: gl-intel-whl + checksum: 75acd3160d268de2c5f286ff4546258c + - device: gl-intel-cml + checksum: 75acd3160d268de2c5f286ff4546258c - path: supertuxkart/supertuxkart-antediluvian-abyss.rdc expectations: - device: gl-intel-apl checksum: 0af2faa0d9183c1bc4dc7612befe1f0a - device: gl-intel-glk - checksum: 8b76f8dc6770a62b413b956d700a1080 + checksum: 0af2faa0d9183c1bc4dc7612befe1f0a - device: gl-intel-amly checksum: 8b76f8dc6770a62b413b956d700a1080 - device: gl-intel-kbl - checksum: 8b76f8dc6770a62b413b956d700a1080 + checksum: 0af2faa0d9183c1bc4dc7612befe1f0a + - device: gl-intel-whl + checksum: 0af2faa0d9183c1bc4dc7612befe1f0a + - device: gl-intel-cml + checksum: 0af2faa0d9183c1bc4dc7612befe1f0a - path: supertuxkart/supertuxkart-menu.rdc expectations: - device: gl-intel-apl @@ -594,16 +826,24 @@ traces: checksum: 0a4095dc7b441643a3336975b61c9e6a - device: gl-intel-kbl checksum: 0a4095dc7b441643a3336975b61c9e6a + - device: gl-intel-whl + checksum: 0a4095dc7b441643a3336975b61c9e6a + - device: gl-intel-cml + checksum: 0a4095dc7b441643a3336975b61c9e6a - path: supertuxkart/supertuxkart-ravenbridge-mansion.rdc expectations: - device: gl-intel-apl checksum: ca0b64f1a62e01765146be8391eae636 - device: gl-intel-glk - checksum: 66a7f3ad9511fd2700f5ec59589b0c7d + checksum: ca0b64f1a62e01765146be8391eae636 - device: gl-intel-amly checksum: 66a7f3ad9511fd2700f5ec59589b0c7d - device: gl-intel-kbl - checksum: 66a7f3ad9511fd2700f5ec59589b0c7d + checksum: ca0b64f1a62e01765146be8391eae636 + - device: gl-intel-whl + checksum: ca0b64f1a62e01765146be8391eae636 + - device: gl-intel-cml + checksum: ca0b64f1a62e01765146be8391eae636 - path: valve/counterstrike-v2.trace expectations: - device: gl-intel-apl @@ -614,6 +854,10 @@ traces: checksum: 757e0c9a37ecfc5a2efb10505f98ad95 - device: gl-intel-kbl checksum: 757e0c9a37ecfc5a2efb10505f98ad95 + - device: gl-intel-whl + checksum: 757e0c9a37ecfc5a2efb10505f98ad95 + - device: gl-intel-cml + checksum: 757e0c9a37ecfc5a2efb10505f98ad95 # 3 minutes on APL #- path: valve/counterstrike-source-v2.trace # expectations: @@ -625,6 +869,10 @@ traces: # checksum: 072e488a7d0289a49d60dc5bcb0b1878 # - device: gl-intel-kbl # checksum: 072e488a7d0289a49d60dc5bcb0b1878 + # - device: gl-intel-whl + # checksum: 072e488a7d0289a49d60dc5bcb0b1878 + # - device: gl-intel-cml + # checksum: 072e488a7d0289a49d60dc5bcb0b1878 # 4 minutes on APL #- path: valve/half-life-2-v2.trace # expectations: @@ -636,6 +884,10 @@ traces: # checksum: 64575bc6478dbc2b2dde1552010ac37b # - device: gl-intel-kbl # checksum: 64575bc6478dbc2b2dde1552010ac37b + # - device: gl-intel-whl + # checksum: 64575bc6478dbc2b2dde1552010ac37b + # - device: gl-intel-cml + # checksum: 64575bc6478dbc2b2dde1552010ac37b # 3 minutes on APL #- path: valve/portal-2-v2.trace # expectations: @@ -647,6 +899,10 @@ traces: # checksum: 9c698dac9ca633e3a54980b80b7b287a # - device: gl-intel-kbl # checksum: 9c698dac9ca633e3a54980b80b7b287a + 
# - device: gl-intel-whl + # checksum: 9c698dac9ca633e3a54980b80b7b287a + # - device: gl-intel-cml + # checksum: 9c698dac9ca633e3a54980b80b7b287a # Seeing connection resets downloading this. # - path: xonotic/xonotic-keybench-high.trace # expectations: @@ -658,3 +914,7 @@ traces: # checksum: 95f60026993c36b37957043190d1a21c # - device: gl-intel-kbl # checksum: 95f60026993c36b37957043190d1a21c + # - device: gl-intel-whl + # checksum: 95f60026993c36b37957043190d1a21c + # - device: gl-intel-cml + # checksum: 95f60026993c36b37957043190d1a21c diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_batch.c b/mesa 3D driver/src/gallium/drivers/iris/iris_batch.c index bc5b58670d..4d073077df 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_batch.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_batch.c @@ -97,22 +97,29 @@ dump_fence_list(struct iris_batch *batch) * Debugging code to dump the validation list, used by INTEL_DEBUG=submit. */ static void -dump_validation_list(struct iris_batch *batch) +dump_bo_list(struct iris_batch *batch) { - fprintf(stderr, "Validation list (length %d):\n", batch->exec_count); + fprintf(stderr, "BO list (length %d):\n", batch->exec_count); for (int i = 0; i < batch->exec_count; i++) { - uint64_t flags = batch->validation_list[i].flags; - assert(batch->validation_list[i].handle == - batch->exec_bos[i]->gem_handle); - fprintf(stderr, "[%2d]: %2d %-14s @ 0x%"PRIx64" (%"PRIu64"B)\t %2d refs %s\n", + struct iris_bo *bo = batch->exec_bos[i]; + struct iris_bo *backing = iris_get_backing_bo(bo); + bool written = BITSET_TEST(batch->bos_written, i); + bool exported = iris_bo_is_exported(bo); + bool imported = iris_bo_is_imported(bo); + + fprintf(stderr, "[%2d]: %3d (%3d) %-14s @ 0x%016"PRIx64" (%-6s %8"PRIu64"B) %2d refs %s%s%s\n", i, - batch->validation_list[i].handle, - batch->exec_bos[i]->name, - (uint64_t)batch->validation_list[i].offset, - batch->exec_bos[i]->size, - batch->exec_bos[i]->refcount, - (flags & EXEC_OBJECT_WRITE) ? " (write)" : ""); + bo->gem_handle, + backing->gem_handle, + bo->name, + bo->address, + backing->real.local ? "local" : "system", + bo->size, + bo->refcount, + written ? " write" : "", + exported ? " exported" : "", + imported ? " imported" : ""); } } @@ -196,11 +203,12 @@ iris_init_batch(struct iris_context *ice, util_dynarray_init(&batch->syncobjs, ralloc_context(NULL)); batch->exec_count = 0; - batch->exec_array_size = 100; + batch->max_gem_handle = 0; + batch->exec_array_size = 128; batch->exec_bos = malloc(batch->exec_array_size * sizeof(batch->exec_bos[0])); - batch->validation_list = - malloc(batch->exec_array_size * sizeof(batch->validation_list[0])); + batch->bos_written = + rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(batch->exec_array_size)); batch->cache.render = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); @@ -212,10 +220,10 @@ iris_init_batch(struct iris_context *ice, batch->other_batches[j++] = &ice->batches[i]; } - if (INTEL_DEBUG) { + if (INTEL_DEBUG(DEBUG_ANY)) { const unsigned decode_flags = INTEL_BATCH_DECODE_FULL | - ((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) | + (INTEL_DEBUG(DEBUG_COLOR) ? 
INTEL_BATCH_DECODE_IN_COLOR : 0) | INTEL_BATCH_DECODE_OFFSETS | INTEL_BATCH_DECODE_FLOATS; @@ -232,34 +240,37 @@ iris_init_batch(struct iris_context *ice, iris_batch_reset(batch); } -static struct drm_i915_gem_exec_object2 * -find_validation_entry(struct iris_batch *batch, struct iris_bo *bo) +static int +find_exec_index(struct iris_batch *batch, struct iris_bo *bo) { unsigned index = READ_ONCE(bo->index); if (index < batch->exec_count && batch->exec_bos[index] == bo) - return &batch->validation_list[index]; + return index; /* May have been shared between multiple active batches */ for (index = 0; index < batch->exec_count; index++) { if (batch->exec_bos[index] == bo) - return &batch->validation_list[index]; + return index; } - return NULL; + return -1; } static void ensure_exec_obj_space(struct iris_batch *batch, uint32_t count) { while (batch->exec_count + count > batch->exec_array_size) { + unsigned old_size = batch->exec_array_size; + batch->exec_array_size *= 2; batch->exec_bos = realloc(batch->exec_bos, batch->exec_array_size * sizeof(batch->exec_bos[0])); - batch->validation_list = - realloc(batch->validation_list, - batch->exec_array_size * sizeof(batch->validation_list[0])); + batch->bos_written = + rerzalloc(NULL, batch->bos_written, BITSET_WORD, + BITSET_WORDS(old_size), + BITSET_WORDS(batch->exec_array_size)); } } @@ -272,16 +283,15 @@ add_bo_to_batch(struct iris_batch *batch, struct iris_bo *bo, bool writable) batch->exec_bos[batch->exec_count] = bo; - batch->validation_list[batch->exec_count] = - (struct drm_i915_gem_exec_object2) { - .handle = bo->gem_handle, - .offset = bo->address, - .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0), - }; + if (writable) + BITSET_SET(batch->bos_written, batch->exec_count); bo->index = batch->exec_count; batch->exec_count++; batch->aperture_space += bo->size; + + batch->max_gem_handle = + MAX2(batch->max_gem_handle, iris_get_backing_bo(bo)->gem_handle); } /** @@ -295,7 +305,7 @@ iris_use_pinned_bo(struct iris_batch *batch, struct iris_bo *bo, bool writable, enum iris_domain access) { - assert(bo->kflags & EXEC_OBJECT_PINNED); + assert(iris_get_backing_bo(bo)->real.kflags & EXEC_OBJECT_PINNED); assert(bo != batch->bo); /* Never mark the workaround BO with EXEC_OBJECT_WRITE. We don't care @@ -312,13 +322,12 @@ iris_use_pinned_bo(struct iris_batch *batch, iris_bo_bump_seqno(bo, batch->next_seqno, access); } - struct drm_i915_gem_exec_object2 *existing_entry = - find_validation_entry(batch, bo); + int existing_index = find_exec_index(batch, bo); - if (existing_entry) { - /* The BO is already in the validation list; mark it writable */ + if (existing_index != -1) { + /* The BO is already in the list; mark it writable */ if (writable) - existing_entry->flags |= EXEC_OBJECT_WRITE; + BITSET_SET(batch->bos_written, existing_index); return; } @@ -328,8 +337,8 @@ iris_use_pinned_bo(struct iris_batch *batch, * we may need to flush and synchronize with other batches. */ for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) { - struct drm_i915_gem_exec_object2 *other_entry = - find_validation_entry(batch->other_batches[b], bo); + struct iris_batch *other_batch = batch->other_batches[b]; + int other_index = find_exec_index(other_batch, bo); /* If the buffer is referenced by another batch, and either batch * intends to write it, then flush the other batch and synchronize. 
@@ -345,13 +354,9 @@ iris_use_pinned_bo(struct iris_batch *batch, * share a streaming state buffer or shader assembly buffer, and * we want to avoid synchronizing in this case. */ - if (other_entry && - ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) { - iris_batch_flush(batch->other_batches[b]); - iris_batch_add_syncobj(batch, - batch->other_batches[b]->last_fence->syncobj, - I915_EXEC_FENCE_WAIT); - } + if (other_index != -1 && + (writable || BITSET_TEST(other_batch->bos_written, other_index))) + iris_batch_flush(other_batch); } } @@ -365,10 +370,11 @@ create_batch(struct iris_batch *batch) struct iris_screen *screen = batch->screen; struct iris_bufmgr *bufmgr = screen->bufmgr; + /* TODO: We probably could suballocate batches... */ batch->bo = iris_bo_alloc(bufmgr, "command buffer", BATCH_SZ + BATCH_RESERVED, 1, - IRIS_MEMZONE_OTHER, 0); - batch->bo->kflags |= EXEC_OBJECT_CAPTURE; + IRIS_MEMZONE_OTHER, BO_ALLOC_NO_SUBALLOC); + iris_get_backing_bo(batch->bo)->real.kflags |= EXEC_OBJECT_CAPTURE; batch->map = iris_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE); batch->map_next = batch->map; @@ -398,6 +404,7 @@ static void iris_batch_reset(struct iris_batch *batch) { struct iris_screen *screen = batch->screen; + struct iris_bufmgr *bufmgr = screen->bufmgr; iris_bo_unreference(batch->bo); batch->primary_batch_size = 0; @@ -409,9 +416,12 @@ iris_batch_reset(struct iris_batch *batch) create_batch(batch); assert(batch->bo->index == 0); - struct iris_syncobj *syncobj = iris_create_syncobj(screen); + memset(batch->bos_written, 0, + sizeof(BITSET_WORD) * BITSET_WORDS(batch->exec_array_size)); + + struct iris_syncobj *syncobj = iris_create_syncobj(bufmgr); iris_batch_add_syncobj(batch, syncobj, I915_EXEC_FENCE_SIGNAL); - iris_syncobj_reference(screen, &syncobj, NULL); + iris_syncobj_reference(bufmgr, &syncobj, NULL); assert(!batch->sync_region_depth); iris_batch_sync_boundary(batch); @@ -435,14 +445,14 @@ iris_batch_free(struct iris_batch *batch) iris_bo_unreference(batch->exec_bos[i]); } free(batch->exec_bos); - free(batch->validation_list); + ralloc_free(batch->bos_written); ralloc_free(batch->exec_fences.mem_ctx); pipe_resource_reference(&batch->fine_fences.ref.res, NULL); util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s) - iris_syncobj_reference(screen, s, NULL); + iris_syncobj_reference(bufmgr, s, NULL); ralloc_free(batch->syncobjs.mem_ctx); iris_fine_fence_reference(batch->screen, &batch->last_fence, NULL); @@ -460,7 +470,7 @@ iris_batch_free(struct iris_batch *batch) _mesa_hash_table_destroy(batch->cache.render, NULL); - if (INTEL_DEBUG) + if (INTEL_DEBUG(DEBUG_ANY)) intel_batch_decode_ctx_finish(&batch->decoder); } @@ -626,6 +636,122 @@ iris_batch_check_for_reset(struct iris_batch *batch) return status; } +static void +move_syncobj_to_batch(struct iris_batch *batch, + struct iris_syncobj **p_syncobj, + unsigned flags) +{ + struct iris_bufmgr *bufmgr = batch->screen->bufmgr; + + if (!*p_syncobj) + return; + + bool found = false; + util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s) { + if (*p_syncobj == *s) { + found = true; + break; + } + } + + if (!found) + iris_batch_add_syncobj(batch, *p_syncobj, flags); + + iris_syncobj_reference(bufmgr, p_syncobj, NULL); +} + +static void +update_bo_syncobjs(struct iris_batch *batch, struct iris_bo *bo, bool write) +{ + struct iris_screen *screen = batch->screen; + struct iris_bufmgr *bufmgr = screen->bufmgr; + + /* Make sure bo->deps is big enough */ + if (screen->id >= bo->deps_size) { + int new_size = screen->id + 
1; + bo->deps = realloc(bo->deps, new_size * sizeof(bo->deps[0])); + memset(&bo->deps[bo->deps_size], 0, + sizeof(bo->deps[0]) * (new_size - bo->deps_size)); + + bo->deps_size = new_size; + } + + /* When it comes to execbuf submission of non-shared buffers, we only need + * to care about the reads and writes done by the other batches of our own + * screen, and we also don't care about the reads and writes done by our + * own batch, although we need to track them. Just note that other parts of + * our code may need to care about all the operations done by every batch + * on every screen. + */ + struct iris_bo_screen_deps *deps = &bo->deps[screen->id]; + int batch_idx = batch->name; + +#if IRIS_BATCH_COUNT == 2 + /* Due to the above, we exploit the fact that IRIS_BATCH_COUNT is actually + * 2, which means there's only one other batch we need to care about. + */ + int other_batch_idx = 1 - batch_idx; +#else + /* For IRIS_BATCH_COUNT == 3 we can do: + * int other_batch_idxs[IRIS_BATCH_COUNT - 1] = { + * (batch_idx ^ 1) & 1, + * (batch_idx ^ 2) & 2, + * }; + * For IRIS_BATCH_COUNT == 4 we can do: + * int other_batch_idxs[IRIS_BATCH_COUNT - 1] = { + * (batch_idx + 1) & 3, + * (batch_idx + 2) & 3, + * (batch_idx + 3) & 3, + * }; + */ +#error "Implement me." +#endif + + /* If it is being written to by others, wait on it. */ + if (deps->write_syncobjs[other_batch_idx]) + move_syncobj_to_batch(batch, &deps->write_syncobjs[other_batch_idx], + I915_EXEC_FENCE_WAIT); + + struct iris_syncobj *batch_syncobj = iris_batch_get_signal_syncobj(batch); + + if (write) { + /* If we're writing to it, set our batch's syncobj as write_syncobj so + * others can wait on us. Also wait on every reader we care about before + * writing. + */ + iris_syncobj_reference(bufmgr, &deps->write_syncobjs[batch_idx], + batch_syncobj); + + move_syncobj_to_batch(batch, &deps->read_syncobjs[other_batch_idx], + I915_EXEC_FENCE_WAIT); + + } else { + /* If we're reading, replace the previous read from our batch index. */ + iris_syncobj_reference(bufmgr, &deps->read_syncobjs[batch_idx], + batch_syncobj); + } +} + +static void +update_batch_syncobjs(struct iris_batch *batch) +{ + struct iris_bufmgr *bufmgr = batch->screen->bufmgr; + simple_mtx_t *bo_deps_lock = iris_bufmgr_get_bo_deps_lock(bufmgr); + + simple_mtx_lock(bo_deps_lock); + + for (int i = 0; i < batch->exec_count; i++) { + struct iris_bo *bo = batch->exec_bos[i]; + bool write = BITSET_TEST(batch->bos_written, i); + + if (bo == batch->screen->workaround_bo) + continue; + + update_bo_syncobjs(batch, bo, write); + } + simple_mtx_unlock(bo_deps_lock); +} + /** * Submit the batch to the GPU via execbuffer2.
*/ @@ -634,6 +760,46 @@ submit_batch(struct iris_batch *batch) { iris_bo_unmap(batch->bo); + struct drm_i915_gem_exec_object2 *validation_list = + malloc(batch->exec_count * sizeof(*validation_list)); + + unsigned *index_for_handle = + calloc(batch->max_gem_handle + 1, sizeof(unsigned)); + + unsigned validation_count = 0; + for (int i = 0; i < batch->exec_count; i++) { + struct iris_bo *bo = iris_get_backing_bo(batch->exec_bos[i]); + assert(bo->gem_handle != 0); + + bool written = BITSET_TEST(batch->bos_written, i); + unsigned prev_index = index_for_handle[bo->gem_handle]; + if (prev_index > 0) { + if (written) + validation_list[prev_index].flags |= EXEC_OBJECT_WRITE; + } else { + index_for_handle[bo->gem_handle] = validation_count; + validation_list[validation_count] = + (struct drm_i915_gem_exec_object2) { + .handle = bo->gem_handle, + .offset = bo->address, + .flags = bo->real.kflags | (written ? EXEC_OBJECT_WRITE : 0) | + (iris_bo_is_external(bo) ? 0 : EXEC_OBJECT_ASYNC), + }; + ++validation_count; + } + } + + free(index_for_handle); + + if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT)) { + dump_fence_list(batch); + dump_bo_list(batch); + } + + if (INTEL_DEBUG(DEBUG_BATCH)) { + decode_batch(batch); + } + /* The requirements for using I915_EXEC_NO_RELOC are: * * The addresses written in the objects must match the corresponding @@ -647,8 +813,8 @@ submit_batch(struct iris_batch *batch) * address of that object within the active context. */ struct drm_i915_gem_execbuffer2 execbuf = { - .buffers_ptr = (uintptr_t) batch->validation_list, - .buffer_count = batch->exec_count, + .buffers_ptr = (uintptr_t) validation_list, + .buffer_count = validation_count, .batch_start_offset = 0, /* This must be QWord aligned. */ .batch_len = ALIGN(batch->primary_batch_size, 8), @@ -667,7 +833,7 @@ } int ret = 0; - if (!batch->screen->no_hw && + if (!batch->screen->devinfo.no_hw && intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf)) ret = -errno; @@ -677,9 +843,13 @@ bo->idle = false; bo->index = -1; + iris_get_backing_bo(bo)->idle = false; + iris_bo_unreference(bo); } + free(validation_list); + return ret; } @@ -710,7 +880,9 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line) iris_finish_batch(batch); - if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) { + update_batch_syncobjs(batch); + + if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) { const char *basefile = strstr(file, "iris/"); if (basefile) file = basefile + 5; @@ -723,28 +895,36 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line) batch->exec_count, (float) batch->aperture_space / (1024 * 1024)); - if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) { - dump_fence_list(batch); - dump_validation_list(batch); - } - - if (INTEL_DEBUG & DEBUG_BATCH) { - decode_batch(batch); - } } int ret = submit_batch(batch); + /* When batch submission fails, our end-of-batch syncobj remains + * unsignalled, and in fact is not even considered submitted. + * + * In the hang recovery case (-EIO) or -ENOMEM, we recreate our context and + * attempt to carry on. In that case, we need to signal our syncobj, + * dubiously claiming that this batch completed, because future batches may + * depend on it. If we don't, then execbuf would fail with -EINVAL for + * those batches, because they depend on a syncobj that's considered to be + * "never submitted". This would lead to an abort().
So here, we signal + * the failing batch's syncobj to try and allow further progress to be + * made, knowing we may have broken our dependency tracking. + */ + if (ret < 0) + iris_syncobj_signal(screen->bufmgr, iris_batch_get_signal_syncobj(batch)); + batch->exec_count = 0; + batch->max_gem_handle = 0; batch->aperture_space = 0; util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s) - iris_syncobj_reference(screen, s, NULL); + iris_syncobj_reference(screen->bufmgr, s, NULL); util_dynarray_clear(&batch->syncobjs); util_dynarray_clear(&batch->exec_fences); - if (INTEL_DEBUG & DEBUG_SYNC) { + if (INTEL_DEBUG(DEBUG_SYNC)) { dbg_printf("waiting for idle\n"); iris_bo_wait_rendering(batch->bo); /* if execbuf failed; this is a nop */ } @@ -769,7 +949,7 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line) if (ret < 0) { #ifdef DEBUG - const bool color = INTEL_DEBUG & DEBUG_COLOR; + const bool color = INTEL_DEBUG(DEBUG_COLOR); fprintf(stderr, "%siris: Failed to submit batchbuffer: %-80s%s\n", color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : ""); #endif @@ -785,7 +965,7 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line) bool iris_batch_references(struct iris_batch *batch, struct iris_bo *bo) { - return find_validation_entry(batch, bo) != NULL; + return find_exec_index(batch, bo) != -1; } /** diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_batch.h b/mesa 3D driver/src/gallium/drivers/iris/iris_batch.h index a9f5cc66c1..0d3d597839 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_batch.h +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_batch.h @@ -56,8 +56,6 @@ enum iris_batch_name { IRIS_BATCH_COMPUTE, }; -#define IRIS_BATCH_COUNT 2 - struct iris_batch { struct iris_context *ice; struct iris_screen *screen; @@ -83,11 +81,13 @@ struct iris_batch { uint32_t hw_ctx_id; - /** The validation list */ - struct drm_i915_gem_exec_object2 *validation_list; + /** A list of all BOs referenced by this batch */ struct iris_bo **exec_bos; int exec_count; int exec_array_size; + /** Bitset of whether this batch writes to BO `i'. */ + BITSET_WORD *bos_written; + uint32_t max_gem_handle; /** Whether INTEL_BLACKHOLE_RENDER is enabled in the batch (aka first * instruction is a MI_BATCH_BUFFER_END). @@ -269,7 +269,7 @@ iris_batch_reference_signal_syncobj(struct iris_batch *batch, struct iris_syncobj **out_syncobj) { struct iris_syncobj *syncobj = iris_batch_get_signal_syncobj(batch); - iris_syncobj_reference(batch->screen, out_syncobj, syncobj); + iris_syncobj_reference(batch->screen->bufmgr, out_syncobj, syncobj); } /** diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_blit.c b/mesa 3D driver/src/gallium/drivers/iris/iris_blit.c index 0afac31dce..6f789ad377 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_blit.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_blit.c @@ -238,8 +238,6 @@ iris_blorp_surf_for_resource(struct isl_device *isl_dev, { struct iris_resource *res = (void *) p_res; - assert(!iris_resource_unfinished_aux_import(res)); - *surf = (struct blorp_surf) { .surf = &res->surf, .addr = (struct blorp_address) { @@ -261,8 +259,7 @@ iris_blorp_surf_for_resource(struct isl_device *isl_dev, .reloc_flags = is_render_target ? 
EXEC_OBJECT_WRITE : 0, .mocs = iris_mocs(res->bo, isl_dev, 0), }; - surf->clear_color = - iris_resource_get_clear_color(res, NULL, NULL); + surf->clear_color = res->aux.clear_color; surf->clear_color_addr = (struct blorp_address) { .buffer = res->aux.clear_color_bo, .offset = res->aux.clear_color_offset, @@ -463,11 +460,6 @@ iris_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) enum pipe_format dst_pfmt = pipe_format_for_aspect(info->dst.format, aspect); - if (iris_resource_unfinished_aux_import(src_res)) - iris_resource_finish_aux_import(ctx->screen, src_res); - if (iris_resource_unfinished_aux_import(dst_res)) - iris_resource_finish_aux_import(ctx->screen, dst_res); - struct iris_format_info src_fmt = iris_format_for_usage(devinfo, src_pfmt, ISL_SURF_USAGE_TEXTURE_BIT); enum isl_aux_usage src_aux_usage = @@ -710,20 +702,6 @@ iris_copy_region(struct blorp_context *blorp, tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format); } -static struct iris_batch * -get_preferred_batch(struct iris_context *ice, struct iris_bo *bo) -{ - /* If the compute batch is already using this buffer, we'd prefer to - * continue queueing in the compute batch. - */ - if (iris_batch_references(&ice->batches[IRIS_BATCH_COMPUTE], bo)) - return &ice->batches[IRIS_BATCH_COMPUTE]; - - /* Otherwise default to the render batch. */ - return &ice->batches[IRIS_BATCH_RENDER]; -} - - /** * The pipe->resource_copy_region() driver hook. * @@ -740,30 +718,7 @@ iris_resource_copy_region(struct pipe_context *ctx, const struct pipe_box *src_box) { struct iris_context *ice = (void *) ctx; - struct iris_screen *screen = (void *) ctx->screen; struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; - struct iris_resource *src = (void *) p_src; - struct iris_resource *dst = (void *) p_dst; - - if (iris_resource_unfinished_aux_import(src)) - iris_resource_finish_aux_import(ctx->screen, src); - if (iris_resource_unfinished_aux_import(dst)) - iris_resource_finish_aux_import(ctx->screen, dst); - - /* Use MI_COPY_MEM_MEM for tiny (<= 16 byte, % 4) buffer copies. 
*/ - if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER && - dstx % 4 == 0 && src_box->x % 4 == 0 && - src_box->width % 4 == 0 && src_box->width <= 16) { - struct iris_bo *dst_bo = iris_resource_bo(p_dst); - batch = get_preferred_batch(ice, dst_bo); - iris_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4)); - iris_emit_pipe_control_flush(batch, - "stall for MI_COPY_MEM_MEM copy_region", - PIPE_CONTROL_CS_STALL); - screen->vtbl.copy_mem_mem(batch, dst_bo, dstx, iris_resource_bo(p_src), - src_box->x, src_box->width); - return; - } iris_copy_region(&ice->blorp, batch, p_dst, dst_level, dstx, dsty, dstz, p_src, src_level, src_box); @@ -778,8 +733,8 @@ iris_resource_copy_region(struct pipe_context *ctx, dsty, dstz, &s_src_res->base.b, src_level, src_box); } - iris_flush_and_dirty_for_history(ice, batch, dst, - PIPE_CONTROL_RENDER_TARGET_FLUSH | PIPE_CONTROL_TILE_CACHE_FLUSH, + iris_flush_and_dirty_for_history(ice, batch, (struct iris_resource *)p_dst, + PIPE_CONTROL_RENDER_TARGET_FLUSH, "cache history: post copy_region"); } diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_blorp.c b/mesa 3D driver/src/gallium/drivers/iris/iris_blorp.c index ec8fa81a54..1a9ff91364 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_blorp.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_blorp.c @@ -139,6 +139,16 @@ blorp_alloc_dynamic_state(struct blorp_batch *blorp_batch, size, alignment, offset, NULL); } +UNUSED static void * +blorp_alloc_general_state(struct blorp_batch *blorp_batch, + uint32_t size, + uint32_t alignment, + uint32_t *offset) +{ + /* Use dynamic state range for general state on iris. */ + return blorp_alloc_dynamic_state(blorp_batch, size, alignment, offset); +} + static void blorp_alloc_binding_table(struct blorp_batch *blorp_batch, unsigned num_entries, @@ -274,21 +284,9 @@ iris_blorp_exec(struct blorp_batch *blorp_batch, PIPE_CONTROL_STALL_AT_SCOREBOARD); #endif -#if GFX_VERx10 == 120 - if (!(blorp_batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)) { - /* Wa_14010455700 - * - * ISL will change some CHICKEN registers depending on the depth surface - * format, along with emitting the depth and stencil packets. In that - * case, we want to do a depth flush and stall, so the pipeline is not - * using these settings while we change the registers. - */ - iris_emit_end_of_pipe_sync(batch, - "Workaround: Stop pipeline for 14010455700", - PIPE_CONTROL_DEPTH_STALL | - PIPE_CONTROL_DEPTH_CACHE_FLUSH); - } -#endif + if (params->depth.enabled && + !(blorp_batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)) + genX(emit_depth_state_workarounds)(ice, batch, ¶ms->depth.surf); /* Flush the render cache in cases where the same surface is used with * different aux modes, which can lead to GPU hangs. Invalidation of diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.c b/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.c index 9534f77a6b..7a9d1a4820 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.c @@ -170,6 +170,26 @@ struct iris_memregion { uint64_t size; }; +#define NUM_SLAB_ALLOCATORS 3 + +enum iris_heap { + IRIS_HEAP_SYSTEM_MEMORY, + IRIS_HEAP_DEVICE_LOCAL, + IRIS_HEAP_MAX, +}; + +struct iris_slab { + struct pb_slab base; + + unsigned entry_size; + + /** The BO representing the entire slab */ + struct iris_bo *bo; + + /** Array of iris_bo structs representing BOs allocated out of this slab */ + struct iris_bo *entries; +}; + struct iris_bufmgr { /** * List into the list of bufmgr. 
@@ -181,6 +201,7 @@ struct iris_bufmgr { int fd; simple_mtx_t lock; + simple_mtx_t bo_deps_lock; /** Array of lists of cached gem objects of power-of-two sizes */ struct bo_cache_bucket cache_bucket[14 * 4]; @@ -206,6 +227,8 @@ struct iris_bufmgr { uint64_t vma_min_align; struct iris_memregion vram, sys; + int next_screen_id; + bool has_llc:1; bool has_local_mem:1; bool has_mmap_offset:1; @@ -214,6 +237,8 @@ struct iris_bufmgr { bool bo_reuse:1; struct intel_aux_map_context *aux_map_ctx; + + struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS]; }; static simple_mtx_t global_bufmgr_list_mutex = _SIMPLE_MTX_INITIALIZER_NP; @@ -232,7 +257,8 @@ find_and_ref_external_bo(struct hash_table *ht, unsigned int key) if (bo) { assert(iris_bo_is_external(bo)); - assert(!bo->reusable); + assert(iris_bo_is_real(bo)); + assert(!bo->real.reusable); /* Being non-reusable, the BO cannot be in the cache lists, but it * may be in the zombie list if it had reached zero references, but @@ -379,23 +405,108 @@ vma_free(struct iris_bufmgr *bufmgr, util_vma_heap_free(&bufmgr->vma_allocator[memzone], address, size); } -int -iris_bo_busy(struct iris_bo *bo) +static bool +iris_bo_busy_gem(struct iris_bo *bo) { + assert(iris_bo_is_real(bo)); + struct iris_bufmgr *bufmgr = bo->bufmgr; struct drm_i915_gem_busy busy = { .handle = bo->gem_handle }; int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy); if (ret == 0) { - bo->idle = !busy.busy; return busy.busy; } return false; } +/* A timeout of 0 just checks for busyness. */ +static int +iris_bo_wait_syncobj(struct iris_bo *bo, int64_t timeout_ns) +{ + int ret = 0; + struct iris_bufmgr *bufmgr = bo->bufmgr; + + /* If we know it's idle, don't bother with the kernel round trip. */ + if (bo->idle) + return 0; + + simple_mtx_lock(&bufmgr->bo_deps_lock); + + uint32_t handles[bo->deps_size * IRIS_BATCH_COUNT * 2]; + int handle_count = 0; + + for (int d = 0; d < bo->deps_size; d++) { + for (int b = 0; b < IRIS_BATCH_COUNT; b++) { + struct iris_syncobj *r = bo->deps[d].read_syncobjs[b]; + struct iris_syncobj *w = bo->deps[d].write_syncobjs[b]; + if (r) + handles[handle_count++] = r->handle; + if (w) + handles[handle_count++] = w->handle; + } + } + + if (handle_count == 0) + goto out; + + /* Unlike the gem wait, negative values are not infinite here. */ + int64_t timeout_abs = os_time_get_absolute_timeout(timeout_ns); + if (timeout_abs < 0) + timeout_abs = INT64_MAX; + + struct drm_syncobj_wait args = { + .handles = (uintptr_t) handles, + .timeout_nsec = timeout_abs, + .count_handles = handle_count, + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, + }; + + ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args); + if (ret != 0) { + ret = -errno; + goto out; + } + + /* We just waited on everything, so clear all the deps. */ + for (int d = 0; d < bo->deps_size; d++) { + for (int b = 0; b < IRIS_BATCH_COUNT; b++) { + iris_syncobj_reference(bufmgr, &bo->deps[d].write_syncobjs[b], NULL); + iris_syncobj_reference(bufmgr, &bo->deps[d].read_syncobjs[b], NULL); + } + } + +out: + simple_mtx_unlock(&bufmgr->bo_deps_lock); + return ret; +} + +static bool +iris_bo_busy_syncobj(struct iris_bo *bo) +{ + return iris_bo_wait_syncobj(bo, 0) == -ETIME; +} + +bool +iris_bo_busy(struct iris_bo *bo) +{ + bool busy; + if (iris_bo_is_external(bo)) + busy = iris_bo_busy_gem(bo); + else + busy = iris_bo_busy_syncobj(bo); + + bo->idle = !busy; + + return busy; +} + int iris_bo_madvise(struct iris_bo *bo, int state) { + /* We can't madvise suballocated BOs.
*/ + assert(iris_bo_is_real(bo)); + struct drm_i915_gem_madvise madv = { .handle = bo->gem_handle, .madv = state, @@ -414,7 +525,7 @@ bo_calloc(void) if (!bo) return NULL; - list_inithead(&bo->exports); + list_inithead(&bo->real.exports); bo->hash = _mesa_hash_pointer(bo); @@ -424,9 +535,282 @@ bo_calloc(void) static void bo_unmap(struct iris_bo *bo) { - VG_NOACCESS(bo->map, bo->size); - os_munmap(bo->map, bo->size); - bo->map = NULL; + assert(iris_bo_is_real(bo)); + + VG_NOACCESS(bo->real.map, bo->size); + os_munmap(bo->real.map, bo->size); + bo->real.map = NULL; +} + +static struct pb_slabs * +get_slabs(struct iris_bufmgr *bufmgr, uint64_t size) +{ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + struct pb_slabs *slabs = &bufmgr->bo_slabs[i]; + + if (size <= 1ull << (slabs->min_order + slabs->num_orders - 1)) + return slabs; + } + + unreachable("should have found a valid slab for this size"); +} + +/* Return the power of two size of a slab entry matching the input size. */ +static unsigned +get_slab_pot_entry_size(struct iris_bufmgr *bufmgr, unsigned size) +{ + unsigned entry_size = util_next_power_of_two(size); + unsigned min_entry_size = 1 << bufmgr->bo_slabs[0].min_order; + + return MAX2(entry_size, min_entry_size); +} + +/* Return the slab entry alignment. */ +static unsigned +get_slab_entry_alignment(struct iris_bufmgr *bufmgr, unsigned size) +{ + unsigned entry_size = get_slab_pot_entry_size(bufmgr, size); + + if (size <= entry_size * 3 / 4) + return entry_size / 4; + + return entry_size; +} + +static bool +iris_can_reclaim_slab(void *priv, struct pb_slab_entry *entry) +{ + struct iris_bo *bo = container_of(entry, struct iris_bo, slab.entry); + + return !iris_bo_busy(bo); +} + +static void +iris_slab_free(void *priv, struct pb_slab *pslab) +{ + struct iris_bufmgr *bufmgr = priv; + struct iris_slab *slab = (void *) pslab; + struct intel_aux_map_context *aux_map_ctx = bufmgr->aux_map_ctx; + + assert(!slab->bo->aux_map_address); + + if (aux_map_ctx) { + /* Since we're freeing the whole slab, all buffers allocated out of it + * must be reclaimable. We require buffers to be idle to be reclaimed + * (see iris_can_reclaim_slab()), so we know all entries must be idle. + * Therefore, we can safely unmap their aux table entries. + */ + for (unsigned i = 0; i < pslab->num_entries; i++) { + struct iris_bo *bo = &slab->entries[i]; + if (bo->aux_map_address) { + intel_aux_map_unmap_range(aux_map_ctx, bo->address, bo->size); + bo->aux_map_address = 0; + } + } + } + + iris_bo_unreference(slab->bo); + + free(slab->entries); + free(slab); +} + +static struct pb_slab * +iris_slab_alloc(void *priv, + unsigned heap, + unsigned entry_size, + unsigned group_index) +{ + struct iris_bufmgr *bufmgr = priv; + struct iris_slab *slab = calloc(1, sizeof(struct iris_slab)); + unsigned flags = heap == IRIS_HEAP_SYSTEM_MEMORY ? BO_ALLOC_SMEM : 0; + unsigned slab_size = 0; + /* We only support slab allocation for IRIS_MEMZONE_OTHER */ + enum iris_memory_zone memzone = IRIS_MEMZONE_OTHER; + + if (!slab) + return NULL; + + struct pb_slabs *slabs = bufmgr->bo_slabs; + + /* Determine the slab buffer size. */ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + unsigned max_entry_size = + 1 << (slabs[i].min_order + slabs[i].num_orders - 1); + + if (entry_size <= max_entry_size) { + /* The slab size is twice the size of the largest possible entry. 
*/ + slab_size = max_entry_size * 2; + + if (!util_is_power_of_two_nonzero(entry_size)) { + assert(util_is_power_of_two_nonzero(entry_size * 4 / 3)); + + /* If the entry size is 3/4 of a power of two, we would waste + * space and not gain anything if we allocated only twice the + * power of two for the backing buffer: + * + * 2 * 3/4 = 1.5 usable with buffer size 2 + * + * Allocating 5 times the entry size leads us to the next power + * of two and results in a much better memory utilization: + * + * 5 * 3/4 = 3.75 usable with buffer size 4 + */ + if (entry_size * 5 > slab_size) + slab_size = util_next_power_of_two(entry_size * 5); + } + + /* The largest slab should have the same size as the PTE fragment + * size to get faster address translation. + * + * TODO: move this to intel_device_info? + */ + const unsigned pte_size = 2 * 1024 * 1024; + + if (i == NUM_SLAB_ALLOCATORS - 1 && slab_size < pte_size) + slab_size = pte_size; + + break; + } + } + assert(slab_size != 0); + + slab->bo = + iris_bo_alloc(bufmgr, "slab", slab_size, slab_size, memzone, flags); + if (!slab->bo) + goto fail; + + slab_size = slab->bo->size; + + slab->base.num_entries = slab_size / entry_size; + slab->base.num_free = slab->base.num_entries; + slab->entry_size = entry_size; + slab->entries = calloc(slab->base.num_entries, sizeof(*slab->entries)); + if (!slab->entries) + goto fail_bo; + + list_inithead(&slab->base.free); + + for (unsigned i = 0; i < slab->base.num_entries; i++) { + struct iris_bo *bo = &slab->entries[i]; + + bo->size = entry_size; + bo->bufmgr = bufmgr; + bo->hash = _mesa_hash_pointer(bo); + bo->gem_handle = 0; + bo->address = slab->bo->address + i * entry_size; + bo->aux_map_address = 0; + bo->index = -1; + bo->refcount = 0; + bo->idle = true; + + bo->slab.entry.slab = &slab->base; + bo->slab.entry.group_index = group_index; + bo->slab.entry.entry_size = entry_size; + + bo->slab.real = iris_get_backing_bo(slab->bo); + + list_addtail(&bo->slab.entry.head, &slab->base.free); + } + + return &slab->base; + +fail_bo: + iris_bo_unreference(slab->bo); +fail: + free(slab); + return NULL; +} + +static struct iris_bo * +alloc_bo_from_slabs(struct iris_bufmgr *bufmgr, + const char *name, + uint64_t size, + uint32_t alignment, + unsigned flags, + bool local) +{ + if (flags & BO_ALLOC_NO_SUBALLOC) + return NULL; + + struct pb_slabs *last_slab = &bufmgr->bo_slabs[NUM_SLAB_ALLOCATORS - 1]; + unsigned max_slab_entry_size = + 1 << (last_slab->min_order + last_slab->num_orders - 1); + + if (size > max_slab_entry_size) + return NULL; + + struct pb_slab_entry *entry; + + enum iris_heap heap = + local ? IRIS_HEAP_DEVICE_LOCAL : IRIS_HEAP_SYSTEM_MEMORY; + + unsigned alloc_size = size; + + /* Always use slabs for sizes less than 4 KB because the kernel aligns + * everything to 4 KB. + */ + if (size < alignment && alignment <= 4 * 1024) + alloc_size = alignment; + + if (alignment > get_slab_entry_alignment(bufmgr, alloc_size)) { + /* 3/4 allocations can return too small alignment. + * Try again with a power of two allocation size. + */ + unsigned pot_size = get_slab_pot_entry_size(bufmgr, alloc_size); + + if (alignment <= pot_size) { + /* This size works but wastes some memory to fulfill the alignment. */ + alloc_size = pot_size; + } else { + /* can't fulfill alignment requirements */ + return NULL; + } + } + + struct pb_slabs *slabs = get_slabs(bufmgr, alloc_size); + entry = pb_slab_alloc(slabs, alloc_size, heap); + if (!entry) { + /* Clean up and try again... 
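 * a failed pb_slab_alloc() usually just means every slab is full of
 * entries whose fences have signaled but which nobody has swept yet.
 *
 * The alignment fixup above, worked with illustrative numbers: a
 * 40 KB request with 64 KB alignment first sees
 * get_slab_entry_alignment() = 16 KB (40 KB is under 3/4 of the 64 KB
 * pot size), which is too weak, so the size is bumped to the full
 * 64 KB pot entry, trading some padding for the required alignment.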
*/ + pb_slabs_reclaim(slabs); + + entry = pb_slab_alloc(slabs, alloc_size, heap); + } + if (!entry) + return NULL; + + struct iris_bo *bo = container_of(entry, struct iris_bo, slab.entry); + + if (bo->aux_map_address && bo->bufmgr->aux_map_ctx) { + /* This buffer was associated with an aux-buffer range. We only allow + * slab allocated buffers to be reclaimed when idle (not in use by an + * executing batch). (See iris_can_reclaim_slab().) So we know that + * our previous aux mapping is no longer in use, and we can safely + * remove it. + */ + intel_aux_map_unmap_range(bo->bufmgr->aux_map_ctx, bo->address, + bo->size); + bo->aux_map_address = 0; + } + + p_atomic_set(&bo->refcount, 1); + bo->name = name; + bo->size = size; + + /* Zero the contents if necessary. If this fails, fall back to + * allocating a fresh BO, which will always be zeroed by the kernel. + */ + if (flags & BO_ALLOC_ZEROED) { + void *map = iris_bo_map(NULL, bo, MAP_WRITE | MAP_RAW); + if (map) { + memset(map, 0, bo->size); + } else { + pb_slab_free(slabs, &bo->slab.entry); + return NULL; + } + } + + return bo; } static struct iris_bo * @@ -444,10 +828,12 @@ alloc_bo_from_cache(struct iris_bufmgr *bufmgr, struct iris_bo *bo = NULL; list_for_each_entry_safe(struct iris_bo, cur, &bucket->head, head) { + assert(iris_bo_is_real(cur)); + /* Find one that's got the right mapping type. We used to swap maps * around but the kernel doesn't allow this on discrete GPUs. */ - if (mmap_mode != cur->mmap_mode) + if (mmap_mode != cur->real.mmap_mode) continue; /* Try a little harder to find one that's already in the right memzone */ @@ -575,7 +961,7 @@ alloc_fresh_bo(struct iris_bufmgr *bufmgr, uint64_t bo_size, bool local) bo->bufmgr = bufmgr; bo->size = bo_size; bo->idle = true; - bo->local = local; + bo->real.local = local; if (bufmgr->vram.size == 0) { /* Calling set_domain() will allocate pages for the BO outside of the @@ -608,6 +994,14 @@ iris_bo_alloc(struct iris_bufmgr *bufmgr, !(flags & BO_ALLOC_COHERENT || flags & BO_ALLOC_SMEM); struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size, local); + if (memzone != IRIS_MEMZONE_OTHER || (flags & BO_ALLOC_COHERENT)) + flags |= BO_ALLOC_NO_SUBALLOC; + + bo = alloc_bo_from_slabs(bufmgr, name, size, alignment, flags, local); + + if (bo) + return bo; + /* Round the size up to the bucket size, or if we don't have caching * at this size, a multiple of the page size. */ @@ -617,8 +1011,9 @@ iris_bo_alloc(struct iris_bufmgr *bufmgr, bool is_coherent = bufmgr->has_llc || (bufmgr->vram.size > 0 && !local) || (flags & BO_ALLOC_COHERENT); + bool is_scanout = (flags & BO_ALLOC_SCANOUT) != 0; enum iris_mmap_mode mmap_mode = - !local && is_coherent ? IRIS_MMAP_WB : IRIS_MMAP_WC; + !local && is_coherent && !is_scanout ? IRIS_MMAP_WB : IRIS_MMAP_WC; simple_mtx_lock(&bufmgr->lock); @@ -653,18 +1048,18 @@ iris_bo_alloc(struct iris_bufmgr *bufmgr, bo->name = name; p_atomic_set(&bo->refcount, 1); - bo->reusable = bucket && bufmgr->bo_reuse; + bo->real.reusable = bucket && bufmgr->bo_reuse; bo->index = -1; - bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; + bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; /* By default, capture all driver-internal buffers like shader kernels, * surface states, dynamic states, border colors, and so on. 
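 * (EXEC_OBJECT_CAPTURE asks the kernel to include them in GPU hang
 * error dumps.)
 *
 * One note on the BO_ALLOC_ZEROED handling in alloc_bo_from_slabs()
 * above: a recycled slab entry still holds its previous user's data,
 * unlike a fresh GEM object, which the kernel hands back zeroed; hence
 * the explicit MAP_RAW map-and-memset, with the entry returned to the
 * pool on map failure so the caller falls through to a fresh,
 * kernel-zeroed allocation.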
*/ if (memzone < IRIS_MEMZONE_OTHER) - bo->kflags |= EXEC_OBJECT_CAPTURE; + bo->real.kflags |= EXEC_OBJECT_CAPTURE; - assert(bo->map == NULL || bo->mmap_mode == mmap_mode); - bo->mmap_mode = mmap_mode; + assert(bo->real.map == NULL || bo->real.mmap_mode == mmap_mode); + bo->real.mmap_mode = mmap_mode; /* On integrated GPUs, enable snooping to ensure coherency if needed. * For discrete, we instead use SMEM and avoid WB maps for coherency. @@ -678,11 +1073,11 @@ iris_bo_alloc(struct iris_bufmgr *bufmgr, if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) != 0) goto err_free; - bo->reusable = false; + bo->real.reusable = false; } DBG("bo_create: buf %d (%s) (%s memzone) (%s) %llub\n", bo->gem_handle, - bo->name, memzone_name(memzone), bo->local ? "local" : "system", + bo->name, memzone_name(memzone), bo->real.local ? "local" : "system", (unsigned long long) size); return bo; @@ -725,10 +1120,10 @@ iris_bo_create_userptr(struct iris_bufmgr *bufmgr, const char *name, bo->name = name; bo->size = size; - bo->map = ptr; + bo->real.map = ptr; bo->bufmgr = bufmgr; - bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; + bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; simple_mtx_lock(&bufmgr->lock); bo->address = vma_alloc(bufmgr, memzone, size, 1); @@ -738,10 +1133,10 @@ iris_bo_create_userptr(struct iris_bufmgr *bufmgr, const char *name, goto err_close; p_atomic_set(&bo->refcount, 1); - bo->userptr = true; + bo->real.userptr = true; bo->index = -1; bo->idle = true; - bo->mmap_mode = IRIS_MMAP_WB; + bo->real.mmap_mode = IRIS_MMAP_WB; return bo; @@ -802,15 +1197,15 @@ iris_bo_gem_create_from_name(struct iris_bufmgr *bufmgr, bo->bufmgr = bufmgr; bo->gem_handle = open_arg.handle; bo->name = name; - bo->global_name = handle; - bo->reusable = false; - bo->imported = true; - bo->mmap_mode = IRIS_MMAP_NONE; - bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; + bo->real.global_name = handle; + bo->real.reusable = false; + bo->real.imported = true; + bo->real.mmap_mode = IRIS_MMAP_NONE; + bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; bo->address = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, bo->size, 1); _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo); - _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); + _mesa_hash_table_insert(bufmgr->name_table, &bo->real.global_name, bo); DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name); @@ -824,18 +1219,21 @@ bo_close(struct iris_bo *bo) { struct iris_bufmgr *bufmgr = bo->bufmgr; + assert(iris_bo_is_real(bo)); + if (iris_bo_is_external(bo)) { struct hash_entry *entry; - if (bo->global_name) { - entry = _mesa_hash_table_search(bufmgr->name_table, &bo->global_name); + if (bo->real.global_name) { + entry = _mesa_hash_table_search(bufmgr->name_table, + &bo->real.global_name); _mesa_hash_table_remove(bufmgr->name_table, entry); } entry = _mesa_hash_table_search(bufmgr->handle_table, &bo->gem_handle); _mesa_hash_table_remove(bufmgr->handle_table, entry); - list_for_each_entry_safe(struct bo_export, export, &bo->exports, link) { + list_for_each_entry_safe(struct bo_export, export, &bo->real.exports, link) { struct drm_gem_close close = { .handle = export->gem_handle }; intel_ioctl(export->drm_fd, DRM_IOCTL_GEM_CLOSE, &close); @@ -843,7 +1241,7 @@ bo_close(struct iris_bo *bo) free(export); } } else { - assert(list_is_empty(&bo->exports)); + assert(list_is_empty(&bo->real.exports)); } /* Close this object */ @@ -862,6 +1260,14 @@ 
bo_close(struct iris_bo *bo) /* Return the VMA for reuse */ vma_free(bo->bufmgr, bo->address, bo->size); + for (int d = 0; d < bo->deps_size; d++) { + for (int b = 0; b < IRIS_BATCH_COUNT; b++) { + iris_syncobj_reference(bufmgr, &bo->deps[d].write_syncobjs[b], NULL); + iris_syncobj_reference(bufmgr, &bo->deps[d].read_syncobjs[b], NULL); + } + } + free(bo->deps); + free(bo); } @@ -870,7 +1276,9 @@ bo_free(struct iris_bo *bo) { struct iris_bufmgr *bufmgr = bo->bufmgr; - if (!bo->userptr && bo->map) + assert(iris_bo_is_real(bo)); + + if (!bo->real.userptr && bo->real.map) bo_unmap(bo); if (bo->idle) { @@ -896,7 +1304,7 @@ cleanup_bo_cache(struct iris_bufmgr *bufmgr, time_t time) struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[i]; list_for_each_entry_safe(struct iris_bo, bo, &bucket->head, head) { - if (time - bo->free_time <= 1) + if (time - bo->real.free_time <= 1) break; list_del(&bo->head); @@ -909,7 +1317,7 @@ cleanup_bo_cache(struct iris_bufmgr *bufmgr, time_t time) struct bo_cache_bucket *bucket = &bufmgr->local_cache_bucket[i]; list_for_each_entry_safe(struct iris_bo, bo, &bucket->head, head) { - if (time - bo->free_time <= 1) + if (time - bo->real.free_time <= 1) break; list_del(&bo->head); @@ -940,12 +1348,14 @@ bo_unreference_final(struct iris_bo *bo, time_t time) DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name); + assert(iris_bo_is_real(bo)); + bucket = NULL; - if (bo->reusable) - bucket = bucket_for_size(bufmgr, bo->size, bo->local); + if (bo->real.reusable) + bucket = bucket_for_size(bufmgr, bo->size, bo->real.local); /* Put the buffer into our internal cache for reuse if we can. */ if (bucket && iris_bo_madvise(bo, I915_MADV_DONTNEED)) { - bo->free_time = time; + bo->real.free_time = time; bo->name = NULL; list_addtail(&bo->head, &bucket->head); @@ -968,14 +1378,18 @@ iris_bo_unreference(struct iris_bo *bo) clock_gettime(CLOCK_MONOTONIC, &time); - simple_mtx_lock(&bufmgr->lock); + if (bo->gem_handle == 0) { + pb_slab_free(get_slabs(bufmgr, bo->size), &bo->slab.entry); + } else { + simple_mtx_lock(&bufmgr->lock); - if (p_atomic_dec_zero(&bo->refcount)) { - bo_unreference_final(bo, time.tv_sec); - cleanup_bo_cache(bufmgr, time.tv_sec); + if (p_atomic_dec_zero(&bo->refcount)) { + bo_unreference_final(bo, time.tv_sec); + cleanup_bo_cache(bufmgr, time.tv_sec); + } + + simple_mtx_unlock(&bufmgr->lock); } - - simple_mtx_unlock(&bufmgr->lock); } } @@ -1022,12 +1436,14 @@ iris_bo_gem_mmap_legacy(struct pipe_debug_callback *dbg, struct iris_bo *bo) struct iris_bufmgr *bufmgr = bo->bufmgr; assert(bufmgr->vram.size == 0); - assert(bo->mmap_mode == IRIS_MMAP_WB || bo->mmap_mode == IRIS_MMAP_WC); + assert(iris_bo_is_real(bo)); + assert(bo->real.mmap_mode == IRIS_MMAP_WB || + bo->real.mmap_mode == IRIS_MMAP_WC); struct drm_i915_gem_mmap mmap_arg = { .handle = bo->gem_handle, .size = bo->size, - .flags = bo->mmap_mode == IRIS_MMAP_WC ? I915_MMAP_WC : 0, + .flags = bo->real.mmap_mode == IRIS_MMAP_WC ? I915_MMAP_WC : 0, }; int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg); @@ -1046,6 +1462,8 @@ iris_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct iris_bo *bo) { struct iris_bufmgr *bufmgr = bo->bufmgr; + assert(iris_bo_is_real(bo)); + struct drm_i915_gem_mmap_offset mmap_arg = { .handle = bo->gem_handle, }; @@ -1061,10 +1479,10 @@ iris_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct iris_bo *bo) * across PCIe, it's always snooped. The only caching mode allowed by * DG1 hardware for LMEM is WC. 
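 * (snooping is what keeps WB CPU maps coherent without explicit
 * cache flushes).
 *
 * Also worth noting in cleanup_bo_cache() above: each bucket scan
 * stops at the first BO freed within the last second, so a cached BO
 * survives roughly one second of disuse before its pages go back to
 * the kernel; buckets are kept in free order, so everything past the
 * first young entry is younger still.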
*/ - if (bo->local) - assert(bo->mmap_mode == IRIS_MMAP_WC); + if (bo->real.local) + assert(bo->real.mmap_mode == IRIS_MMAP_WC); else - assert(bo->mmap_mode == IRIS_MMAP_WB); + assert(bo->real.mmap_mode == IRIS_MMAP_WB); mmap_arg.flags = I915_MMAP_OFFSET_FIXED; } else { @@ -1074,9 +1492,9 @@ iris_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct iris_bo *bo) [IRIS_MMAP_WC] = I915_MMAP_OFFSET_WC, [IRIS_MMAP_WB] = I915_MMAP_OFFSET_WB, }; - assert(bo->mmap_mode != IRIS_MMAP_NONE); - assert(bo->mmap_mode < ARRAY_SIZE(mmap_offset_for_mode)); - mmap_arg.flags = mmap_offset_for_mode[bo->mmap_mode]; + assert(bo->real.mmap_mode != IRIS_MMAP_NONE); + assert(bo->real.mmap_mode < ARRAY_SIZE(mmap_offset_for_mode)); + mmap_arg.flags = mmap_offset_for_mode[bo->real.mmap_mode]; } /* Get the fake offset back */ @@ -1104,36 +1522,45 @@ iris_bo_map(struct pipe_debug_callback *dbg, struct iris_bo *bo, unsigned flags) { struct iris_bufmgr *bufmgr = bo->bufmgr; + void *map = NULL; - assert(bo->mmap_mode != IRIS_MMAP_NONE); - if (bo->mmap_mode == IRIS_MMAP_NONE) - return NULL; - - if (!bo->map) { - DBG("iris_bo_map: %d (%s)\n", bo->gem_handle, bo->name); - void *map = bufmgr->has_mmap_offset ? iris_bo_gem_mmap_offset(dbg, bo) - : iris_bo_gem_mmap_legacy(dbg, bo); - if (!map) { + if (bo->gem_handle == 0) { + struct iris_bo *real = iris_get_backing_bo(bo); + uint64_t offset = bo->address - real->address; + map = iris_bo_map(dbg, real, flags | MAP_ASYNC) + offset; + } else { + assert(bo->real.mmap_mode != IRIS_MMAP_NONE); + if (bo->real.mmap_mode == IRIS_MMAP_NONE) return NULL; - } - VG_DEFINED(map, bo->size); + if (!bo->real.map) { + DBG("iris_bo_map: %d (%s)\n", bo->gem_handle, bo->name); + map = bufmgr->has_mmap_offset ? iris_bo_gem_mmap_offset(dbg, bo) + : iris_bo_gem_mmap_legacy(dbg, bo); + if (!map) { + return NULL; + } - if (p_atomic_cmpxchg(&bo->map, NULL, map)) { - VG_NOACCESS(map, bo->size); - os_munmap(map, bo->size); + VG_DEFINED(map, bo->size); + + if (p_atomic_cmpxchg(&bo->real.map, NULL, map)) { + VG_NOACCESS(map, bo->size); + os_munmap(map, bo->size); + } } + assert(bo->real.map); + map = bo->real.map; } - assert(bo->map); - DBG("iris_bo_map: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map); + DBG("iris_bo_map: %d (%s) -> %p\n", + bo->gem_handle, bo->name, bo->real.map); print_flags(flags); if (!(flags & MAP_ASYNC)) { bo_wait_with_stall_warning(dbg, bo, "memory mapping"); } - return bo->map; + return map; } /** Waits for all GPU rendering with the object to have completed. */ @@ -1146,6 +1573,24 @@ iris_bo_wait_rendering(struct iris_bo *bo) iris_bo_wait(bo, -1); } +static int +iris_bo_wait_gem(struct iris_bo *bo, int64_t timeout_ns) +{ + assert(iris_bo_is_real(bo)); + + struct iris_bufmgr *bufmgr = bo->bufmgr; + struct drm_i915_gem_wait wait = { + .bo_handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + + int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); + if (ret != 0) + return -errno; + + return 0; +} + /** * Waits on a BO for the given amount of time. 
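 *
 * A note on the suballocated path in iris_bo_map() above: a slab
 * wrapper shares its backing BO's CPU mapping, offset by the
 * difference of their GPU addresses. With illustrative numbers, a
 * wrapper at address 0x140000 inside a backing BO at 0x100000 maps at
 * backing_map + 0x40000; MAP_ASYNC is added on the inner call so only
 * the outer one performs the stall check.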
* @@ -1176,17 +1621,13 @@ iris_bo_wait_rendering(struct iris_bo *bo) int iris_bo_wait(struct iris_bo *bo, int64_t timeout_ns) { - struct iris_bufmgr *bufmgr = bo->bufmgr; + int ret; - /* If we know it's idle, don't bother with the kernel round trip */ - if (bo->idle && !iris_bo_is_external(bo)) - return 0; + if (iris_bo_is_external(bo)) + ret = iris_bo_wait_gem(bo, timeout_ns); + else + ret = iris_bo_wait_syncobj(bo, timeout_ns); - struct drm_i915_gem_wait wait = { - .bo_handle = bo->gem_handle, - .timeout_ns = timeout_ns, - }; - int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait); if (ret != 0) return -errno; @@ -1204,7 +1645,13 @@ iris_bufmgr_destroy(struct iris_bufmgr *bufmgr) /* bufmgr will no longer try to free VMA entries in the aux-map */ bufmgr->aux_map_ctx = NULL; + for (int i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + if (bufmgr->bo_slabs[i].groups) + pb_slabs_deinit(&bufmgr->bo_slabs[i]); + } + simple_mtx_destroy(&bufmgr->lock); + simple_mtx_destroy(&bufmgr->bo_deps_lock); /* Free any cached buffer objects we were going to reuse */ for (int i = 0; i < bufmgr->num_buckets; i++) { @@ -1333,10 +1780,10 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd) bo->bufmgr = bufmgr; bo->name = "prime"; - bo->reusable = false; - bo->imported = true; - bo->mmap_mode = IRIS_MMAP_NONE; - bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; + bo->real.reusable = false; + bo->real.imported = true; + bo->real.mmap_mode = IRIS_MMAP_NONE; + bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED; /* From the Bspec, Memory Compression - Gfx12: * @@ -1362,16 +1809,19 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd) static void iris_bo_mark_exported_locked(struct iris_bo *bo) { + /* We cannot export suballocated BOs. */ + assert(iris_bo_is_real(bo)); + if (!iris_bo_is_external(bo)) _mesa_hash_table_insert(bo->bufmgr->handle_table, &bo->gem_handle, bo); - if (!bo->exported) { + if (!bo->real.exported) { /* If a BO is going to be used externally, it could be sent to the * display HW. So make sure our CPU mappings don't assume cache * coherency since display is outside that cache. */ - bo->exported = true; - bo->reusable = false; + bo->real.exported = true; + bo->real.reusable = false; } } @@ -1380,8 +1830,11 @@ iris_bo_mark_exported(struct iris_bo *bo) { struct iris_bufmgr *bufmgr = bo->bufmgr; - if (bo->exported) { - assert(!bo->reusable); + /* We cannot export suballocated BOs. */ + assert(iris_bo_is_real(bo)); + + if (bo->real.exported) { + assert(!bo->real.reusable); return; } @@ -1395,6 +1848,9 @@ iris_bo_export_dmabuf(struct iris_bo *bo, int *prime_fd) { struct iris_bufmgr *bufmgr = bo->bufmgr; + /* We cannot export suballocated BOs. */ + assert(iris_bo_is_real(bo)); + iris_bo_mark_exported(bo); if (drmPrimeHandleToFD(bufmgr->fd, bo->gem_handle, @@ -1407,6 +1863,9 @@ iris_bo_export_dmabuf(struct iris_bo *bo, int *prime_fd) uint32_t iris_bo_export_gem_handle(struct iris_bo *bo) { + /* We cannot export suballocated BOs. */ + assert(iris_bo_is_real(bo)); + iris_bo_mark_exported(bo); return bo->gem_handle; @@ -1417,22 +1876,25 @@ iris_bo_flink(struct iris_bo *bo, uint32_t *name) { struct iris_bufmgr *bufmgr = bo->bufmgr; - if (!bo->global_name) { + /* We cannot export suballocated BOs. 
*/ + assert(iris_bo_is_real(bo)); + + if (!bo->real.global_name) { struct drm_gem_flink flink = { .handle = bo->gem_handle }; if (intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink)) return -errno; simple_mtx_lock(&bufmgr->lock); - if (!bo->global_name) { + if (!bo->real.global_name) { iris_bo_mark_exported_locked(bo); - bo->global_name = flink.name; - _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo); + bo->real.global_name = flink.name; + _mesa_hash_table_insert(bufmgr->name_table, &bo->real.global_name, bo); } simple_mtx_unlock(&bufmgr->lock); } - *name = bo->global_name; + *name = bo->real.global_name; return 0; } @@ -1440,6 +1902,9 @@ int iris_bo_export_gem_handle_for_device(struct iris_bo *bo, int drm_fd, uint32_t *out_handle) { + /* We cannot export suballocated BOs. */ + assert(iris_bo_is_real(bo)); + /* Only add the new GEM handle to the list of export if it belongs to a * different GEM device. Otherwise we might close the same buffer multiple * times. @@ -1477,7 +1942,7 @@ iris_bo_export_gem_handle_for_device(struct iris_bo *bo, int drm_fd, } bool found = false; - list_for_each_entry(struct bo_export, iter, &bo->exports, link) { + list_for_each_entry(struct bo_export, iter, &bo->real.exports, link) { if (iter->drm_fd != drm_fd) continue; /* Here we assume that for a given DRM fd, we'll always get back the @@ -1490,7 +1955,7 @@ iris_bo_export_gem_handle_for_device(struct iris_bo *bo, int drm_fd, break; } if (!found) - list_addtail(&export->link, &bo->exports); + list_addtail(&export->link, &bo->real.exports); simple_mtx_unlock(&bufmgr->lock); @@ -1674,9 +2139,23 @@ intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size) struct iris_bufmgr *bufmgr = (struct iris_bufmgr *)driver_ctx; - struct iris_bo *bo = - iris_bo_alloc(bufmgr, "aux-map", size, 64 * 1024, - IRIS_MEMZONE_OTHER, 0); + bool local = bufmgr->vram.size > 0; + unsigned int page_size = getpagesize(); + size = MAX2(ALIGN(size, page_size), page_size); + + struct iris_bo *bo = alloc_fresh_bo(bufmgr, size, local); + + simple_mtx_lock(&bufmgr->lock); + bo->address = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, bo->size, 64 * 1024); + assert(bo->address != 0ull); + simple_mtx_unlock(&bufmgr->lock); + + bo->name = "aux-map"; + p_atomic_set(&bo->refcount, 1); + bo->index = -1; + bo->real.kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS | EXEC_OBJECT_PINNED | + EXEC_OBJECT_CAPTURE; + bo->real.mmap_mode = local ? IRIS_MMAP_WC : IRIS_MMAP_WB; buf->driver_bo = bo; buf->gpu = bo->address; @@ -1769,6 +2248,7 @@ iris_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse) p_atomic_set(&bufmgr->refcount, 1); simple_mtx_init(&bufmgr->lock, mtx_plain); + simple_mtx_init(&bufmgr->bo_deps_lock, mtx_plain); list_inithead(&bufmgr->zombie_list); @@ -1817,6 +2297,28 @@ iris_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse) init_cache_buckets(bufmgr, false); init_cache_buckets(bufmgr, true); + unsigned min_slab_order = 8; /* 256 bytes */ + unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */ + unsigned num_slab_orders_per_allocator = + (max_slab_order - min_slab_order) / NUM_SLAB_ALLOCATORS; + + /* Divide the size order range among slab managers. 
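 * so each pb_slabs instance only manages a narrow band of entry sizes.
 *
 * Worked through for three allocators (an illustrative count): bands
 * of (20 - 8) / 3 = 4 orders give 8..12, 13..17 and 18..20, i.e.
 * entries of 256 B-4 KB, 8 KB-128 KB and 256 KB-1 MB, with the last
 * band's slabs sized up to the 2 MB PTE fragment in iris_slab_alloc().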
*/ + for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { + unsigned min_order = min_slab_order; + unsigned max_order = + MIN2(min_order + num_slab_orders_per_allocator, max_slab_order); + + if (!pb_slabs_init(&bufmgr->bo_slabs[i], min_order, max_order, + IRIS_HEAP_MAX, true, bufmgr, + iris_can_reclaim_slab, + iris_slab_alloc, + (void *) iris_slab_free)) { + free(bufmgr); + return NULL; + } + min_slab_order = max_order + 1; + } + bufmgr->name_table = _mesa_hash_table_create(NULL, _mesa_hash_uint, _mesa_key_uint_equal); bufmgr->handle_table = @@ -1851,6 +2353,13 @@ iris_bufmgr_unref(struct iris_bufmgr *bufmgr) simple_mtx_unlock(&global_bufmgr_list_mutex); } +/** Returns a new unique id, to be used by screens. */ +int +iris_bufmgr_create_screen_id(struct iris_bufmgr *bufmgr) +{ + return p_atomic_inc_return(&bufmgr->next_screen_id) - 1; +} + /** * Gets an already existing GEM buffer manager or create a new one. * @@ -1900,3 +2409,9 @@ iris_bufmgr_get_aux_map_context(struct iris_bufmgr *bufmgr) { return bufmgr->aux_map_ctx; } + +simple_mtx_t * +iris_bufmgr_get_bo_deps_lock(struct iris_bufmgr *bufmgr) +{ + return &bufmgr->bo_deps_lock; +} diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.h b/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.h index 1f43456bae..010ce42431 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.h +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_bufmgr.h @@ -31,13 +31,16 @@ #include "c11/threads.h" #include "util/macros.h" #include "util/u_atomic.h" +#include "util/u_dynarray.h" #include "util/list.h" +#include "util/simple_mtx.h" #include "pipe/p_defines.h" +#include "pipebuffer/pb_slab.h" -struct iris_batch; struct intel_device_info; struct pipe_debug_callback; struct isl_surf; +struct iris_syncobj; /** * Memory zones. When allocating a buffer, you can request that it is @@ -103,8 +106,12 @@ enum iris_domain { IRIS_DOMAIN_RENDER_WRITE = 0, /** (Hi)Z/stencil cache. */ IRIS_DOMAIN_DEPTH_WRITE, + /** Data port (HDC) cache. */ + IRIS_DOMAIN_DATA_WRITE, /** Any other read-write cache. */ IRIS_DOMAIN_OTHER_WRITE, + /** Vertex cache. */ + IRIS_DOMAIN_VF_READ, /** Any other read-only cache. */ IRIS_DOMAIN_OTHER_READ, /** Number of caching domains. */ @@ -119,7 +126,8 @@ enum iris_domain { static inline bool iris_domain_is_read_only(enum iris_domain access) { - return access == IRIS_DOMAIN_OTHER_READ; + return access == IRIS_DOMAIN_OTHER_READ || + access == IRIS_DOMAIN_VF_READ; } enum iris_mmap_mode { @@ -129,6 +137,13 @@ enum iris_mmap_mode { IRIS_MMAP_WB, /**< Write-back mapping with CPU caches enabled */ }; +#define IRIS_BATCH_COUNT 2 + +struct iris_bo_screen_deps { + struct iris_syncobj *write_syncobjs[IRIS_BATCH_COUNT]; + struct iris_syncobj *read_syncobjs[IRIS_BATCH_COUNT]; +}; + struct iris_bo { /** * Size in bytes of the buffer object. @@ -162,12 +177,13 @@ struct iris_bo { uint64_t aux_map_address; /** - * The validation list index for this buffer, or -1 when not in a batch. - * Note that a single buffer may be in multiple batches (contexts), and - * this is a global field, which refers to the last batch using the BO. - * It should not be considered authoritative, but can be used to avoid a - * linear walk of the validation list in the common case by guessing that - * exec_bos[bo->index] == bo and confirming whether that's the case. + * If this BO is referenced by a batch, this _may_ be the index into the + * batch->exec_bos[] list. 
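 * (a guess that is cheap to verify and harmless when wrong).
 *
 * An aside on iris_bufmgr_create_screen_id() above:
 * p_atomic_inc_return() yields the post-increment value, so
 * subtracting one hands out ids 0, 1, 2, ... from the zero-initialized
 * counter; each screen then uses its id to index bo->deps[]. A sketch
 * with a hypothetical standalone counter:
 */

#if 0 /* illustrative sketch, never compiled */
static int next_id;   /* zero-initialized, like bufmgr->next_screen_id */

static int
sketch_new_screen_id(void)
{
   /* Thread-safe: returns 0, then 1, 2, ... with no lock needed. */
   return p_atomic_inc_return(&next_id) - 1;
}
#endif

/* The index caveats continue: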
+ * + * Note that a single buffer may be used by multiple batches/contexts, + * and thus appear in multiple lists, but we only track one index here. + * In the common case one can guess that batch->exec_bos[bo->index] == bo + * and double check if that's true to avoid a linear list walk. * * XXX: this is not ideal now that we have more than one batch per context, * XXX: as the index will flop back and forth between the render index and @@ -178,29 +194,9 @@ struct iris_bo { int refcount; const char *name; - uint64_t kflags; - - /** - * Kernel-assigned global name for this object - * - * List contains both flink named and prime fd'd objects - */ - unsigned global_name; - - /** The mmap coherency mode selected at BO allocation time */ - enum iris_mmap_mode mmap_mode; - - time_t free_time; - - /** Mapped address for the buffer, saved across map/unmap cycles */ - void *map; - /** BO cache list */ struct list_head head; - /** List of GEM handle exports of this buffer (bo_export) */ - struct list_head exports; - /** * Synchronization sequence number of most recent access of this BO from * each caching domain. @@ -213,6 +209,10 @@ struct iris_bo { */ uint64_t last_seqnos[NUM_IRIS_DOMAINS] __attribute__ ((aligned (8))); + /** Up to one per screen, may need realloc. */ + struct iris_bo_screen_deps *deps; + int deps_size; + /** * Boolean of whether the GPU is definitely not accessing the buffer. * @@ -222,31 +222,55 @@ struct iris_bo { */ bool idle; - /** - * Boolean of whether this buffer can be re-used - */ - bool reusable; + union { + struct { + uint64_t kflags; - /** Was this buffer imported from an external client? */ - bool imported; + time_t free_time; - /** Has this buffer been exported to external clients? */ - bool exported; + /** Mapped address for the buffer, saved across map/unmap cycles */ + void *map; - /** - * Boolean of whether this buffer points into user memory - */ - bool userptr; + /** List of GEM handle exports of this buffer (bo_export) */ + struct list_head exports; - /** - * Boolean of whether this was allocated from local memory - */ - bool local; + /** + * Kernel-assigned global name for this object + * + * List contains both flink named and prime fd'd objects + */ + unsigned global_name; + + /** The mmap coherency mode selected at BO allocation time */ + enum iris_mmap_mode mmap_mode; + + /** Was this buffer imported from an external client? */ + bool imported; + + /** Has this buffer been exported to external clients? */ + bool exported; + + /** Boolean of whether this buffer can be re-used */ + bool reusable; + + /** Boolean of whether this buffer points into user memory */ + bool userptr; + + /** Boolean of whether this was allocated from local memory */ + bool local; + } real; + struct { + struct pb_slab_entry entry; + struct iris_bo *real; + } slab; + }; }; -#define BO_ALLOC_ZEROED (1<<0) -#define BO_ALLOC_COHERENT (1<<1) -#define BO_ALLOC_SMEM (1<<2) +#define BO_ALLOC_ZEROED (1<<0) +#define BO_ALLOC_COHERENT (1<<1) +#define BO_ALLOC_SMEM (1<<2) +#define BO_ALLOC_SCANOUT (1<<3) +#define BO_ALLOC_NO_SUBALLOC (1<<4) /** * Allocate a buffer object. @@ -330,13 +354,61 @@ void iris_bufmgr_unref(struct iris_bufmgr *bufmgr); */ int iris_bo_flink(struct iris_bo *bo, uint32_t *name); +/** + * Returns true if the BO is backed by a real GEM object, false if it's + * a wrapper that's suballocated from a larger BO. 
+ */ +static inline bool +iris_bo_is_real(struct iris_bo *bo) +{ + return bo->gem_handle != 0; +} + +/** + * Unwrap any slab-allocated wrapper BOs to get the BO for the underlying + * backing storage, which is a real BO associated with a GEM object. + */ +static inline struct iris_bo * +iris_get_backing_bo(struct iris_bo *bo) +{ + if (!iris_bo_is_real(bo)) + bo = bo->slab.real; + + /* We only allow one level of wrapping. */ + assert(iris_bo_is_real(bo)); + + return bo; +} + /** * Is this buffer shared with external clients (imported or exported)? */ static inline bool iris_bo_is_external(const struct iris_bo *bo) { - return bo->exported || bo->imported; + bo = iris_get_backing_bo((struct iris_bo *) bo); + return bo->real.exported || bo->real.imported; +} + +static inline bool +iris_bo_is_imported(const struct iris_bo *bo) +{ + bo = iris_get_backing_bo((struct iris_bo *) bo); + return bo->real.imported; +} + +static inline bool +iris_bo_is_exported(const struct iris_bo *bo) +{ + bo = iris_get_backing_bo((struct iris_bo *) bo); + return bo->real.exported; +} + +static inline enum iris_mmap_mode +iris_bo_mmap_mode(const struct iris_bo *bo) +{ + bo = iris_get_backing_bo((struct iris_bo *) bo); + return bo->real.mmap_mode; } /** @@ -345,10 +417,10 @@ iris_bo_is_external(const struct iris_bo *bo) void iris_bo_mark_exported(struct iris_bo *bo); /** - * Returns 1 if mapping the buffer for write could cause the process + * Returns true if mapping the buffer for write could cause the process * to block, due to the object being active in the GPU. */ -int iris_bo_busy(struct iris_bo *bo); +bool iris_bo_busy(struct iris_bo *bo); /** * Specify the volatility of the buffer. @@ -364,7 +436,6 @@ int iris_bo_busy(struct iris_bo *bo); */ int iris_bo_madvise(struct iris_bo *bo, int madv); -/* drm_bacon_bufmgr_gem.c */ struct iris_bufmgr *iris_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, bool bo_reuse); int iris_bufmgr_get_fd(struct iris_bufmgr *bufmgr); @@ -448,4 +519,8 @@ iris_bo_bump_seqno(struct iris_bo *bo, uint64_t seqno, enum iris_memory_zone iris_memzone_for_address(uint64_t address); +int iris_bufmgr_create_screen_id(struct iris_bufmgr *bufmgr); + +simple_mtx_t *iris_bufmgr_get_bo_deps_lock(struct iris_bufmgr *bufmgr); + #endif /* IRIS_BUFMGR_H */ diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_clear.c b/mesa 3D driver/src/gallium/drivers/iris/iris_clear.c index dd0e7db53e..854b1b8c88 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_clear.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_clear.c @@ -74,7 +74,7 @@ can_fast_clear_color(struct iris_context *ice, { struct iris_resource *res = (void *) p_res; - if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR)) return false; if (!isl_aux_usage_has_fast_clears(res->aux.usage)) @@ -375,7 +375,6 @@ clear_color(struct iris_context *ice, return; } - bool color_write_disable[4] = { false, false, false, false }; enum isl_aux_usage aux_usage = iris_resource_render_aux_usage(ice, res, level, format, false); @@ -399,7 +398,7 @@ clear_color(struct iris_context *ice, blorp_clear(&blorp_batch, &surf, format, swizzle, level, box->z, box->depth, box->x, box->y, box->x + box->width, box->y + box->height, - color, color_write_disable); + color, 0 /* color_write_disable */); blorp_batch_finish(&blorp_batch); iris_batch_sync_region_end(batch); @@ -425,7 +424,7 @@ can_fast_clear_depth(struct iris_context *ice, struct iris_screen *screen = (void *) ctx->screen; const struct intel_device_info *devinfo = 
&screen->devinfo; - if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR) + if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR)) return false; /* Check for partial clears */ @@ -502,8 +501,6 @@ fast_clear_depth(struct iris_context *ice, ISL_AUX_OP_FULL_RESOLVE, false); iris_resource_set_aux_state(ice, res, res_level, layer, 1, ISL_AUX_STATE_RESOLVED); - iris_emit_pipe_control_flush(batch, "hiz op: post depth resolve", - PIPE_CONTROL_TILE_CACHE_FLUSH); } } const union isl_color_value clear_value = { .f32 = {depth, } }; @@ -511,6 +508,23 @@ fast_clear_depth(struct iris_context *ice, update_clear_depth = true; } + if (res->aux.usage == ISL_AUX_USAGE_HIZ_CCS_WT) { + /* From Bspec 47010 (Depth Buffer Clear): + * + * Since the fast clear cycles to CCS are not cached in TileCache, + * any previous depth buffer writes to overlapping pixels must be + * flushed out of TileCache before a succeeding Depth Buffer Clear. + * This restriction only applies to Depth Buffer with write-thru + * enabled, since fast clears to CCS only occur for write-thru mode. + * + * There may have been a write to this depth buffer. Flush it from the + * tile cache just in case. + */ + iris_emit_pipe_control_flush(batch, "hiz_ccs_wt: before fast clear", + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_TILE_CACHE_FLUSH); + } + for (unsigned l = 0; l < box->depth; l++) { enum isl_aux_state aux_state = iris_resource_get_aux_state(res, level, box->z + l); @@ -618,8 +632,7 @@ clear_depth_stencil(struct iris_context *ice, blorp_batch_finish(&blorp_batch); iris_batch_sync_region_end(batch); - iris_flush_and_dirty_for_history(ice, batch, res, - PIPE_CONTROL_TILE_CACHE_FLUSH, + iris_flush_and_dirty_for_history(ice, batch, res, 0, "cache history: post slow ZS clear"); if (clear_depth && z_res) { @@ -704,12 +717,8 @@ iris_clear_texture(struct pipe_context *ctx, { struct iris_context *ice = (void *) ctx; struct iris_screen *screen = (void *) ctx->screen; - struct iris_resource *res = (void *) p_res; const struct intel_device_info *devinfo = &screen->devinfo; - if (iris_resource_unfinished_aux_import(res)) - iris_resource_finish_aux_import(ctx->screen, res); - if (util_format_is_depth_or_stencil(p_res->format)) { const struct util_format_unpack_description *unpack = util_format_unpack_description(p_res->format); diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_context.c b/mesa 3D driver/src/gallium/drivers/iris/iris_context.c index 5ce69c310c..2b3e312c9b 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_context.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_context.c @@ -212,7 +212,7 @@ iris_flush_dirty_dmabufs(struct iris_context *ice) /** * Destroy a context, freeing any associated memory. */ -static void +void iris_destroy_context(struct pipe_context *ctx) { struct iris_context *ice = (struct iris_context *)ctx; @@ -359,7 +359,7 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags) if (flags & PIPE_CONTEXT_LOW_PRIORITY) priority = INTEL_CONTEXT_LOW_PRIORITY; - if (INTEL_DEBUG & DEBUG_BATCH) + if (INTEL_DEBUG(DEBUG_BATCH)) ice->state.sizes = _mesa_hash_table_u64_create(ice); for (int i = 0; i < IRIS_BATCH_COUNT; i++) { @@ -379,7 +379,5 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags) return threaded_context_create(ctx, &screen->transfer_pool, iris_replace_buffer_storage, NULL, /* TODO: asynchronous flushes? 
*/ - NULL, - false, &ice->thrctx); } diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_context.h b/mesa 3D driver/src/gallium/drivers/iris/iris_context.h index 60d2cd2fcf..ef0e466849 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_context.h +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_context.h @@ -112,8 +112,13 @@ enum { #define IRIS_DIRTY_DEPTH_BOUNDS (1ull << 29) #define IRIS_DIRTY_RENDER_BUFFER (1ull << 30) #define IRIS_DIRTY_STENCIL_REF (1ull << 31) +#define IRIS_DIRTY_VERTEX_BUFFER_FLUSHES (1ull << 32) +#define IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES (1ull << 33) +#define IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES (1ull << 34) +#define IRIS_DIRTY_VFG (1ull << 35) -#define IRIS_ALL_DIRTY_FOR_COMPUTE (IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) +#define IRIS_ALL_DIRTY_FOR_COMPUTE (IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES | \ + IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES) #define IRIS_ALL_DIRTY_FOR_RENDER (~IRIS_ALL_DIRTY_FOR_COMPUTE) @@ -150,6 +155,7 @@ enum { #define IRIS_STAGE_DIRTY_CONSTANTS_GS (1ull << 21) #define IRIS_STAGE_DIRTY_CONSTANTS_FS (1ull << 22) #define IRIS_STAGE_DIRTY_CONSTANTS_CS (1ull << 23) +#define IRIS_SHIFT_FOR_STAGE_DIRTY_BINDINGS 24 #define IRIS_STAGE_DIRTY_BINDINGS_VS (1ull << 24) #define IRIS_STAGE_DIRTY_BINDINGS_TCS (1ull << 25) #define IRIS_STAGE_DIRTY_BINDINGS_TES (1ull << 26) @@ -398,11 +404,6 @@ struct iris_uncompiled_shader { /** Have any shader variants been compiled yet? */ bool compiled_once; - /** Should we use ALT mode for math? Useful for ARB programs. */ - bool use_alt_mode; - - bool needs_edge_flag; - /* Whether shader uses atomic operations. */ bool uses_atomic_load_store; @@ -537,6 +538,7 @@ struct iris_shader_state { /** Bitfield of which constant buffers are bound (non-null). */ uint32_t bound_cbufs; + uint32_t dirty_cbufs; /** Bitfield of which image views are bound (non-null). */ uint32_t bound_image_views; @@ -721,6 +723,7 @@ struct iris_context { /** Bitfield of which vertex buffers are bound (non-null). */ uint64_t bound_vertex_buffers; + uint8_t patch_vertices; bool primitive_restart; unsigned cut_index; enum pipe_prim_type prim_mode:8; @@ -838,7 +841,7 @@ struct iris_context { }; #define perf_debug(dbg, ...) 
do { \ - if (INTEL_DEBUG & DEBUG_PERF) \ + if (INTEL_DEBUG(DEBUG_PERF)) \ dbg_printf(__VA_ARGS__); \ if (unlikely(dbg)) \ pipe_debug_message(dbg, PERF_INFO, __VA_ARGS__); \ @@ -846,6 +849,7 @@ struct iris_context { struct pipe_context * iris_create_context(struct pipe_screen *screen, void *priv, unsigned flags); +void iris_destroy_context(struct pipe_context *ctx); void iris_lost_context_state(struct iris_batch *batch); @@ -856,6 +860,7 @@ void iris_flush_dirty_dmabufs(struct iris_context *ice); void iris_init_blit_functions(struct pipe_context *ctx); void iris_init_clear_functions(struct pipe_context *ctx); void iris_init_program_functions(struct pipe_context *ctx); +void iris_init_screen_program_functions(struct pipe_screen *pscreen); void iris_init_resource_functions(struct pipe_context *ctx); void iris_init_perfquery_functions(struct pipe_context *ctx); void iris_update_compiled_shaders(struct iris_context *ice); @@ -1043,6 +1048,9 @@ void iris_predraw_resolve_inputs(struct iris_context *ice, void iris_predraw_resolve_framebuffer(struct iris_context *ice, struct iris_batch *batch, bool *draw_aux_buffer_disabled); +void iris_predraw_flush_buffers(struct iris_context *ice, + struct iris_batch *batch, + gl_shader_stage stage); void iris_postdraw_update_resolve_tracking(struct iris_context *ice, struct iris_batch *batch); void iris_cache_flush_for_render(struct iris_batch *batch, diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_disk_cache.c b/mesa 3D driver/src/gallium/drivers/iris/iris_disk_cache.c index a62cf5db61..2ad12002a6 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_disk_cache.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_disk_cache.c @@ -265,7 +265,7 @@ void iris_disk_cache_init(struct iris_screen *screen) { #ifdef ENABLE_SHADER_CACHE - if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK) + if (INTEL_DEBUG(DEBUG_DISK_CACHE_DISABLE_MASK)) return; /* array length = print length + nul char + 1 extra to verify it's unused */ diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_draw.c b/mesa 3D driver/src/gallium/drivers/iris/iris_draw.c index b12f710923..a788112232 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_draw.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_draw.c @@ -65,6 +65,7 @@ iris_update_draw_info(struct iris_context *ice, const struct pipe_draw_info *info) { struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; + const struct intel_device_info *devinfo = &screen->devinfo; const struct brw_compiler *compiler = screen->compiler; if (ice->state.prim_mode != info->mode) { @@ -81,8 +82,8 @@ iris_update_draw_info(struct iris_context *ice, } if (info->mode == PIPE_PRIM_PATCHES && - ice->state.vertices_per_patch != info->vertices_per_patch) { - ice->state.vertices_per_patch = info->vertices_per_patch; + ice->state.vertices_per_patch != ice->state.patch_vertices) { + ice->state.vertices_per_patch = ice->state.patch_vertices; ice->state.dirty |= IRIS_DIRTY_VF_TOPOLOGY; /* 8_PATCH TCS needs this for key->input_vertices */ @@ -105,8 +106,11 @@ iris_update_draw_info(struct iris_context *ice, if (ice->state.primitive_restart != info->primitive_restart || ice->state.cut_index != cut_index) { ice->state.dirty |= IRIS_DIRTY_VF; - ice->state.primitive_restart = info->primitive_restart; ice->state.cut_index = cut_index; + ice->state.dirty |= + ((ice->state.primitive_restart != info->primitive_restart) && + devinfo->verx10 >= 125) ? 
IRIS_DIRTY_VFG : 0; + ice->state.primitive_restart = info->primitive_restart; } } @@ -187,10 +191,19 @@ iris_indirect_draw_vbo(struct iris_context *ice, struct pipe_draw_info info = *dinfo; struct pipe_draw_indirect_info indirect = *dindirect; - if (indirect.indirect_draw_count && - ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) { - /* Upload MI_PREDICATE_RESULT to GPR15.*/ - batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT); + iris_emit_buffer_barrier_for(batch, iris_resource_bo(indirect.buffer), + IRIS_DOMAIN_VF_READ); + + if (indirect.indirect_draw_count) { + struct iris_bo *draw_count_bo = + iris_resource_bo(indirect.indirect_draw_count); + iris_emit_buffer_barrier_for(batch, draw_count_bo, + IRIS_DOMAIN_OTHER_READ); + + if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) { + /* Upload MI_PREDICATE_RESULT to GPR15.*/ + batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT); + } } const uint64_t orig_dirty = ice->state.dirty; @@ -262,7 +275,7 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info, if (ice->state.predicate == IRIS_PREDICATE_STATE_DONT_RENDER) return; - if (INTEL_DEBUG & DEBUG_REEMIT) { + if (INTEL_DEBUG(DEBUG_REEMIT)) { ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER; ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER; } @@ -284,6 +297,11 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info, iris_predraw_resolve_framebuffer(ice, batch, draw_aux_buffer_disabled); } + if (ice->state.dirty & IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES) { + for (gl_shader_stage stage = 0; stage < MESA_SHADER_COMPUTE; stage++) + iris_predraw_flush_buffers(ice, batch, stage); + } + iris_binder_reserve_3d(ice); batch->screen->vtbl.update_surface_base_address(batch, &ice->state.binder); @@ -368,7 +386,7 @@ iris_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid) if (ice->state.predicate == IRIS_PREDICATE_STATE_DONT_RENDER) return; - if (INTEL_DEBUG & DEBUG_REEMIT) { + if (INTEL_DEBUG(DEBUG_REEMIT)) { ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE; ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE; } @@ -376,6 +394,9 @@ iris_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid) if (ice->state.dirty & IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) iris_predraw_resolve_inputs(ice, batch, NULL, MESA_SHADER_COMPUTE, false); + if (ice->state.dirty & IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES) + iris_predraw_flush_buffers(ice, batch, MESA_SHADER_COMPUTE); + iris_batch_maybe_flush(batch, 1500); iris_update_compiled_compute_shader(ice); diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_fence.c b/mesa 3D driver/src/gallium/drivers/iris/iris_fence.c index f6b10d5c6f..54cd5d8624 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_fence.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_fence.c @@ -63,14 +63,15 @@ gem_syncobj_destroy(int fd, uint32_t handle) * Make a new sync-point. 
*/ struct iris_syncobj * -iris_create_syncobj(struct iris_screen *screen) +iris_create_syncobj(struct iris_bufmgr *bufmgr) { + int fd = iris_bufmgr_get_fd(bufmgr); struct iris_syncobj *syncobj = malloc(sizeof(*syncobj)); if (!syncobj) return NULL; - syncobj->handle = gem_syncobj_create(screen->fd, 0); + syncobj->handle = gem_syncobj_create(fd, 0); assert(syncobj->handle); pipe_reference_init(&syncobj->ref, 1); @@ -79,12 +80,28 @@ iris_create_syncobj(struct iris_screen *screen) } void -iris_syncobj_destroy(struct iris_screen *screen, struct iris_syncobj *syncobj) +iris_syncobj_destroy(struct iris_bufmgr *bufmgr, struct iris_syncobj *syncobj) { - gem_syncobj_destroy(screen->fd, syncobj->handle); + int fd = iris_bufmgr_get_fd(bufmgr); + gem_syncobj_destroy(fd, syncobj->handle); free(syncobj); } +void +iris_syncobj_signal(struct iris_bufmgr *bufmgr, struct iris_syncobj *syncobj) +{ + int fd = iris_bufmgr_get_fd(bufmgr); + struct drm_syncobj_array args = { + .handles = (uintptr_t)&syncobj->handle, + .count_handles = 1, + }; + + if (intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_SIGNAL, &args)) { + fprintf(stderr, "failed to signal syncobj %"PRIu32"\n", + syncobj->handle); + } +} + /** * Add a sync-point to the batch, with the given flags. * @@ -107,7 +124,7 @@ iris_batch_add_syncobj(struct iris_batch *batch, util_dynarray_grow(&batch->syncobjs, struct iris_syncobj *, 1); *store = NULL; - iris_syncobj_reference(batch->screen, store, syncobj); + iris_syncobj_reference(batch->screen->bufmgr, store, syncobj); } /** @@ -122,6 +139,7 @@ static void clear_stale_syncobjs(struct iris_batch *batch) { struct iris_screen *screen = batch->screen; + struct iris_bufmgr *bufmgr = screen->bufmgr; int n = util_dynarray_num_elements(&batch->syncobjs, struct iris_syncobj *); @@ -137,13 +155,13 @@ clear_stale_syncobjs(struct iris_batch *batch) struct drm_i915_gem_exec_fence, i); assert(fence->flags & I915_EXEC_FENCE_WAIT); - if (iris_wait_syncobj(&screen->base, *syncobj, 0)) + if (iris_wait_syncobj(bufmgr, *syncobj, 0)) continue; /* This sync object has already passed, there's no need to continue * marking it as a dependency; we can stop holding on to the reference. */ - iris_syncobj_reference(screen, syncobj, NULL); + iris_syncobj_reference(bufmgr, syncobj, NULL); /* Remove it from the lists; move the last element here. */ struct iris_syncobj **nth_syncobj = @@ -194,20 +212,21 @@ iris_fence_reference(struct pipe_screen *p_screen, } bool -iris_wait_syncobj(struct pipe_screen *p_screen, +iris_wait_syncobj(struct iris_bufmgr *bufmgr, struct iris_syncobj *syncobj, int64_t timeout_nsec) { if (!syncobj) return false; - struct iris_screen *screen = (struct iris_screen *)p_screen; + int fd = iris_bufmgr_get_fd(bufmgr); + struct drm_syncobj_wait args = { .handles = (uintptr_t)&syncobj->handle, .count_handles = 1, .timeout_nsec = timeout_nsec, }; - return intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args); + return intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_WAIT, &args); } #define CSI "\e[" @@ -233,11 +252,11 @@ iris_fence_flush(struct pipe_context *ctx, if (flags & PIPE_FLUSH_END_OF_FRAME) { ice->frame++; - if (INTEL_DEBUG & DEBUG_SUBMIT) { + if (INTEL_DEBUG(DEBUG_SUBMIT)) { fprintf(stderr, "%s ::: FRAME %-10u (ctx %p)%-35c%s\n", - (INTEL_DEBUG & DEBUG_COLOR) ? BLUE_HEADER : "", + INTEL_DEBUG(DEBUG_COLOR) ? BLUE_HEADER : "", ice->frame, ctx, ' ', - (INTEL_DEBUG & DEBUG_COLOR) ? NORMAL : ""); + INTEL_DEBUG(DEBUG_COLOR) ? 
NORMAL : ""); } } diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_fence.h b/mesa 3D driver/src/gallium/drivers/iris/iris_fence.h index 0696627796..d28c59ce5e 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_fence.h +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_fence.h @@ -27,8 +27,8 @@ #include "util/u_inlines.h" struct pipe_screen; -struct iris_screen; struct iris_batch; +struct iris_bufmgr; /** * A refcounted DRM Sync Object (drm_syncobj). @@ -38,24 +38,25 @@ struct iris_syncobj { uint32_t handle; }; -struct iris_syncobj *iris_create_syncobj(struct iris_screen *screen); -void iris_syncobj_destroy(struct iris_screen *, struct iris_syncobj *); +struct iris_syncobj *iris_create_syncobj(struct iris_bufmgr *bufmgr); +void iris_syncobj_destroy(struct iris_bufmgr *, struct iris_syncobj *); +void iris_syncobj_signal(struct iris_bufmgr *, struct iris_syncobj *); void iris_batch_add_syncobj(struct iris_batch *batch, struct iris_syncobj *syncobj, unsigned flags); -bool iris_wait_syncobj(struct pipe_screen *screen, +bool iris_wait_syncobj(struct iris_bufmgr *bufmgr, struct iris_syncobj *syncobj, int64_t timeout_nsec); static inline void -iris_syncobj_reference(struct iris_screen *screen, +iris_syncobj_reference(struct iris_bufmgr *bufmgr, struct iris_syncobj **dst, struct iris_syncobj *src) { if (pipe_reference(*dst ? &(*dst)->ref : NULL, src ? &src->ref : NULL)) - iris_syncobj_destroy(screen, *dst); + iris_syncobj_destroy(bufmgr, *dst); *dst = src; } diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_fine_fence.c b/mesa 3D driver/src/gallium/drivers/iris/iris_fine_fence.c index 0470389593..f057a64882 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_fine_fence.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_fine_fence.c @@ -36,7 +36,7 @@ void iris_fine_fence_destroy(struct iris_screen *screen, struct iris_fine_fence *fine) { - iris_syncobj_reference(screen, &fine->syncobj, NULL); + iris_syncobj_reference(screen->bufmgr, &fine->syncobj, NULL); pipe_resource_reference(&fine->ref.res, NULL); free(fine); } @@ -52,7 +52,7 @@ iris_fine_fence_new(struct iris_batch *batch, unsigned flags) fine->seqno = iris_fine_fence_next(batch); - iris_syncobj_reference(batch->screen, &fine->syncobj, + iris_syncobj_reference(batch->screen->bufmgr, &fine->syncobj, iris_batch_get_signal_syncobj(batch)); pipe_resource_reference(&fine->ref.res, batch->fine_fences.ref.res); diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_genx_protos.h b/mesa 3D driver/src/gallium/drivers/iris/iris_genx_protos.h index d6dddb1711..9dce089b0c 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_genx_protos.h +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_genx_protos.h @@ -34,6 +34,9 @@ void genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch, unsigned width, unsigned height, unsigned scale); +void genX(emit_depth_state_workarounds)(struct iris_context *ice, + struct iris_batch *batch, + const struct isl_surf *surf); void genX(update_pma_fix)(struct iris_context *ice, struct iris_batch *batch, bool enable); diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_pipe_control.c b/mesa 3D driver/src/gallium/drivers/iris/iris_pipe_control.c index 97689513b7..df6814fd32 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_pipe_control.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_pipe_control.c @@ -190,13 +190,17 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch, const uint32_t flush_bits[NUM_IRIS_DOMAINS] = { [IRIS_DOMAIN_RENDER_WRITE] = 
PIPE_CONTROL_RENDER_TARGET_FLUSH, [IRIS_DOMAIN_DEPTH_WRITE] = PIPE_CONTROL_DEPTH_CACHE_FLUSH, + [IRIS_DOMAIN_DATA_WRITE] = PIPE_CONTROL_DATA_CACHE_FLUSH, [IRIS_DOMAIN_OTHER_WRITE] = PIPE_CONTROL_FLUSH_ENABLE, + [IRIS_DOMAIN_VF_READ] = PIPE_CONTROL_STALL_AT_SCOREBOARD, [IRIS_DOMAIN_OTHER_READ] = PIPE_CONTROL_STALL_AT_SCOREBOARD, }; const uint32_t invalidate_bits[NUM_IRIS_DOMAINS] = { [IRIS_DOMAIN_RENDER_WRITE] = PIPE_CONTROL_RENDER_TARGET_FLUSH, [IRIS_DOMAIN_DEPTH_WRITE] = PIPE_CONTROL_DEPTH_CACHE_FLUSH, + [IRIS_DOMAIN_DATA_WRITE] = PIPE_CONTROL_DATA_CACHE_FLUSH, [IRIS_DOMAIN_OTHER_WRITE] = PIPE_CONTROL_FLUSH_ENABLE, + [IRIS_DOMAIN_VF_READ] = PIPE_CONTROL_VF_CACHE_INVALIDATE, [IRIS_DOMAIN_OTHER_READ] = (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | PIPE_CONTROL_CONST_CACHE_INVALIDATE), }; @@ -231,7 +235,7 @@ iris_emit_buffer_barrier_for(struct iris_batch *batch, * in order to handle any WaR dependencies. */ if (!iris_domain_is_read_only(access)) { - for (unsigned i = IRIS_DOMAIN_OTHER_READ; i < NUM_IRIS_DOMAINS; i++) { + for (unsigned i = IRIS_DOMAIN_VF_READ; i < NUM_IRIS_DOMAINS; i++) { assert(iris_domain_is_read_only(i)); const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]); @@ -349,7 +353,8 @@ iris_memory_barrier(struct pipe_context *ctx, unsigned flags) if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) { bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | - PIPE_CONTROL_RENDER_TARGET_FLUSH; + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_TILE_CACHE_FLUSH; } for (int i = 0; i < IRIS_BATCH_COUNT; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_program.c b/mesa 3D driver/src/gallium/drivers/iris/iris_program.c index 85b65675e5..9e735ffe8b 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_program.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_program.c @@ -300,6 +300,55 @@ iris_lower_storage_image_derefs(nir_shader *nir) } } +static bool +iris_uses_image_atomic(const nir_shader *shader) +{ + nir_foreach_function(function, shader) { + if (function->impl == NULL) + continue; + + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + switch (intrin->intrinsic) { + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + unreachable("Should have been lowered in " + "iris_lower_storage_image_derefs"); + + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_image_atomic_comp_swap: + return true; + + default: + break; + } + } + } + } + + return false; +} + /** * Undo nir_lower_passthrough_edgeflags but keep the inputs_read flag. 
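 *
 * An aside on the iris_pipe_control.c hunk above: the WaR loop now
 * starts at IRIS_DOMAIN_VF_READ, which assumes the read-only domains
 * form the contiguous tail of the enum; inserting VF_READ just before
 * OTHER_READ preserves that layout, and the
 * iris_domain_is_read_only() assert inside the loop guards it.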
*/ @@ -967,7 +1016,7 @@ iris_setup_binding_table(const struct intel_device_info *devinfo, } bt->size_bytes = next * 4; - if (INTEL_DEBUG & DEBUG_BT) { + if (INTEL_DEBUG(DEBUG_BT)) { iris_print_binding_table(stderr, gl_shader_stage_name(info->stage), bt); } @@ -1279,7 +1328,7 @@ iris_compile_vs(struct iris_screen *screen, nir_shader_gather_info(nir, impl); } - prog_data->use_alt_mode = ish->use_alt_mode; + prog_data->use_alt_mode = nir->info.is_arb_asm; iris_setup_uniforms(compiler, mem_ctx, nir, prog_data, 0, &system_values, &num_system_values, &num_cbufs); @@ -1898,7 +1947,7 @@ iris_compile_fs(struct iris_screen *screen, nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir); const struct iris_fs_prog_key *const key = &shader->key.fs; - prog_data->use_alt_mode = ish->use_alt_mode; + prog_data->use_alt_mode = nir->info.is_arb_asm; iris_setup_uniforms(compiler, mem_ctx, nir, prog_data, 0, &system_values, &num_system_values, &num_cbufs); @@ -2320,62 +2369,9 @@ iris_get_scratch_space(struct iris_context *ice, struct iris_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage]; - /* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says: - * - * "Scratch Space per slice is computed based on 4 sub-slices. SW - * must allocate scratch space enough so that each slice has 4 - * slices allowed." - * - * According to the other driver team, this applies to compute shaders - * as well. This is not currently documented at all. - * - * This hack is no longer necessary on Gfx11+. - * - * For, Gfx11+, scratch space allocation is based on the number of threads - * in the base configuration. - */ - unsigned subslice_total = screen->subslice_total; - if (devinfo->verx10 == 125) - subslice_total = 32; - else if (devinfo->ver == 12) - subslice_total = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2); - else if (devinfo->ver == 11) - subslice_total = 8; - else if (devinfo->ver < 11) - subslice_total = 4 * devinfo->num_slices; - assert(subslice_total >= screen->subslice_total); - if (!*bop) { - unsigned scratch_ids_per_subslice = devinfo->max_cs_threads; - - if (devinfo->ver >= 12) { - /* Same as ICL below, but with 16 EUs. */ - scratch_ids_per_subslice = 16 * 8; - } else if (devinfo->ver == 11) { - /* The MEDIA_VFE_STATE docs say: - * - * "Starting with this configuration, the Maximum Number of - * Threads must be set to (#EU * 8) for GPGPU dispatches. - * - * Although there are only 7 threads per EU in the configuration, - * the FFTID is calculated as if there are 8 threads per EU, - * which in turn requires a larger amount of Scratch Space to be - * allocated by the driver." 
- */ - scratch_ids_per_subslice = 8 * 8; - } - - uint32_t max_threads[] = { - [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, - [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, - [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, - [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, - [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, - [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslice_total, - }; - - uint32_t size = per_thread_scratch * max_threads[stage]; - + assert(stage < ARRAY_SIZE(devinfo->max_scratch_ids)); + uint32_t size = per_thread_scratch * devinfo->max_scratch_ids[stage]; *bop = iris_bo_alloc(bufmgr, "scratch", size, 1, IRIS_MEMZONE_SHADER, 0); } @@ -2433,8 +2429,6 @@ iris_create_uncompiled_shader(struct iris_screen *screen, nir_shader *nir, const struct pipe_stream_output_info *so_info) { - const struct intel_device_info *devinfo = &screen->devinfo; - struct iris_uncompiled_shader *ish = calloc(1, sizeof(struct iris_uncompiled_shader)); if (!ish) @@ -2444,15 +2438,7 @@ iris_create_uncompiled_shader(struct iris_screen *screen, list_inithead(&ish->variants); simple_mtx_init(&ish->lock, mtx_plain); - NIR_PASS(ish->needs_edge_flag, nir, iris_fix_edge_flags); - - brw_preprocess_nir(screen->compiler, nir, NULL); - - NIR_PASS_V(nir, brw_nir_lower_storage_image, devinfo, - &ish->uses_atomic_load_store); - NIR_PASS_V(nir, iris_lower_storage_image_derefs); - - nir_sweep(nir); + ish->uses_atomic_load_store = iris_uses_image_atomic(nir); ish->program_id = get_new_program_id(screen); ish->nir = nir; @@ -2461,10 +2447,6 @@ iris_create_uncompiled_shader(struct iris_screen *screen, update_so_info(&ish->stream_output, nir->info.outputs_written); } - /* Save this now before potentially dropping nir->info.name */ - if (nir->info.name && strncmp(nir->info.name, "ARB", 3) == 0) - ish->use_alt_mode = true; - if (screen->disk_cache) { /* Serialize the NIR to a binary blob that we can hash for the disk * cache. Drop unnecessary information (like variable names) @@ -2531,6 +2513,9 @@ iris_create_compute_state(struct pipe_context *ctx, iris_create_shader_variant(screen, NULL, IRIS_CACHE_CS, sizeof(key), &key); + /* Append our new variant to the shader's variant list. 
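
The list discipline matters here: variants are only ever appended with list_addtail(), so the variant created at CSO-creation time stays at the head of ish->variants. That is exactly the invariant iris_is_parallel_shader_compilation_finished(), moved into this file by this patch, relies on. A fragment using the types from this patch (not standalone):

struct iris_compiled_shader *first =
   list_first_entry(&ish->variants, struct iris_compiled_shader, link);

/* The precompile is finished once its ready fence has signalled. */
bool precompile_done = util_queue_fence_is_signalled(&first->ready);
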
*/ + list_addtail(&shader->link, &ish->variants); + if (!iris_disk_cache_retrieve(screen, uploader, ish, shader, &key, sizeof(key))) { iris_compile_cs(screen, uploader, &ice->dbg, ish, shader); @@ -2835,7 +2820,7 @@ iris_bind_vs_state(struct pipe_context *ctx, void *state) if (ice->state.vs_uses_draw_params != uses_draw_params || ice->state.vs_uses_derived_draw_params != uses_derived_draw_params || - ice->state.vs_needs_edge_flag != ish->needs_edge_flag) { + ice->state.vs_needs_edge_flag != info->vs.needs_edge_flag) { ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS | IRIS_DIRTY_VERTEX_ELEMENTS; } @@ -2843,7 +2828,7 @@ iris_bind_vs_state(struct pipe_context *ctx, void *state) ice->state.vs_uses_draw_params = uses_draw_params; ice->state.vs_uses_derived_draw_params = uses_derived_draw_params; ice->state.vs_needs_sgvs_element = needs_sgvs_element; - ice->state.vs_needs_edge_flag = ish->needs_edge_flag; + ice->state.vs_needs_edge_flag = info->vs.needs_edge_flag; } bind_shader_state((void *) ctx, state, MESA_SHADER_VERTEX); @@ -2859,10 +2844,13 @@ static void iris_bind_tes_state(struct pipe_context *ctx, void *state) { struct iris_context *ice = (struct iris_context *)ctx; + struct iris_screen *screen = (struct iris_screen *) ctx->screen; + const struct intel_device_info *devinfo = &screen->devinfo; /* Enabling/disabling optional stages requires a URB reconfiguration. */ if (!!state != !!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) - ice->state.dirty |= IRIS_DIRTY_URB; + ice->state.dirty |= IRIS_DIRTY_URB | (devinfo->verx10 >= 125 ? + IRIS_DIRTY_VFG : 0); bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_EVAL); } @@ -2911,6 +2899,67 @@ iris_bind_cs_state(struct pipe_context *ctx, void *state) bind_shader_state((void *) ctx, state, MESA_SHADER_COMPUTE); } +static char * +iris_finalize_nir(struct pipe_screen *_screen, void *nirptr) +{ + struct iris_screen *screen = (struct iris_screen *)_screen; + struct nir_shader *nir = (struct nir_shader *) nirptr; + const struct intel_device_info *devinfo = &screen->devinfo; + + NIR_PASS_V(nir, iris_fix_edge_flags); + + brw_preprocess_nir(screen->compiler, nir, NULL); + + NIR_PASS_V(nir, brw_nir_lower_storage_image, devinfo); + NIR_PASS_V(nir, iris_lower_storage_image_derefs); + + nir_sweep(nir); + + return NULL; +} + +static void +iris_set_max_shader_compiler_threads(struct pipe_screen *pscreen, + unsigned max_threads) +{ + struct iris_screen *screen = (struct iris_screen *) pscreen; + util_queue_adjust_num_threads(&screen->shader_compiler_queue, max_threads); +} + +static bool +iris_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen, + void *v_shader, + enum pipe_shader_type p_stage) +{ + struct iris_screen *screen = (struct iris_screen *) pscreen; + + /* Threaded compilation is only used for the precompile. If precompile is + * disabled, threaded compilation is "done." + */ + if (!screen->precompile) + return true; + + struct iris_uncompiled_shader *ish = v_shader; + + /* When precompile is enabled, the first entry is the precompile variant. + * Check the ready fence of the precompile variant. 
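
For illustration, a plausible caller of this hook: a gallium frontend implementing the GL_ARB_parallel_shader_compile completion query can poll rather than block (hypothetical sketch; the real frontend code is outside this patch):

static bool
shader_compile_completed(struct pipe_screen *pscreen, void *shader)
{
   /* Would back e.g. glGetShaderiv(..., GL_COMPLETION_STATUS_ARB, ...). */
   return pscreen->is_parallel_shader_compilation_finished(pscreen, shader,
                                                           PIPE_SHADER_VERTEX);
}
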
+ */ + struct iris_compiled_shader *first = + list_first_entry(&ish->variants, struct iris_compiled_shader, link); + + return util_queue_fence_is_signalled(&first->ready); +} + +void +iris_init_screen_program_functions(struct pipe_screen *pscreen) +{ + pscreen->is_parallel_shader_compilation_finished = + iris_is_parallel_shader_compilation_finished; + pscreen->set_max_shader_compiler_threads = + iris_set_max_shader_compiler_threads; + pscreen->finalize_nir = iris_finalize_nir; +} + void iris_init_program_functions(struct pipe_context *ctx) { diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_query.c b/mesa 3D driver/src/gallium/drivers/iris/iris_query.c index ee80b2ff2c..e79d23c702 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_query.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_query.c @@ -483,7 +483,7 @@ iris_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query) iris_destroy_monitor_object(ctx, query->monitor); query->monitor = NULL; } else { - iris_syncobj_reference(screen, &query->syncobj, NULL); + iris_syncobj_reference(screen->bufmgr, &query->syncobj, NULL); screen->base.fence_reference(ctx->screen, &query->fence, NULL); } pipe_resource_reference(&query->query_state_ref.res, NULL); @@ -612,7 +612,7 @@ iris_get_query_result(struct pipe_context *ctx, struct iris_screen *screen = (void *) ctx->screen; const struct intel_device_info *devinfo = &screen->devinfo; - if (unlikely(screen->no_hw)) { + if (unlikely(screen->devinfo.no_hw)) { result->u64 = 0; return true; } @@ -632,7 +632,7 @@ iris_get_query_result(struct pipe_context *ctx, while (!READ_ONCE(q->map->snapshots_landed)) { if (wait) - iris_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX); + iris_wait_syncobj(screen->bufmgr, q->syncobj, INT64_MAX); else return false; } diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_resolve.c b/mesa 3D driver/src/gallium/drivers/iris/iris_resolve.c index cbdfec64ca..50fc9a94df 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_resolve.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_resolve.c @@ -147,11 +147,10 @@ resolve_image_views(struct iris_context *ice, aux_usage, false); } - iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ); + iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_DATA_WRITE); } } - /** * \brief Resolve buffers before drawing. 
 *
@@ -372,6 +371,50 @@ iris_cache_flush_for_render(struct iris_batch *batch,
    }
 }
 
+static void
+flush_ubos(struct iris_batch *batch,
+           struct iris_shader_state *shs)
+{
+   uint32_t cbufs = shs->dirty_cbufs & shs->bound_cbufs;
+
+   while (cbufs) {
+      const int i = u_bit_scan(&cbufs);
+      struct pipe_shader_buffer *cbuf = &shs->constbuf[i];
+      struct iris_resource *res = (void *)cbuf->buffer;
+      iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ);
+   }
+
+   shs->dirty_cbufs = 0;
+}
+
+static void
+flush_ssbos(struct iris_batch *batch,
+            struct iris_shader_state *shs)
+{
+   uint32_t ssbos = shs->bound_ssbos;
+
+   while (ssbos) {
+      const int i = u_bit_scan(&ssbos);
+      struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
+      struct iris_resource *res = (void *)ssbo->buffer;
+      iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_DATA_WRITE);
+   }
+}
+
+void
+iris_predraw_flush_buffers(struct iris_context *ice,
+                           struct iris_batch *batch,
+                           gl_shader_stage stage)
+{
+   struct iris_shader_state *shs = &ice->state.shaders[stage];
+
+   if (ice->state.stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage))
+      flush_ubos(batch, shs);
+
+   if (ice->state.stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage))
+      flush_ssbos(batch, shs);
+}
+
 static void
 iris_resolve_color(struct iris_context *ice,
                    struct iris_batch *batch,
@@ -646,7 +689,7 @@ iris_has_invalid_primary(const struct iris_resource *res,
                          unsigned start_level, unsigned num_levels,
                          unsigned start_layer, unsigned num_layers)
 {
-   if (!res->aux.bo)
+   if (res->aux.usage == ISL_AUX_USAGE_NONE)
       return false;
 
    /* Clamp the level range to fit the resource */
@@ -675,7 +718,7 @@ iris_resource_prepare_access(struct iris_context *ice,
                              enum isl_aux_usage aux_usage,
                              bool fast_clear_supported)
 {
-   if (!res->aux.bo)
+   if (res->aux.usage == ISL_AUX_USAGE_NONE)
       return;
 
    /* We can't do resolves on the compute engine, so awkwardly, we have to
@@ -729,7 +772,7 @@ iris_resource_finish_write(struct iris_context *ice,
                            uint32_t start_layer, uint32_t num_layers,
                            enum isl_aux_usage aux_usage)
 {
-   if (!res->aux.bo)
+   if (res->aux.usage == ISL_AUX_USAGE_NONE)
       return;
 
    const uint32_t level_layers =
@@ -870,6 +913,8 @@ iris_image_view_aux_usage(struct iris_context *ice,
    if (!info)
       return ISL_AUX_USAGE_NONE;
 
+   const struct iris_screen *screen = (void *) ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
    struct iris_resource *res = (void *) pview->resource;
 
    enum isl_format view_format = iris_image_view_get_format(ice, pview);
@@ -879,7 +924,11 @@ iris_image_view_aux_usage(struct iris_context *ice,
    bool uses_atomic_load_store =
      ice->shaders.uncompiled[info->stage]->uses_atomic_load_store;
 
-   if (aux_usage == ISL_AUX_USAGE_GFX12_CCS_E && !uses_atomic_load_store)
+   /* On GFX12, compressed surfaces support non-atomic operations; GFX12HP
+    * and later add support for all operations.
+ */ + if (aux_usage == ISL_AUX_USAGE_GFX12_CCS_E && + (devinfo->verx10 >= 125 || !uses_atomic_load_store)) return ISL_AUX_USAGE_GFX12_CCS_E; return ISL_AUX_USAGE_NONE; diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_resource.c b/mesa 3D driver/src/gallium/drivers/iris/iris_resource.c index b6cd0bcc03..8610a5531e 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_resource.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_resource.c @@ -87,6 +87,8 @@ modifier_is_supported(const struct intel_device_info *devinfo, case I915_FORMAT_MOD_Y_TILED: if (devinfo->ver <= 8 && (bind & PIPE_BIND_SCANOUT)) return false; + if (devinfo->verx10 >= 125) + return false; break; case I915_FORMAT_MOD_Y_TILED_CCS: if (devinfo->ver <= 8 || devinfo->ver >= 12) @@ -95,7 +97,9 @@ modifier_is_supported(const struct intel_device_info *devinfo, case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS: case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS: case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC: - if (devinfo->ver != 12) + if (devinfo->verx10 != 120) + return false; + if (devinfo->display_ver != 12) return false; break; case DRM_FORMAT_MOD_INVALID: @@ -106,6 +110,9 @@ modifier_is_supported(const struct intel_device_info *devinfo, /* Check remaining requirements. */ switch (modifier) { case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS: + if (INTEL_DEBUG(DEBUG_NO_RBC)) + return false; + if (pfmt != PIPE_FORMAT_BGRA8888_UNORM && pfmt != PIPE_FORMAT_RGBA8888_UNORM && pfmt != PIPE_FORMAT_BGRX8888_UNORM && @@ -122,7 +129,7 @@ modifier_is_supported(const struct intel_device_info *devinfo, case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC: case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS: case I915_FORMAT_MOD_Y_TILED_CCS: { - if (INTEL_DEBUG & DEBUG_NO_RBC) + if (INTEL_DEBUG(DEBUG_NO_RBC)) return false; enum isl_format rt_format = @@ -428,10 +435,20 @@ iris_resource_alloc_flags(const struct iris_screen *screen, break; } + /* Scanout and shared buffers need to be WC (shared because they might be + * used for scanout) + */ + if (templ->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) + flags |= BO_ALLOC_SCANOUT; + if (templ->flags & (PIPE_RESOURCE_FLAG_MAP_COHERENT | PIPE_RESOURCE_FLAG_MAP_PERSISTENT)) flags |= BO_ALLOC_SMEM; + if ((templ->bind & PIPE_BIND_SHARED) || + util_format_get_num_planes(templ->format) > 1) + flags |= BO_ALLOC_NO_SUBALLOC; + return flags; } @@ -465,7 +482,7 @@ iris_alloc_resource(struct pipe_screen *pscreen, res->base.b.screen = pscreen; res->orig_screen = iris_pscreen_ref(pscreen); pipe_reference_init(&res->base.b.reference, 1); - threaded_resource_init(&res->base.b); + threaded_resource_init(&res->base.b, false, 0); res->aux.possible_usages = 1 << ISL_AUX_USAGE_NONE; res->aux.sampler_usages = 1 << ISL_AUX_USAGE_NONE; @@ -618,6 +635,9 @@ iris_resource_configure_main(const struct iris_screen *screen, isl_surf_usage_flags_t usage = 0; + if (res->mod_info && res->mod_info->aux_usage == ISL_AUX_USAGE_NONE) + usage |= ISL_SURF_USAGE_DISABLE_AUX_BIT; + if (templ->usage == PIPE_USAGE_STAGING) usage |= ISL_SURF_USAGE_STAGING_BIT; @@ -713,37 +733,27 @@ iris_resource_configure_aux(struct iris_screen *screen, { const struct intel_device_info *devinfo = &screen->devinfo; - /* Try to create the auxiliary surfaces allowed by the modifier or by - * the user if no modifier is specified. 
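
Looking back at the iris_image_view_aux_usage() change a few hunks up, the new condition can be read as a standalone predicate (hypothetical helper, same logic as the hunk): on GFX12 proper, CCS compression is only safe for images when the shader uses no image atomics, while GFX12HP handles atomics on compressed surfaces as well.

#include <stdbool.h>

static bool
image_can_use_ccs_e(int verx10, bool uses_atomic_load_store)
{
   /* verx10 == 120: non-atomic image access on compressed surfaces only.
    * verx10 >= 125: all image operations, atomics included. */
   return verx10 >= 125 || !uses_atomic_load_store;
}
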
- */ - assert(!res->mod_info || - res->mod_info->aux_usage == ISL_AUX_USAGE_NONE || - res->mod_info->aux_usage == ISL_AUX_USAGE_CCS_E || - res->mod_info->aux_usage == ISL_AUX_USAGE_GFX12_CCS_E || - res->mod_info->aux_usage == ISL_AUX_USAGE_MC); - - const bool has_mcs = !res->mod_info && + const bool has_mcs = isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf); - const bool has_hiz = !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_HIZ) && + const bool has_hiz = !INTEL_DEBUG(DEBUG_NO_HIZ) && isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf, &res->aux.surf); - const bool has_ccs = - ((!res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) || - (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) && + const bool has_ccs = !INTEL_DEBUG(DEBUG_NO_RBC) && iris_get_ccs_surf(&screen->isl_dev, &res->surf, &res->aux.surf, &res->aux.extra_aux.surf, 0); - /* Having both HIZ and MCS is impossible. */ - assert(!has_mcs || !has_hiz); - - if (res->mod_info && has_ccs) { - /* Only allow a CCS modifier if the aux was created successfully. */ - res->aux.possible_usages |= 1 << res->mod_info->aux_usage; - } else if (has_mcs) { - res->aux.possible_usages |= - 1 << (has_ccs ? ISL_AUX_USAGE_MCS_CCS : ISL_AUX_USAGE_MCS); + if (has_mcs) { + assert(!res->mod_info); + assert(!has_hiz); + if (has_ccs) { + res->aux.possible_usages |= 1 << ISL_AUX_USAGE_MCS_CCS; + } else { + res->aux.possible_usages |= 1 << ISL_AUX_USAGE_MCS; + } } else if (has_hiz) { + assert(!res->mod_info); + assert(!has_mcs); if (!has_ccs) { res->aux.possible_usages |= 1 << ISL_AUX_USAGE_HIZ; } else if (res->surf.samples == 1 && @@ -756,10 +766,12 @@ iris_resource_configure_aux(struct iris_screen *screen, } else { res->aux.possible_usages |= 1 << ISL_AUX_USAGE_HIZ_CCS; } - } else if (has_ccs && isl_surf_usage_is_stencil(res->surf.usage)) { - res->aux.possible_usages |= 1 << ISL_AUX_USAGE_STC_CCS; } else if (has_ccs) { - if (want_ccs_e_for_format(devinfo, res->surf.format)) { + if (res->mod_info) { + res->aux.possible_usages |= 1 << res->mod_info->aux_usage; + } else if (isl_surf_usage_is_stencil(res->surf.usage)) { + res->aux.possible_usages |= 1 << ISL_AUX_USAGE_STC_CCS; + } else if (want_ccs_e_for_format(devinfo, res->surf.format)) { res->aux.possible_usages |= devinfo->ver < 12 ? 1 << ISL_AUX_USAGE_CCS_E : 1 << ISL_AUX_USAGE_GFX12_CCS_E; } else if (isl_format_supports_ccs_d(devinfo, res->surf.format)) { @@ -890,12 +902,11 @@ import_aux_info(struct iris_resource *res, res->aux.offset = aux_res->aux.offset; } -void +static void iris_resource_finish_aux_import(struct pipe_screen *pscreen, struct iris_resource *res) { struct iris_screen *screen = (struct iris_screen *)pscreen; - assert(iris_resource_unfinished_aux_import(res)); /* Create an array of resources. Combining main and aux planes is easier * with indexing as opposed to scanning the linked list. @@ -921,18 +932,28 @@ iris_resource_finish_aux_import(struct pipe_screen *pscreen, } /* Combine main and aux plane information. */ - if (num_main_planes == 1 && num_planes == 2) { + switch (res->mod_info->modifier) { + case I915_FORMAT_MOD_Y_TILED_CCS: + case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS: + assert(num_main_planes == 1 && num_planes == 2); import_aux_info(r[0], r[1]); map_aux_addresses(screen, r[0], format, 0); - /* Add on a clear color BO. */ + /* Add on a clear color BO. + * + * Also add some padding to make sure the fast clear color state buffer + * starts at a 4K alignment to avoid some unknown issues. 
See the + * matching comment in iris_resource_create_with_modifiers(). + */ if (iris_get_aux_clear_color_state_size(screen) > 0) { res->aux.clear_color_bo = iris_bo_alloc(screen->bufmgr, "clear color_buffer", - iris_get_aux_clear_color_state_size(screen), 1, + iris_get_aux_clear_color_state_size(screen), 4096, IRIS_MEMZONE_OTHER, BO_ALLOC_ZEROED); } - } else if (num_main_planes == 1 && num_planes == 3) { + break; + case I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC: + assert(num_main_planes == 1 && num_planes == 3); import_aux_info(r[0], r[1]); map_aux_addresses(screen, r[0], format, 0); @@ -941,18 +962,29 @@ iris_resource_finish_aux_import(struct pipe_screen *pscreen, r[0]->aux.clear_color_bo = r[2]->aux.clear_color_bo; r[0]->aux.clear_color_offset = r[2]->aux.clear_color_offset; r[0]->aux.clear_color_unknown = true; - } else if (num_main_planes == 2 && num_planes == 4) { - import_aux_info(r[0], r[2]); - import_aux_info(r[1], r[3]); - map_aux_addresses(screen, r[0], format, 0); - map_aux_addresses(screen, r[1], format, 1); - } else { - /* Gallium has lowered a single main plane into two. */ - assert(num_main_planes == 2 && num_planes == 3); - assert(isl_format_is_yuv(format) && !isl_format_is_planar(format)); - import_aux_info(r[0], r[2]); - import_aux_info(r[1], r[2]); - map_aux_addresses(screen, r[0], format, 0); + break; + case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS: + if (num_main_planes == 1 && num_planes == 2) { + import_aux_info(r[0], r[1]); + map_aux_addresses(screen, r[0], format, 0); + } else if (num_main_planes == 2 && num_planes == 4) { + import_aux_info(r[0], r[2]); + import_aux_info(r[1], r[3]); + map_aux_addresses(screen, r[0], format, 0); + map_aux_addresses(screen, r[1], format, 1); + } else { + /* Gallium has lowered a single main plane into two. */ + assert(num_main_planes == 2 && num_planes == 3); + assert(isl_format_is_yuv(format) && !isl_format_is_planar(format)); + import_aux_info(r[0], r[2]); + import_aux_info(r[1], r[2]); + map_aux_addresses(screen, r[0], format, 0); + } + assert(!isl_aux_usage_has_fast_clears(res->mod_info->aux_usage)); + break; + default: + assert(res->mod_info->aux_usage == ISL_AUX_USAGE_NONE); + break; } } @@ -1174,6 +1206,16 @@ mod_plane_is_clear_color(uint64_t modifier, uint32_t plane) } } +static unsigned +get_num_planes(const struct pipe_resource *resource) +{ + unsigned count = 0; + for (const struct pipe_resource *cur = resource; cur; cur = cur->next) + count++; + + return count; +} + static struct pipe_resource * iris_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *templ, @@ -1242,6 +1284,12 @@ iris_resource_from_handle(struct pipe_screen *pscreen, res->bo = NULL; } + if (get_num_planes(&res->base.b) == + iris_get_dmabuf_modifier_planes(pscreen, whandle->modifier, + whandle->format)) { + iris_resource_finish_aux_import(pscreen, res); + } + return &res->base.b; fail: @@ -1349,6 +1397,139 @@ iris_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource) } } +/** + * Reallocate a (non-external) resource into new storage, copying the data + * and modifying the original resource to point at the new storage. + * + * This is useful for e.g. moving a suballocated internal resource to a + * dedicated allocation that can be exported by itself. 
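
The trickiest step of the function whose comment begins above is the hand-off at its end, reduced here to its core idiom (fragment; fields as in struct iris_resource): point the original resource at the new BO, park the old BO in the temporary resource, and let the temporary's ordinary destroy path free the old storage.

struct iris_bo *old_bo = old_res->bo;
old_res->bo = new_res->bo;  /* old_res now uses the new storage */
new_res->bo = old_bo;       /* new_res owns the old storage... */
pipe_resource_reference((struct pipe_resource **)&new_res, NULL);
/* ...so unreferencing new_res frees the old BO. */
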
+ */ +static void +iris_reallocate_resource_inplace(struct iris_context *ice, + struct iris_resource *old_res, + unsigned new_bind_flag) +{ + struct pipe_screen *pscreen = ice->ctx.screen; + + if (iris_bo_is_external(old_res->bo)) + return; + + assert(old_res->mod_info == NULL); + assert(old_res->bo == old_res->aux.bo || old_res->aux.bo == NULL); + assert(old_res->bo == old_res->aux.clear_color_bo || + old_res->aux.clear_color_bo == NULL); + assert(old_res->external_format == PIPE_FORMAT_NONE); + + struct pipe_resource templ = old_res->base.b; + templ.bind |= new_bind_flag; + + struct iris_resource *new_res = + (void *) pscreen->resource_create(pscreen, &templ); + + assert(iris_bo_is_real(new_res->bo)); + + struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; + + if (old_res->base.b.target == PIPE_BUFFER) { + struct pipe_box box = (struct pipe_box) { + .width = old_res->base.b.width0, + .height = 1, + }; + + iris_copy_region(&ice->blorp, batch, &new_res->base.b, 0, 0, 0, 0, + &old_res->base.b, 0, &box); + } else { + for (unsigned l = 0; l <= templ.last_level; l++) { + struct pipe_box box = (struct pipe_box) { + .width = u_minify(templ.width0, l), + .height = u_minify(templ.height0, l), + .depth = util_num_layers(&templ, l), + }; + + iris_copy_region(&ice->blorp, batch, &new_res->base.b, 0, 0, 0, l, + &old_res->base.b, l, &box); + } + } + + iris_flush_resource(&ice->ctx, &new_res->base.b); + + struct iris_bo *old_bo = old_res->bo; + struct iris_bo *old_aux_bo = old_res->aux.bo; + struct iris_bo *old_clear_color_bo = old_res->aux.clear_color_bo; + + /* Replace the structure fields with the new ones */ + old_res->base.b.bind = templ.bind; + old_res->bo = new_res->bo; + old_res->aux.surf = new_res->aux.surf; + old_res->aux.bo = new_res->aux.bo; + old_res->aux.offset = new_res->aux.offset; + old_res->aux.extra_aux.surf = new_res->aux.extra_aux.surf; + old_res->aux.extra_aux.offset = new_res->aux.extra_aux.offset; + old_res->aux.clear_color_bo = new_res->aux.clear_color_bo; + old_res->aux.clear_color_offset = new_res->aux.clear_color_offset; + old_res->aux.usage = new_res->aux.usage; + old_res->aux.possible_usages = new_res->aux.possible_usages; + old_res->aux.sampler_usages = new_res->aux.sampler_usages; + + if (new_res->aux.state) { + assert(old_res->aux.state); + for (unsigned l = 0; l <= templ.last_level; l++) { + unsigned layers = util_num_layers(&templ, l); + for (unsigned z = 0; z < layers; z++) { + enum isl_aux_state aux = + iris_resource_get_aux_state(new_res, l, z); + iris_resource_set_aux_state(ice, old_res, l, z, 1, aux); + } + } + } + + /* old_res now points at the new BOs, make new_res point at the old ones + * so they'll be freed when we unreference the resource below. + */ + new_res->bo = old_bo; + new_res->aux.bo = old_aux_bo; + new_res->aux.clear_color_bo = old_clear_color_bo; + + pipe_resource_reference((struct pipe_resource **)&new_res, NULL); +} + +static void +iris_resource_disable_suballoc_on_first_query(struct pipe_screen *pscreen, + struct pipe_context *ctx, + struct iris_resource *res) +{ + if (iris_bo_is_real(res->bo)) + return; + + assert(!(res->base.b.bind & PIPE_BIND_SHARED)); + + bool destroy_context; + if (ctx) { + ctx = threaded_context_unwrap_sync(ctx); + destroy_context = false; + } else { + /* We need to execute a blit on some GPU context, but the DRI layer + * often doesn't give us one. So we have to invent a temporary one. 
+ * + * We can't store a permanent context in the screen, as it would cause + * circular refcounting where screens reference contexts that reference + * resources, while resources reference screens...causing nothing to be + * freed. So we just create and destroy a temporary one here. + */ + ctx = iris_create_context(pscreen, NULL, 0); + destroy_context = true; + } + + struct iris_context *ice = (struct iris_context *)ctx; + + iris_reallocate_resource_inplace(ice, res, PIPE_BIND_SHARED); + assert(res->base.b.bind & PIPE_BIND_SHARED); + + if (destroy_context) + iris_destroy_context(ctx); +} + + static void iris_resource_disable_aux_on_first_query(struct pipe_resource *resource, unsigned usage) @@ -1370,7 +1551,7 @@ iris_resource_disable_aux_on_first_query(struct pipe_resource *resource, static bool iris_resource_get_param(struct pipe_screen *pscreen, - struct pipe_context *context, + struct pipe_context *ctx, struct pipe_resource *resource, unsigned plane, unsigned layer, @@ -1387,12 +1568,12 @@ iris_resource_get_param(struct pipe_screen *pscreen, bool result; unsigned handle; - if (iris_resource_unfinished_aux_import(res)) - iris_resource_finish_aux_import(pscreen, res); + iris_resource_disable_aux_on_first_query(resource, handle_usage); + iris_resource_disable_suballoc_on_first_query(pscreen, ctx, res); struct iris_bo *bo = wants_aux ? res->aux.bo : res->bo; - iris_resource_disable_aux_on_first_query(resource, handle_usage); + assert(iris_bo_is_real(bo)); switch (param) { case PIPE_RESOURCE_PARAM_NPLANES: @@ -1401,10 +1582,7 @@ iris_resource_get_param(struct pipe_screen *pscreen, res->mod_info->modifier, res->external_format); } else { - unsigned count = 0; - for (struct pipe_resource *cur = resource; cur; cur = cur->next) - count++; - *value = count; + *value = get_num_planes(&res->base.b); } return true; case PIPE_RESOURCE_PARAM_STRIDE: @@ -1458,7 +1636,7 @@ iris_resource_get_param(struct pipe_screen *pscreen, static bool iris_resource_get_handle(struct pipe_screen *pscreen, - struct pipe_context *unused_ctx, + struct pipe_context *ctx, struct pipe_resource *resource, struct winsys_handle *whandle, unsigned usage) @@ -1468,9 +1646,10 @@ iris_resource_get_handle(struct pipe_screen *pscreen, bool mod_with_aux = res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; - /* if ctx is ever used, do ctx = threaded_context_unwrap_sync(ctx) */ - iris_resource_disable_aux_on_first_query(resource, usage); + iris_resource_disable_suballoc_on_first_query(pscreen, ctx, res); + + assert(iris_bo_is_real(res->bo)); struct iris_bo *bo; if (res->mod_info && @@ -1597,7 +1776,7 @@ iris_invalidate_resource(struct pipe_context *ctx, /* Otherwise, try and replace the backing storage with a new BO. */ /* We can't reallocate memory we didn't allocate in the first place. */ - if (res->bo->userptr) + if (res->bo->gem_handle && res->bo->real.userptr) return; struct iris_bo *old_bo = res->bo; @@ -2012,9 +2191,6 @@ iris_transfer_map(struct pipe_context *ctx, struct iris_resource *res = (struct iris_resource *)resource; struct isl_surf *surf = &res->surf; - if (iris_resource_unfinished_aux_import(res)) - iris_resource_finish_aux_import(ctx->screen, res); - if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) { /* Replace the backing storage with a fresh buffer for non-async maps */ if (!(usage & (PIPE_MAP_UNSYNCHRONIZED | @@ -2044,7 +2220,7 @@ iris_transfer_map(struct pipe_context *ctx, * other devices that I915_GEM_MMAP cannot work with. 
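
Pulling the scattered conditions of this function together, the direct-CPU-map policy after this patch amounts to roughly the following (a summary helper for illustration only; the real checks stay inline in iris_transfer_map()):

static bool
can_map_directly(const struct iris_resource *res, unsigned usage,
                 bool map_would_stall)
{
   if (res->surf.tiling != ISL_TILING_LINEAR)  /* no CPU (de)tiling here */
      return false;
   if (iris_bo_is_imported(res->bo))           /* dma-buf imports, etc. */
      return false;
   if (map_would_stall)
      return false;
   if (isl_aux_usage_has_compression(res->aux.usage))
      return false;
   /* Reading through anything but a write-back mapping is painfully slow. */
   if ((usage & PIPE_MAP_READ) && iris_bo_mmap_mode(res->bo) != IRIS_MMAP_WB)
      return false;
   return true;
}
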
*/ if ((usage & PIPE_MAP_DIRECTLY) && - (surf->tiling != ISL_TILING_LINEAR || res->bo->imported)) + (surf->tiling != ISL_TILING_LINEAR || iris_bo_is_imported(res->bo))) return NULL; bool map_would_stall = false; @@ -2087,7 +2263,7 @@ iris_transfer_map(struct pipe_context *ctx, if (usage & PIPE_MAP_WRITE) util_range_add(&res->base.b, &res->valid_buffer_range, box->x, box->x + box->width); - if (res->bo->mmap_mode != IRIS_MMAP_NONE) { + if (iris_bo_mmap_mode(res->bo) != IRIS_MMAP_NONE) { /* GPU copies are not useful for buffer reads. Instead of stalling to * read from the original buffer, we'd simply copy it to a temporary... * then stall (a bit longer) to read from that buffer. @@ -2111,11 +2287,16 @@ iris_transfer_map(struct pipe_context *ctx, */ if (!map_would_stall && !isl_aux_usage_has_compression(res->aux.usage) && - !((usage & PIPE_MAP_READ) && res->bo->mmap_mode != IRIS_MMAP_WB)) { + !((usage & PIPE_MAP_READ) && + iris_bo_mmap_mode(res->bo) != IRIS_MMAP_WB)) { usage |= PIPE_MAP_DIRECTLY; } } + /* TODO: Teach iris_map_tiled_memcpy about Tile4... */ + if (res->surf.tiling == ISL_TILING_4) + usage &= ~PIPE_MAP_DIRECTLY; + if (!(usage & PIPE_MAP_DIRECTLY)) { /* If we need a synchronous mapping and the resource is busy, or needs * resolving, we copy to/from a linear temporary buffer using the GPU. @@ -2245,20 +2426,20 @@ iris_texture_subdata(struct pipe_context *ctx, assert(resource->target != PIPE_BUFFER); - if (iris_resource_unfinished_aux_import(res)) - iris_resource_finish_aux_import(ctx->screen, res); - /* Just use the transfer-based path for linear buffers - it will already * do a direct mapping, or a simple linear staging buffer. * * Linear staging buffers appear to be better than tiled ones, too, so * take that path if we need the GPU to perform color compression, or * stall-avoidance blits. + * + * TODO: Teach isl_memcpy_linear_to_tiled about Tile4... 
*/ if (surf->tiling == ISL_TILING_LINEAR || + surf->tiling == ISL_TILING_4 || isl_aux_usage_has_compression(res->aux.usage) || resource_is_busy(ice, res) || - res->bo->mmap_mode == IRIS_MMAP_NONE) { + iris_bo_mmap_mode(res->bo) == IRIS_MMAP_NONE) { return u_default_texture_subdata(ctx, resource, level, usage, box, data, stride, layer_stride); } @@ -2309,13 +2490,39 @@ void iris_dirty_for_history(struct iris_context *ice, struct iris_resource *res) { + const uint64_t stages = res->bind_stages; + uint64_t dirty = 0ull; uint64_t stage_dirty = 0ull; if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) { - stage_dirty |= ((uint64_t)res->bind_stages) - << IRIS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS; + for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) { + if (stages & (1u << stage)) { + struct iris_shader_state *shs = &ice->state.shaders[stage]; + shs->dirty_cbufs |= ~0u; + } + } + dirty |= IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES | + IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES; + stage_dirty |= (stages << IRIS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS); } + if (res->bind_history & (PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_SHADER_IMAGE)) { + dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES | + IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES; + stage_dirty |= (stages << IRIS_SHIFT_FOR_STAGE_DIRTY_BINDINGS); + } + + if (res->bind_history & PIPE_BIND_SHADER_BUFFER) { + dirty |= IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES | + IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES; + stage_dirty |= (stages << IRIS_SHIFT_FOR_STAGE_DIRTY_BINDINGS); + } + + if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) + dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES; + + ice->state.dirty |= dirty; ice->state.stage_dirty |= stage_dirty; } @@ -2382,20 +2589,6 @@ iris_resource_set_clear_color(struct iris_context *ice, return false; } -union isl_color_value -iris_resource_get_clear_color(const struct iris_resource *res, - struct iris_bo **clear_color_bo, - uint64_t *clear_color_offset) -{ - assert(res->aux.bo); - - if (clear_color_bo) - *clear_color_bo = res->aux.clear_color_bo; - if (clear_color_offset) - *clear_color_offset = res->aux.clear_color_offset; - return res->aux.clear_color; -} - static enum pipe_format iris_resource_get_internal_format(struct pipe_resource *p_res) { diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_resource.h b/mesa 3D driver/src/gallium/drivers/iris/iris_resource.h index 75888a86bd..1acef60bd5 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_resource.h +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_resource.h @@ -332,10 +332,6 @@ void iris_get_depth_stencil_resources(struct pipe_resource *res, bool iris_resource_set_clear_color(struct iris_context *ice, struct iris_resource *res, union isl_color_value color); -union isl_color_value -iris_resource_get_clear_color(const struct iris_resource *res, - struct iris_bo **clear_color_bo, - uint64_t *clear_color_offset); void iris_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, @@ -492,16 +488,6 @@ enum isl_aux_usage iris_image_view_aux_usage(struct iris_context *ice, enum isl_format iris_image_view_get_format(struct iris_context *ice, const struct pipe_image_view *img); -static inline bool -iris_resource_unfinished_aux_import(struct iris_resource *res) -{ - return res->aux.bo == NULL && res->mod_info && - res->mod_info->aux_usage != ISL_AUX_USAGE_NONE; -} - -void iris_resource_finish_aux_import(struct pipe_screen *pscreen, - struct iris_resource *res); - bool iris_has_invalid_primary(const struct iris_resource *res, unsigned start_level, unsigned 
num_levels, unsigned start_layer, unsigned num_layers); diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_screen.c b/mesa 3D driver/src/gallium/drivers/iris/iris_screen.c index 8ea97a2e2b..afc83fa614 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_screen.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_screen.c @@ -559,7 +559,7 @@ iris_get_compute_param(struct pipe_screen *pscreen, RET((uint64_t []) { 64 * 1024 }); case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: - RET((uint32_t []) { 0 }); + RET((uint32_t []) { 1 }); case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: RET((uint32_t []) { BRW_SUBGROUP_SIZE }); @@ -651,38 +651,6 @@ iris_get_disk_shader_cache(struct pipe_screen *pscreen) return screen->disk_cache; } -static void -iris_set_max_shader_compiler_threads(struct pipe_screen *pscreen, - unsigned max_threads) -{ - struct iris_screen *screen = (struct iris_screen *) pscreen; - util_queue_adjust_num_threads(&screen->shader_compiler_queue, max_threads); -} - -static bool -iris_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen, - void *v_shader, - enum pipe_shader_type p_stage) -{ - struct iris_screen *screen = (struct iris_screen *) pscreen; - - /* Threaded compilation is only used for the precompile. If precompile is - * disabled, threaded compilation is "done." - */ - if (!screen->precompile) - return true; - - struct iris_uncompiled_shader *ish = v_shader; - - /* When precompile is enabled, the first entry is the precompile variant. - * Check the ready fence of the precompile variant. - */ - struct iris_compiled_shader *first = - list_first_entry(&ish->variants, struct iris_compiled_shader, link); - - return util_queue_fence_is_signalled(&first->ready); -} - static int iris_getparam(int fd, int param, int *value) { @@ -737,7 +705,7 @@ iris_shader_perf_log(void *data, unsigned *id, const char *fmt, ...) 
 va_list args;
 va_start(args, fmt);
 
-   if (INTEL_DEBUG & DEBUG_PERF) {
+   if (INTEL_DEBUG(DEBUG_PERF)) {
       va_list args_copy;
       va_copy(args_copy, args);
       vfprintf(stderr, fmt, args_copy);
@@ -768,7 +736,10 @@ iris_init_identifier_bo(struct iris_screen *screen)
    if (!bo_map)
       return false;
 
-   screen->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE | EXEC_OBJECT_ASYNC;
+   assert(iris_bo_is_real(screen->workaround_bo));
+
+   screen->workaround_bo->real.kflags |=
+      EXEC_OBJECT_CAPTURE | EXEC_OBJECT_ASYNC;
 
    screen->workaround_address = (struct iris_address) {
       .bo = screen->workaround_bo,
       .offset = ALIGN(
@@ -804,7 +775,6 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    if (!intel_get_device_info_from_fd(fd, &screen->devinfo))
       return NULL;
    screen->pci_id = screen->devinfo.chipset_id;
-   screen->no_hw = screen->devinfo.no_hw;
 
    p_atomic_set(&screen->refcount, 1);
 
@@ -831,12 +801,11 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    screen->fd = iris_bufmgr_get_fd(screen->bufmgr);
    screen->winsys_fd = fd;
 
-   if (getenv("INTEL_NO_HW") != NULL)
-      screen->no_hw = true;
+   screen->id = iris_bufmgr_create_screen_id(screen->bufmgr);
 
    screen->workaround_bo = iris_bo_alloc(screen->bufmgr, "workaround", 4096, 1,
-                                         IRIS_MEMZONE_OTHER, 0);
+                                         IRIS_MEMZONE_OTHER, BO_ALLOC_NO_SUBALLOC);
    if (!screen->workaround_bo)
       return NULL;
 
@@ -874,9 +843,6 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    slab_create_parent(&screen->transfer_pool, sizeof(struct iris_transfer), 64);
 
-   screen->subslice_total = intel_device_info_subslice_total(&screen->devinfo);
-   assert(screen->subslice_total >= 1);
-
    iris_detect_kernel_features(screen);
 
    struct pipe_screen *pscreen = &screen->base;
@@ -904,8 +870,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    pscreen->query_memory_info = iris_query_memory_info;
    pscreen->get_driver_query_group_info = iris_get_monitor_group_info;
    pscreen->get_driver_query_info = iris_get_monitor_info;
-   pscreen->is_parallel_shader_compilation_finished = iris_is_parallel_shader_compilation_finished;
-   pscreen->set_max_shader_compiler_threads = iris_set_max_shader_compiler_threads;
+   iris_init_screen_program_functions(pscreen);
 
    genX_call(&screen->devinfo, init_screen_state, screen);
 
diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_screen.h b/mesa 3D driver/src/gallium/drivers/iris/iris_screen.h
index a1c0588ecd..b789f6b1b3 100644
--- a/mesa 3D driver/src/gallium/drivers/iris/iris_screen.h
+++ b/mesa 3D driver/src/gallium/drivers/iris/iris_screen.h
@@ -164,8 +164,6 @@ struct iris_screen {
    /** PCI ID for our GPU device */
    int pci_id;
 
-   bool no_hw;
-
    struct iris_vtable vtbl;
 
    /** Global program_string_id counter (see get_program_string_id()) */
@@ -187,8 +185,6 @@ struct iris_screen {
    unsigned kernel_features;
 #define KERNEL_HAS_WAIT_FOR_SUBMIT (1<<0)
 
-   unsigned subslice_total;
-
    uint64_t aperture_bytes;
 
    /**
@@ -225,6 +221,9 @@ struct iris_screen {
    struct disk_cache *disk_cache;
 
    struct intel_measure_device measure;
+
+   /** Every screen on a bufmgr has a unique ID assigned by the bufmgr.
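
iris_bufmgr_create_screen_id() itself is not shown in this excerpt; presumably it is a small per-bufmgr monotonic counter, along these lines (hypothetical sketch, including the next_screen_id field, both inferred from the call site and the comment above):

int
iris_bufmgr_create_screen_id(struct iris_bufmgr *bufmgr)
{
   /* p_atomic_inc_return() is the util wrapper for an atomic increment;
    * keeping the counter in the bufmgr makes IDs unique per bufmgr.
    * next_screen_id is a hypothetical field for this sketch. */
   return p_atomic_inc_return(&bufmgr->next_screen_id) - 1;
}
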
*/ + int id; }; struct pipe_screen * diff --git a/mesa 3D driver/src/gallium/drivers/iris/iris_state.c b/mesa 3D driver/src/gallium/drivers/iris/iris_state.c index 9dfd6775d9..0cd8ce8ee9 100644 --- a/mesa 3D driver/src/gallium/drivers/iris/iris_state.c +++ b/mesa 3D driver/src/gallium/drivers/iris/iris_state.c @@ -382,8 +382,6 @@ emit_state(struct iris_batch *batch, static void flush_before_state_base_change(struct iris_batch *batch) { - const struct intel_device_info *devinfo = &batch->screen->devinfo; - /* Flush before emitting STATE_BASE_ADDRESS. * * This isn't documented anywhere in the PRM. However, it seems to be @@ -409,18 +407,7 @@ flush_before_state_base_change(struct iris_batch *batch) "change STATE_BASE_ADDRESS (flushes)", PIPE_CONTROL_RENDER_TARGET_FLUSH | PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_DATA_CACHE_FLUSH | - /* Wa_1606662791: - * - * Software must program PIPE_CONTROL command - * with "HDC Pipeline Flush" prior to - * programming of the below two non-pipeline - * state : - * * STATE_BASE_ADDRESS - * * 3DSTATE_BINDING_TABLE_POOL_ALLOC - */ - ((GFX_VER == 12 && devinfo->revision == 0 /* A0 */ ? - PIPE_CONTROL_FLUSH_HDC : 0))); + PIPE_CONTROL_DATA_CACHE_FLUSH); } static void @@ -925,6 +912,8 @@ gfx12_upload_pixel_hashing_tables(struct iris_batch *batch) static void iris_alloc_push_constants(struct iris_batch *batch) { + const struct intel_device_info *devinfo = &batch->screen->devinfo; + /* For now, we set a static partitioning of the push constant area, * assuming that all stages could be in use. * @@ -934,11 +923,17 @@ iris_alloc_push_constants(struct iris_batch *batch) * enabling/disabling it like i965 does. This would be more * stalls and may not actually help; we don't know yet. */ + + /* Divide as equally as possible with any remainder given to FRAGMENT. */ + const unsigned push_constant_kb = devinfo->max_constant_urb_size_kb; + const unsigned stage_size = push_constant_kb / 5; + const unsigned frag_size = push_constant_kb - 4 * stage_size; + for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) { iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) { alloc._3DCommandSubOpcode = 18 + i; - alloc.ConstantBufferOffset = 6 * i; - alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? 8 : 6; + alloc.ConstantBufferOffset = stage_size * i; + alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? frag_size : stage_size; } } } @@ -1140,10 +1135,17 @@ struct iris_depth_buffer_state { uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) + GENX(3DSTATE_STENCIL_BUFFER_length) + GENX(3DSTATE_HIER_DEPTH_BUFFER_length) + - GENX(3DSTATE_CLEAR_PARAMS_length) + - GENX(MI_LOAD_REGISTER_IMM_length) * 2]; + GENX(3DSTATE_CLEAR_PARAMS_length)]; }; +#if GFX_VERx10 == 120 + enum iris_depth_reg_mode { + IRIS_DEPTH_REG_MODE_HW_DEFAULT = 0, + IRIS_DEPTH_REG_MODE_D16, + IRIS_DEPTH_REG_MODE_UNKNOWN, + }; +#endif + /** * Generation-specific context state (ice->state.genx->...). * @@ -1167,6 +1169,10 @@ struct iris_genx_state { bool object_preemption; #endif +#if GFX_VERx10 == 120 + enum iris_depth_reg_mode depth_reg_mode; +#endif + struct { #if GFX_VER == 8 struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES]; @@ -2062,8 +2068,9 @@ iris_bind_sampler_states(struct pipe_context *ctx, bool dirty = false; for (int i = 0; i < count; i++) { - if (shs->samplers[start + i] != states[i]) { - shs->samplers[start + i] = states[i]; + struct iris_sampler_state *state = states ? 
states[i] : NULL; + if (shs->samplers[start + i] != state) { + shs->samplers[start + i] = state; dirty = true; } } @@ -2329,19 +2336,17 @@ fill_surface_state(struct isl_device *isl_dev, .y_offset_sa = tile_y_sa, }; - assert(!iris_resource_unfinished_aux_import(res)); - if (aux_usage != ISL_AUX_USAGE_NONE) { f.aux_surf = &res->aux.surf; f.aux_usage = aux_usage; - f.aux_address = res->aux.bo->address + res->aux.offset; + f.clear_color = res->aux.clear_color; - struct iris_bo *clear_bo = NULL; - uint64_t clear_offset = 0; - f.clear_color = - iris_resource_get_clear_color(res, &clear_bo, &clear_offset); - if (clear_bo) { - f.clear_address = clear_bo->address + clear_offset; + if (res->aux.bo) + f.aux_address = res->aux.bo->address + res->aux.offset; + + if (res->aux.clear_color_bo) { + f.clear_address = res->aux.clear_color_bo->address + + res->aux.clear_color_offset; f.use_clear_address = isl_dev->info->ver > 9; } } @@ -2415,13 +2420,15 @@ iris_create_sampler_view(struct pipe_context *ctx, if (tmpl->target != PIPE_BUFFER) { isv->view.base_level = tmpl->u.tex.first_level; isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1; - // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2? - isv->view.base_array_layer = tmpl->u.tex.first_layer; - isv->view.array_len = - tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1; - if (iris_resource_unfinished_aux_import(isv->res)) - iris_resource_finish_aux_import(&screen->base, isv->res); + if (tmpl->target == PIPE_TEXTURE_3D) { + isv->view.base_array_layer = 0; + isv->view.array_len = 1; + } else { + isv->view.base_array_layer = tmpl->u.tex.first_layer; + isv->view.array_len = + tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1; + } unsigned aux_modes = isv->res->aux.sampler_usages; while (aux_modes) { @@ -2581,9 +2588,6 @@ iris_create_surface(struct pipe_context *ctx, #endif if (!isl_format_is_compressed(res->surf.format)) { - if (iris_resource_unfinished_aux_import(res)) - iris_resource_finish_aux_import(&screen->base, res); - void *map = surf->surface_state.cpu; UNUSED void *map_read = surf->surface_state_read.cpu; @@ -2806,6 +2810,7 @@ iris_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type p_stage, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct iris_context *ice = (struct iris_context *) ctx; @@ -2818,8 +2823,15 @@ iris_set_sampler_views(struct pipe_context *ctx, for (i = 0; i < count; i++) { struct pipe_sampler_view *pview = views ? 
views[i] : NULL; - pipe_sampler_view_reference((struct pipe_sampler_view **) - &shs->textures[start + i], pview); + + if (take_ownership) { + pipe_sampler_view_reference((struct pipe_sampler_view **) + &shs->textures[start + i], NULL); + shs->textures[start + i] = (struct iris_sampler_view *)pview; + } else { + pipe_sampler_view_reference((struct pipe_sampler_view **) + &shs->textures[start + i], pview); + } struct iris_sampler_view *view = (void *) pview; if (view) { view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW; @@ -2893,6 +2905,14 @@ iris_set_tess_state(struct pipe_context *ctx, shs->sysvals_need_upload = true; } +static void +iris_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) +{ + struct iris_context *ice = (struct iris_context *) ctx; + + ice->state.patch_vertices = patch_vertices; +} + static void iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf) { @@ -3199,6 +3219,12 @@ iris_set_constant_buffer(struct pipe_context *ctx, assert(map); memcpy(map, input->user_buffer, input->buffer_size); } else if (input->buffer) { + if (cbuf->buffer != input->buffer) { + ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES | + IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES); + shs->dirty_cbufs |= 1u << index; + } + if (take_ownership) { pipe_resource_reference(&cbuf->buffer, NULL); cbuf->buffer = input->buffer; @@ -3368,6 +3394,8 @@ iris_set_shader_buffers(struct pipe_context *ctx, } } + ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES | + IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES); ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage; } @@ -3409,6 +3437,10 @@ iris_set_vertex_buffers(struct pipe_context *ctx, /* We may see user buffers that are NULL bindings. */ assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL)); + if (buffer->buffer.resource && + state->resource != buffer->buffer.resource) + ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES; + if (take_ownership) { pipe_resource_reference(&state->resource, NULL); state->resource = buffer->buffer.resource; @@ -4380,17 +4412,8 @@ iris_store_tes_state(const struct intel_device_info *devinfo, struct brw_vue_prog_data *vue_prog_data = (void *) prog_data; struct brw_tes_prog_data *tes_prog_data = (void *) prog_data; - uint32_t *te_state = (void *) shader->derived_data; - uint32_t *ds_state = te_state + GENX(3DSTATE_TE_length); - - iris_pack_command(GENX(3DSTATE_TE), te_state, te) { - te.Partitioning = tes_prog_data->partitioning; - te.OutputTopology = tes_prog_data->output_topology; - te.TEDomain = tes_prog_data->domain; - te.TEEnable = true; - te.MaximumTessellationFactorOdd = 63.0; - te.MaximumTessellationFactorNotOdd = 64.0; - } + uint32_t *ds_state = (void *) shader->derived_data; + uint32_t *te_state = ds_state + GENX(3DSTATE_DS_length); iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) { INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL); @@ -4404,6 +4427,24 @@ iris_store_tes_state(const struct intel_device_info *devinfo, vue_prog_data->cull_distance_mask; } + iris_pack_command(GENX(3DSTATE_TE), te_state, te) { + te.Partitioning = tes_prog_data->partitioning; + te.OutputTopology = tes_prog_data->output_topology; + te.TEDomain = tes_prog_data->domain; + te.TEEnable = true; + te.MaximumTessellationFactorOdd = 63.0; + te.MaximumTessellationFactorNotOdd = 64.0; +#if GFX_VERx10 >= 125 + te.TessellationDistributionMode = TEDMODE_RR_FREE; + te.TessellationDistributionLevel = TEDLEVEL_PATCH; + /* 64_TRIANGLES */ + te.SmallPatchThreshold = 3; + /* 1K_TRIANGLES */ 
+ te.TargetBlockSize = 8; + /* 1K_TRIANGLES */ + te.LocalBOPAccumulatorThreshold = 1; +#endif + } } /** @@ -4761,25 +4802,25 @@ use_surface(struct iris_context *ice, &surf->surface_state); } - if (res->aux.bo) { - iris_use_pinned_bo(batch, res->aux.bo, writeable, access); - if (res->aux.clear_color_bo) - iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access); - - if (memcmp(&res->aux.clear_color, &surf->clear_color, - sizeof(surf->clear_color)) != 0) { - update_clear_value(ice, batch, res, &surf->surface_state, - res->aux.possible_usages, &surf->view); - if (GFX_VER == 8) { - update_clear_value(ice, batch, res, &surf->surface_state_read, - res->aux.possible_usages, &surf->read_view); - } - surf->clear_color = res->aux.clear_color; + if (memcmp(&res->aux.clear_color, &surf->clear_color, + sizeof(surf->clear_color)) != 0) { + update_clear_value(ice, batch, res, &surf->surface_state, + res->aux.possible_usages, &surf->view); + if (GFX_VER == 8) { + update_clear_value(ice, batch, res, &surf->surface_state_read, + res->aux.possible_usages, &surf->read_view); } + surf->clear_color = res->aux.clear_color; } - iris_use_pinned_bo(batch, iris_resource_bo(p_surf->texture), - writeable, access); + if (res->aux.clear_color_bo) + iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access); + + if (res->aux.bo) + iris_use_pinned_bo(batch, res->aux.bo, writeable, access); + + iris_use_pinned_bo(batch, res->bo, writeable, access); + if (GFX_VER == 8 && is_read_surface) { iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state_read.ref.res), false, IRIS_DOMAIN_NONE); @@ -4807,18 +4848,21 @@ use_sampler_view(struct iris_context *ice, if (!isv->surface_state.ref.res) upload_surface_states(ice->state.surface_uploader, &isv->surface_state); + if (memcmp(&isv->res->aux.clear_color, &isv->clear_color, + sizeof(isv->clear_color)) != 0) { + update_clear_value(ice, batch, isv->res, &isv->surface_state, + isv->res->aux.sampler_usages, &isv->view); + isv->clear_color = isv->res->aux.clear_color; + } + + if (isv->res->aux.clear_color_bo) { + iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo, + false, IRIS_DOMAIN_OTHER_READ); + } + if (isv->res->aux.bo) { iris_use_pinned_bo(batch, isv->res->aux.bo, false, IRIS_DOMAIN_OTHER_READ); - if (isv->res->aux.clear_color_bo) - iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo, - false, IRIS_DOMAIN_OTHER_READ); - if (memcmp(&isv->res->aux.clear_color, &isv->clear_color, - sizeof(isv->clear_color)) != 0) { - update_clear_value(ice, batch, isv->res, &isv->surface_state, - isv->res->aux.sampler_usages, &isv->view); - isv->clear_color = isv->res->aux.clear_color; - } } iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_OTHER_READ); @@ -5203,7 +5247,7 @@ iris_restore_render_saved_bos(struct iris_context *ice, } iris_use_optional_res(batch, ice->state.last_res.index_buffer, false, - IRIS_DOMAIN_OTHER_READ); + IRIS_DOMAIN_VF_READ); if (clean & IRIS_DIRTY_VERTEX_BUFFERS) { uint64_t bound = ice->state.bound_vertex_buffers; @@ -5211,7 +5255,7 @@ iris_restore_render_saved_bos(struct iris_context *ice, const int i = u_bit_scan64(&bound); struct pipe_resource *res = genx->vertex_buffers[i].resource; iris_use_pinned_bo(batch, iris_resource_bo(res), false, - IRIS_DOMAIN_OTHER_READ); + IRIS_DOMAIN_VF_READ); } } } @@ -5522,6 +5566,60 @@ emit_push_constant_packet_all(struct iris_context *ice, } #endif +void +genX(emit_depth_state_workarounds)(struct iris_context *ice, + struct iris_batch *batch, + const struct isl_surf *surf) +{ +#if GFX_VERx10 == 120 + 
const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM; + + switch (ice->state.genx->depth_reg_mode) { + case IRIS_DEPTH_REG_MODE_HW_DEFAULT: + if (!fmt_is_d16) + return; + break; + case IRIS_DEPTH_REG_MODE_D16: + if (fmt_is_d16) + return; + break; + case IRIS_DEPTH_REG_MODE_UNKNOWN: + break; + } + + /* We'll change some CHICKEN registers depending on the depth surface + * format. Do a depth flush and stall so the pipeline is not using these + * settings while we change the registers. + */ + iris_emit_end_of_pipe_sync(batch, + "Workaround: Stop pipeline for 14010455700", + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DEPTH_CACHE_FLUSH); + + /* Wa_14010455700 + * + * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer + * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”. + */ + iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), reg) { + reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1; + reg.HIZPlaneOptimizationdisablebitMask = true; + } + + /* Wa_1806527549 + * + * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM. + */ + iris_emit_reg(batch, GENX(HIZ_CHICKEN), reg) { + reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16; + reg.HZDepthTestLEGEOptimizationDisableMask = true; + } + + ice->state.genx->depth_reg_mode = + fmt_is_d16 ? IRIS_DEPTH_REG_MODE_D16 : IRIS_DEPTH_REG_MODE_HW_DEFAULT; +#endif +} + static void iris_upload_dirty_render_state(struct iris_context *ice, struct iris_batch *batch, @@ -6184,27 +6282,30 @@ iris_upload_dirty_render_state(struct iris_context *ice, if (dirty & IRIS_DIRTY_DEPTH_BUFFER) { struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer; - /* Do not emit the clear params yets. We need to update the clear value - * first. - */ - uint32_t clear_length = GENX(3DSTATE_CLEAR_PARAMS_length) * 4; - uint32_t cso_z_size = batch->screen->isl_dev.ds.size - clear_length;; + /* Do not emit the cso yet. We may need to update clear params first. */ + struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; + struct iris_resource *zres = NULL, *sres = NULL; + if (cso_fb->zsbuf) { + iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, + &zres, &sres); + } -#if GFX_VERx10 == 120 - /* Wa_14010455700 - * - * ISL will change some CHICKEN registers depending on the depth surface - * format, along with emitting the depth and stencil packets. In that - * case, we want to do a depth flush and stall, so the pipeline is not - * using these settings while we change the registers. 
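
The point of the tri-state tracking above is to re-emit the two CHICKEN registers only when the D16-ness of the bound depth buffer actually changes; iris_lost_genx_state(), later in this patch, resets the mode to UNKNOWN so a fresh or lost context re-emits them unconditionally. The switch is equivalent to this shorter early-out (sketch):

enum iris_depth_reg_mode wanted =
   fmt_is_d16 ? IRIS_DEPTH_REG_MODE_D16 : IRIS_DEPTH_REG_MODE_HW_DEFAULT;
if (ice->state.genx->depth_reg_mode == wanted)
   return;  /* UNKNOWN never matches, so it always falls through */
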
- */ - iris_emit_end_of_pipe_sync(batch, - "Workaround: Stop pipeline for 14010455700", - PIPE_CONTROL_DEPTH_STALL | - PIPE_CONTROL_DEPTH_CACHE_FLUSH); -#endif + if (zres && ice->state.hiz_usage != ISL_AUX_USAGE_NONE) { + uint32_t *clear_params = + cso_z->packets + ARRAY_SIZE(cso_z->packets) - + GENX(3DSTATE_CLEAR_PARAMS_length); + + iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) { + clear.DepthClearValueValid = true; + clear.DepthClearValue = zres->aux.clear_color.f32[0]; + } + } + + iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets)); + + if (zres) + genX(emit_depth_state_workarounds)(ice, batch, &zres->surf); - iris_batch_emit(batch, cso_z->packets, cso_z_size); if (GFX_VER >= 12) { /* Wa_1408224581 * @@ -6218,24 +6319,6 @@ iris_upload_dirty_render_state(struct iris_context *ice, batch->screen->workaround_address.bo, batch->screen->workaround_address.offset, 0); } - - union isl_color_value clear_value = { .f32 = { 0, } }; - - struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer; - if (cso_fb->zsbuf) { - struct iris_resource *zres, *sres; - iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, - &zres, &sres); - if (zres && zres->aux.bo) - clear_value = iris_resource_get_clear_color(zres, NULL, NULL); - } - - uint32_t clear_params[GENX(3DSTATE_CLEAR_PARAMS_length)]; - iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) { - clear.DepthClearValueValid = true; - clear.DepthClearValue = clear_value.f32[0]; - } - iris_batch_emit(batch, clear_params, clear_length); } if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) { @@ -6260,7 +6343,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, if (dirty & IRIS_DIRTY_VF_TOPOLOGY) { iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { topo.PrimitiveTopologyType = - translate_prim_type(draw->mode, draw->vertices_per_patch); + translate_prim_type(draw->mode, ice->state.vertices_per_patch); } } @@ -6327,7 +6410,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, while (bound) { const int i = u_bit_scan64(&bound); iris_use_optional_res(batch, genx->vertex_buffers[i].resource, - false, IRIS_DOMAIN_OTHER_READ); + false, IRIS_DOMAIN_VF_READ); } #else /* The VF cache designers cut corners, and made the cache key's @@ -6349,7 +6432,7 @@ iris_upload_dirty_render_state(struct iris_context *ice, struct iris_resource *res = (void *) genx->vertex_buffers[i].resource; if (res) { - iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ); + iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_VF_READ); high_bits = res->bo->address >> 32ull; if (high_bits != ice->state.last_vbo_high_bits[i]) { @@ -6499,6 +6582,9 @@ iris_upload_dirty_render_state(struct iris_context *ice, if (dirty & IRIS_DIRTY_VF) { iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) { +#if GFX_VERx10 >= 125 + vf.GeometryDistributionEnable = true; +#endif if (draw->primitive_restart) { vf.IndexedDrawCutIndexEnable = true; vf.CutIndex = draw->restart_index; @@ -6506,6 +6592,33 @@ iris_upload_dirty_render_state(struct iris_context *ice, } } +#if GFX_VERx10 >= 125 + if (dirty & IRIS_DIRTY_VFG) { + iris_emit_cmd(batch, GENX(3DSTATE_VFG), vfg) { + /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE*/ + vfg.DistributionMode = + ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL ? 
RR_STRICT : + RR_FREE; + vfg.DistributionGranularity = BatchLevelGranularity; + vfg.ListCutIndexEnable = draw->primitive_restart; + /* 192 vertices for TRILIST_ADJ */ + vfg.ListNBatchSizeScale = 0; + /* Batch size of 384 vertices */ + vfg.List3BatchSizeScale = 2; + /* Batch size of 128 vertices */ + vfg.List2BatchSizeScale = 1; + /* Batch size of 128 vertices */ + vfg.List1BatchSizeScale = 2; + /* Batch size of 256 vertices for STRIP topologies */ + vfg.StripBatchSizeScale = 3; + /* 192 control points for PATCHLIST_3 */ + vfg.PatchBatchSizeScale = 1; + /* 192 control points for PATCHLIST_3 */ + vfg.PatchBatchSizeMultiplier = 31; + } + } +#endif + if (dirty & IRIS_DIRTY_VF_STATISTICS) { iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) { vf.StatisticsEnable = true; @@ -6527,6 +6640,18 @@ iris_upload_dirty_render_state(struct iris_context *ice, #endif } +static void +flush_vbos(struct iris_context *ice, struct iris_batch *batch) +{ + struct iris_genx_state *genx = ice->state.genx; + uint64_t bound = ice->state.bound_vertex_buffers; + while (bound) { + const int i = u_bit_scan64(&bound); + struct iris_bo *bo = iris_resource_bo(genx->vertex_buffers[i].resource); + iris_emit_buffer_barrier_for(batch, bo, IRIS_DOMAIN_VF_READ); + } +} + static void iris_upload_render_state(struct iris_context *ice, struct iris_batch *batch, @@ -6537,6 +6662,9 @@ iris_upload_render_state(struct iris_context *ice, { bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT; + if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES) + flush_vbos(ice, batch); + iris_batch_sync_region_start(batch); /* Always pin the binder. If we're emitting new binding table pointers, @@ -6588,6 +6716,8 @@ iris_upload_render_state(struct iris_context *ice, pipe_resource_reference(&ice->state.last_res.index_buffer, draw->index.resource); offset = 0; + + iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ); } struct iris_genx_state *genx = ice->state.genx; @@ -6608,7 +6738,7 @@ iris_upload_render_state(struct iris_context *ice, if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) { memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet)); iris_batch_emit(batch, ib_packet, sizeof(ib_packet)); - iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_OTHER_READ); + iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ); } #if GFX_VER < 11 @@ -6640,10 +6770,6 @@ iris_upload_render_state(struct iris_context *ice, unsigned draw_count_offset = indirect->indirect_draw_count_offset; - iris_emit_pipe_control_flush(batch, - "ensure indirect draw buffer is flushed", - PIPE_CONTROL_FLUSH_ENABLE); - if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) { struct mi_builder b; mi_builder_init(&b, &batch->screen->devinfo, batch); @@ -6821,7 +6947,7 @@ iris_upload_compute_walker(struct iris_context *ice, if (stage_dirty & IRIS_STAGE_DIRTY_CS) { iris_emit_cmd(batch, GENX(CFE_STATE), cfe) { cfe.MaximumNumberofThreads = - devinfo->max_cs_threads * screen->subslice_total - 1; + devinfo->max_cs_threads * devinfo->subslice_total - 1; if (prog_data->total_scratch > 0) { cfe.ScratchSpaceBuffer = iris_get_scratch_surf(ice, prog_data->total_scratch)->offset >> 4; @@ -6848,7 +6974,7 @@ iris_upload_compute_walker(struct iris_context *ice, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->total_shared), - .BarrierEnable = cs_prog_data->uses_barrier, + .NumberOfBarriers = cs_prog_data->uses_barrier, .SamplerStatePointer = 
shs->sampler_table.offset, .BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE], }; @@ -6904,7 +7030,7 @@ iris_upload_gpgpu_walker(struct iris_context *ice, } vfe.MaximumNumberofThreads = - devinfo->max_cs_threads * screen->subslice_total - 1; + devinfo->max_cs_threads * devinfo->subslice_total - 1; #if GFX_VER < 11 vfe.ResetGatewayTimer = Resettingrelativetimerandlatchingtheglobaltimestamp; @@ -7165,7 +7291,8 @@ iris_rebind_buffer(struct iris_context *ice, if (*addr != bo->address + state->offset) { *addr = bo->address + state->offset; - ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS; + ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS | + IRIS_DIRTY_VERTEX_BUFFER_FLUSHES; } } } @@ -7216,6 +7343,9 @@ iris_rebind_buffer(struct iris_context *ice, if (res->bo == iris_resource_bo(cbuf->buffer)) { pipe_resource_reference(&surf_state->res, NULL); + shs->dirty_cbufs |= 1u << i; + ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES | + IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES); ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s; } } @@ -7288,12 +7418,17 @@ batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags) if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE); + if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) + iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE); + if ((flags & PIPE_CONTROL_FLUSH_ENABLE)) iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE); if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS | - PIPE_CONTROL_STALL_AT_SCOREBOARD))) + PIPE_CONTROL_STALL_AT_SCOREBOARD))) { + iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_VF_READ); iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ); + } } if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) @@ -7302,9 +7437,15 @@ batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags) if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE); + if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH)) + iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DATA_WRITE); + if ((flags & PIPE_CONTROL_FLUSH_ENABLE)) iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE); + if ((flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) + iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_VF_READ); + if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) && (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE)) iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_READ); @@ -7406,8 +7547,7 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, imm); } - if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0*/)) && - IS_COMPUTE_PIPELINE(batch) && post_sync_flags) { + if (GFX_VER == 9 && IS_COMPUTE_PIPELINE(batch) && post_sync_flags) { /* Project: SKL / Argument: LRI Post Sync Operation [23] * * "PIPECONTROL command with “Command Streamer Stall Enable” must be @@ -7416,8 +7556,6 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, * PIPELINE_SELECT command is set to GPGPU mode of operation)." * * The same text exists a few rows below for Post Sync Op. - * - * On Gfx12 this is Wa_1607156449. 
*/ iris_emit_raw_pipe_control(batch, "workaround: CS stall before gpgpu post-sync", @@ -7692,7 +7830,7 @@ iris_emit_raw_pipe_control(struct iris_batch *batch, /* Emit --------------------------------------------------------------- */ - if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) { + if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { fprintf(stderr, " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n", (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "", @@ -7839,6 +7977,10 @@ iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch) { struct iris_genx_state *genx = ice->state.genx; +#if GFX_VERx10 == 120 + genx->depth_reg_mode = IRIS_DEPTH_REG_MODE_UNKNOWN; +#endif + memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer)); } @@ -8030,6 +8172,7 @@ genX(init_state)(struct iris_context *ice) ctx->set_compute_resources = iris_set_compute_resources; ctx->set_global_binding = iris_set_global_binding; ctx->set_tess_state = iris_set_tess_state; + ctx->set_patch_vertices = iris_set_patch_vertices; ctx->set_framebuffer_state = iris_set_framebuffer_state; ctx->set_polygon_stipple = iris_set_polygon_stipple; ctx->set_sample_mask = iris_set_sample_mask; diff --git a/mesa 3D driver/src/gallium/drivers/lima/ci/deqp-lima-fails.txt b/mesa 3D driver/src/gallium/drivers/lima/ci/deqp-lima-fails.txt index 7bc535e833..4232bffe76 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ci/deqp-lima-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/lima/ci/deqp-lima-fails.txt @@ -30,21 +30,10 @@ dEQP-GLES2.functional.fragment_ops.depth_stencil.random.7,Fail dEQP-GLES2.functional.fragment_ops.depth_stencil.random.8,Fail dEQP-GLES2.functional.fragment_ops.depth_stencil.random.9,Fail dEQP-GLES2.functional.fragment_ops.depth_stencil.write_mask.stencil,Fail -dEQP-GLES2.functional.negative_api.shader.uniform_matrixfv_invalid_transpose,Fail -dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level_array_compressed,Fail -dEQP-GLES2.functional.shaders.builtin_variable.frontfacing,Fail dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_dynamic_loop_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_dynamic_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_static_loop_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_static_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_const_write_dynamic_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_const_write_dynamic_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_dynamic_loop_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_dynamic_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_static_loop_read_vertex,Fail diff --git a/mesa 3D 
driver/src/gallium/drivers/lima/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/drivers/lima/ci/gitlab-ci.yml index ee42f3c639..93fae88e3e 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/gallium/drivers/lima/ci/gitlab-ci.yml @@ -7,7 +7,7 @@ lima-mali450-test:arm64: DTB: ${DEVICE_TYPE} FDO_HTTP_CACHE_URI: '' GPU_VERSION: lima - DEQP_PARALLEL: 4 + FDO_CI_CONCURRENT: 4 DEQP_EXPECTED_RENDERER: Mali450 VISIBILITY_GROUP: "mesa-ci" diff --git a/mesa 3D driver/src/gallium/drivers/lima/drm-shim/lima_noop.c b/mesa 3D driver/src/gallium/drivers/lima/drm-shim/lima_noop.c new file mode 100644 index 0000000000..2732029075 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/lima/drm-shim/lima_noop.c @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2021 Icecream95 + * Copyright (C) 2019 Google LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "drm-shim/drm_shim.h" +#include "drm-uapi/lima_drm.h" + +#include "util/u_math.h" + +bool drm_shim_driver_prefers_first_render_node = true; + +static int +lima_ioctl_noop(int fd, unsigned long request, void *arg) +{ + return 0; +} + +static int +lima_ioctl_get_param(int fd, unsigned long request, void *arg) +{ + struct drm_lima_get_param *gp = arg; + + switch (gp->param) { + case DRM_LIMA_PARAM_GPU_ID: + gp->value = DRM_LIMA_PARAM_GPU_ID_MALI450; + return 0; + case DRM_LIMA_PARAM_NUM_PP: + gp->value = 6; + return 0; + default: + fprintf(stderr, "Unknown DRM_IOCTL_LIMA_GET_PARAM %d\n", gp->param); + return -1; + } +} + +static int +lima_ioctl_gem_create(int fd, unsigned long request, void *arg) +{ + struct drm_lima_gem_create *create = arg; + + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct shim_bo *bo = calloc(1, sizeof(*bo)); + size_t size = ALIGN(create->size, 4096); + + drm_shim_bo_init(bo, size); + + create->handle = drm_shim_bo_get_handle(shim_fd, bo); + + drm_shim_bo_put(bo); + + return 0; +} + +static int +lima_ioctl_gem_info(int fd, unsigned long request, void *arg) +{ + struct drm_lima_gem_info *gem_info = arg; + + struct shim_fd *shim_fd = drm_shim_fd_lookup(fd); + struct shim_bo *bo = drm_shim_bo_lookup(shim_fd, gem_info->handle); + + gem_info->va = bo->mem_addr; + gem_info->offset = drm_shim_bo_get_mmap_offset(shim_fd, bo); + + return 0; +} + +static ioctl_fn_t driver_ioctls[] = { + [DRM_LIMA_GET_PARAM] = lima_ioctl_get_param, + [DRM_LIMA_GEM_CREATE] = lima_ioctl_gem_create, + [DRM_LIMA_GEM_INFO] = lima_ioctl_gem_info, + [DRM_LIMA_GEM_SUBMIT] = lima_ioctl_noop, + [DRM_LIMA_GEM_WAIT] = lima_ioctl_noop, + [DRM_LIMA_CTX_CREATE] = lima_ioctl_noop, + [DRM_LIMA_CTX_FREE] = lima_ioctl_noop, +}; + +void +drm_shim_driver_init(void) +{ + shim_device.bus_type = DRM_BUS_PLATFORM; + shim_device.driver_name = "lima"; + shim_device.driver_ioctls = driver_ioctls; + shim_device.driver_ioctl_count = ARRAY_SIZE(driver_ioctls); + + /* lima uses the DRM version to expose features, instead of getparam. */ + shim_device.version_major = 1; + shim_device.version_minor = 1; + shim_device.version_patchlevel = 0; + + drm_shim_override_file("DRIVER=lima\n" + "OF_FULLNAME=/soc/mali\n" + "OF_COMPATIBLE_0=arm,mali-450\n" + "OF_COMPATIBLE_N=1\n", + "/sys/dev/char/%d:%d/device/uevent", DRM_MAJOR, + render_node_minor); +} diff --git a/mesa 3D driver/src/gallium/drivers/lima/drm-shim/meson.build b/mesa 3D driver/src/gallium/drivers/lima/drm-shim/meson.build new file mode 100644 index 0000000000..a978d3505b --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/lima/drm-shim/meson.build @@ -0,0 +1,29 @@ +# Copyright (C) 2021 Icecream95 +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software.
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +liblima_noop_drm_shim = shared_library( + ['lima_noop_drm_shim'], + 'lima_noop.c', + include_directories: [inc_include, inc_src], + dependencies: dep_drm_shim, + gnu_symbol_visibility : 'hidden', + install : true, +) diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.c b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.c index d9a46f86a9..aa0a0496b0 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.c @@ -608,7 +608,7 @@ bool gpir_codegen_prog(gpir_compiler *comp) if (lima_debug & LIMA_DEBUG_GP) { gpir_codegen_print_prog(comp); - gpir_disassemble_program(code, num_instr); + gpir_disassemble_program(code, num_instr, stdout); } return true; diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.h b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.h index d24b31b41f..f6bf4eb192 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.h +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/codegen.h @@ -161,6 +161,6 @@ typedef struct __attribute__((__packed__)) { unsigned branch_target : 8; } gpir_codegen_instr; -void gpir_disassemble_program(gpir_codegen_instr *code, unsigned num_instr); +void gpir_disassemble_program(gpir_codegen_instr *code, unsigned num_instr, FILE *fp); #endif diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/disasm.c b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/disasm.c index bc0ce3bec4..eb15fdb5e1 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/disasm.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/disasm.c @@ -47,9 +47,9 @@ static const gpir_codegen_store_src gp_unit_to_store_src[num_units] = { }; static void -print_dest(gpir_codegen_instr *instr, gp_unit unit, unsigned cur_dest_index) +print_dest(gpir_codegen_instr *instr, gp_unit unit, unsigned cur_dest_index, FILE *fp) { - printf("^%u", cur_dest_index + unit); + fprintf(fp, "^%u", cur_dest_index + unit); gpir_codegen_store_src src = gp_unit_to_store_src[unit]; @@ -59,54 +59,54 @@ print_dest(gpir_codegen_instr *instr, gp_unit unit, unsigned cur_dest_index) /* Temporary stores ignore the address, and always use whatever's * stored in address register 0. 
*/ - printf("/t[addr0]"); + fprintf(fp, "/t[addr0]"); } else { if (instr->store0_varying) - printf("/v"); + fprintf(fp, "/v"); else - printf("/$"); - printf("%u", instr->store0_addr); + fprintf(fp, "/$"); + fprintf(fp, "%u", instr->store0_addr); } - printf("."); + fprintf(fp, "."); if (instr->store0_src_x == src) - printf("x"); + fprintf(fp, "x"); if (instr->store0_src_y == src) - printf("y"); + fprintf(fp, "y"); } if (instr->store1_src_z == src || instr->store1_src_w == src) { if (instr->store1_temporary) { - printf("/t[addr0]"); + fprintf(fp, "/t[addr0]"); } else { if (instr->store1_varying) - printf("/v"); + fprintf(fp, "/v"); else - printf("/$"); - printf("%u", instr->store1_addr); + fprintf(fp, "/$"); + fprintf(fp, "%u", instr->store1_addr); } - printf("."); + fprintf(fp, "."); if (instr->store1_src_z == src) - printf("z"); + fprintf(fp, "z"); if (instr->store1_src_w == src) - printf("w"); + fprintf(fp, "w"); } if (unit == unit_complex) { switch (instr->complex_op) { case gpir_codegen_complex_op_temp_store_addr: - printf("/addr0"); + fprintf(fp, "/addr0"); break; case gpir_codegen_complex_op_temp_load_addr_0: - printf("/addr1"); + fprintf(fp, "/addr1"); break; case gpir_codegen_complex_op_temp_load_addr_1: - printf("/addr2"); + fprintf(fp, "/addr2"); break; case gpir_codegen_complex_op_temp_load_addr_2: - printf("/addr3"); + fprintf(fp, "/addr3"); break; default: break; @@ -117,14 +117,14 @@ print_dest(gpir_codegen_instr *instr, gp_unit unit, unsigned cur_dest_index) static void print_src(gpir_codegen_src src, gp_unit unit, unsigned unit_src_num, gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, - unsigned cur_dest_index) + unsigned cur_dest_index, FILE *fp) { switch (src) { case gpir_codegen_src_attrib_x: case gpir_codegen_src_attrib_y: case gpir_codegen_src_attrib_z: case gpir_codegen_src_attrib_w: - printf("%c%d.%c", instr->register0_attribute ? 'a' : '$', + fprintf(fp, "%c%d.%c", instr->register0_attribute ? 
'a' : '$', instr->register0_addr, "xyzw"[src - gpir_codegen_src_attrib_x]); break; @@ -132,7 +132,7 @@ print_src(gpir_codegen_src src, gp_unit unit, unsigned unit_src_num, case gpir_codegen_src_register_y: case gpir_codegen_src_register_z: case gpir_codegen_src_register_w: - printf("$%d.%c", instr->register1_addr, + fprintf(fp, "$%d.%c", instr->register1_addr, "xyzw"[src - gpir_codegen_src_register_x]); break; @@ -140,54 +140,54 @@ print_src(gpir_codegen_src src, gp_unit unit, unsigned unit_src_num, case gpir_codegen_src_unknown_1: case gpir_codegen_src_unknown_2: case gpir_codegen_src_unknown_3: - printf("unknown%d", src - gpir_codegen_src_unknown_0); + fprintf(fp, "unknown%d", src - gpir_codegen_src_unknown_0); break; case gpir_codegen_src_load_x: case gpir_codegen_src_load_y: case gpir_codegen_src_load_z: case gpir_codegen_src_load_w: - printf("t[%d", instr->load_addr); + fprintf(fp, "t[%d", instr->load_addr); switch (instr->load_offset) { case gpir_codegen_load_off_ld_addr_0: - printf("+addr1"); + fprintf(fp, "+addr1"); break; case gpir_codegen_load_off_ld_addr_1: - printf("+addr2"); + fprintf(fp, "+addr2"); break; case gpir_codegen_load_off_ld_addr_2: - printf("+addr3"); + fprintf(fp, "+addr3"); break; case gpir_codegen_load_off_none: break; default: - printf("+unk%d", instr->load_offset); + fprintf(fp, "+unk%d", instr->load_offset); } - printf("].%c", "xyzw"[src - gpir_codegen_src_load_x]); + fprintf(fp, "].%c", "xyzw"[src - gpir_codegen_src_load_x]); break; case gpir_codegen_src_p1_acc_0: - printf("^%d", cur_dest_index - 1 * num_units + unit_acc_0); + fprintf(fp, "^%d", cur_dest_index - 1 * num_units + unit_acc_0); break; case gpir_codegen_src_p1_acc_1: - printf("^%d", cur_dest_index - 1 * num_units + unit_acc_1); + fprintf(fp, "^%d", cur_dest_index - 1 * num_units + unit_acc_1); break; case gpir_codegen_src_p1_mul_0: - printf("^%d", cur_dest_index - 1 * num_units + unit_mul_0); + fprintf(fp, "^%d", cur_dest_index - 1 * num_units + unit_mul_0); break; case gpir_codegen_src_p1_mul_1: - printf("^%d", cur_dest_index - 1 * num_units + unit_mul_1); + fprintf(fp, "^%d", cur_dest_index - 1 * num_units + unit_mul_1); break; case gpir_codegen_src_p1_pass: - printf("^%d", cur_dest_index - 1 * num_units + unit_pass); + fprintf(fp, "^%d", cur_dest_index - 1 * num_units + unit_pass); break; case gpir_codegen_src_unused: - printf("unused"); + fprintf(fp, "unused"); break; case gpir_codegen_src_p1_complex: /* Also ident */ @@ -195,48 +195,48 @@ print_src(gpir_codegen_src src, gp_unit unit, unsigned unit_src_num, case unit_acc_0: case unit_acc_1: if (unit_src_num == 1) { - printf("0"); + fprintf(fp, "0"); return; } break; case unit_mul_0: case unit_mul_1: if (unit_src_num == 1) { - printf("1"); + fprintf(fp, "1"); return; } break; default: break; } - printf("^%d", cur_dest_index - 1 * num_units + unit_complex); + fprintf(fp, "^%d", cur_dest_index - 1 * num_units + unit_complex); break; case gpir_codegen_src_p2_pass: - printf("^%d", cur_dest_index - 2 * num_units + unit_pass); + fprintf(fp, "^%d", cur_dest_index - 2 * num_units + unit_pass); break; case gpir_codegen_src_p2_acc_0: - printf("^%d", cur_dest_index - 2 * num_units + unit_acc_0); + fprintf(fp, "^%d", cur_dest_index - 2 * num_units + unit_acc_0); break; case gpir_codegen_src_p2_acc_1: - printf("^%d", cur_dest_index - 2 * num_units + unit_acc_1); + fprintf(fp, "^%d", cur_dest_index - 2 * num_units + unit_acc_1); break; case gpir_codegen_src_p2_mul_0: - printf("^%d", cur_dest_index - 2 * num_units + unit_mul_0); + fprintf(fp, "^%d", 
cur_dest_index - 2 * num_units + unit_mul_0); break; case gpir_codegen_src_p2_mul_1: - printf("^%d", cur_dest_index - 2 * num_units + unit_mul_1); + fprintf(fp, "^%d", cur_dest_index - 2 * num_units + unit_mul_1); break; case gpir_codegen_src_p1_attrib_x: case gpir_codegen_src_p1_attrib_y: case gpir_codegen_src_p1_attrib_z: case gpir_codegen_src_p1_attrib_w: - printf("%c%d.%c", prev_instr->register0_attribute ? 'a' : '$', + fprintf(fp, "%c%d.%c", prev_instr->register0_attribute ? 'a' : '$', prev_instr->register0_addr, "xyzw"[src - gpir_codegen_src_p1_attrib_x]); break; @@ -245,7 +245,7 @@ print_src(gpir_codegen_src src, gp_unit unit, unsigned unit_src_num, static bool print_mul(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, - unsigned cur_dest_index) + unsigned cur_dest_index, FILE *fp) { bool printed = false; @@ -255,113 +255,113 @@ print_mul(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, if (instr->mul0_src0 != gpir_codegen_src_unused && instr->mul0_src1 != gpir_codegen_src_unused) { printed = true; - printf("\t"); + fprintf(fp, "\t"); if (instr->mul0_src1 == gpir_codegen_src_ident && !instr->mul0_neg) { - printf("mov.m0 "); - print_dest(instr, unit_mul_0, cur_dest_index); - printf(" "); + fprintf(fp, "mov.m0 "); + print_dest(instr, unit_mul_0, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src0, unit_mul_0, 0, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); } else { if (instr->mul_op == gpir_codegen_mul_op_complex2) - printf("complex2.m0 "); + fprintf(fp, "complex2.m0 "); else - printf("mul.m0 "); + fprintf(fp, "mul.m0 "); - print_dest(instr, unit_mul_0, cur_dest_index); - printf(" "); + print_dest(instr, unit_mul_0, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src0, unit_mul_0, 0, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); if (instr->mul0_neg) - printf("-"); + fprintf(fp, "-"); print_src(instr->mul0_src1, unit_mul_0, 1, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); } - printf("\n"); + fprintf(fp, "\n"); } if (instr->mul1_src0 != gpir_codegen_src_unused && instr->mul1_src1 != gpir_codegen_src_unused) { printed = true; - printf("\t"); + fprintf(fp, "\t"); if (instr->mul1_src1 == gpir_codegen_src_ident && !instr->mul1_neg) { - printf("mov.m1 "); - print_dest(instr, unit_mul_1, cur_dest_index); - printf(" "); + fprintf(fp, "mov.m1 "); + print_dest(instr, unit_mul_1, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul1_src0, unit_mul_1, 0, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); } else { - printf("mul.m1 "); - print_dest(instr, unit_mul_1, cur_dest_index); - printf(" "); + fprintf(fp, "mul.m1 "); + print_dest(instr, unit_mul_1, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul1_src0, unit_mul_1, 0, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); if (instr->mul1_neg) - printf("-"); + fprintf(fp, "-"); print_src(instr->mul1_src1, unit_mul_0, 1, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); } - printf("\n"); + fprintf(fp, "\n"); } break; case gpir_codegen_mul_op_complex1: printed = true; - printf("\tcomplex1.m01 "); - print_dest(instr, unit_mul_0, cur_dest_index); - printf(" "); + fprintf(fp, "\tcomplex1.m01 "); + print_dest(instr, unit_mul_0, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src0, unit_mul_0, 0, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src1, 
unit_mul_0, 1, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul1_src0, unit_mul_1, 0, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul1_src1, unit_mul_1, 1, instr, prev_instr, - cur_dest_index); - printf("\n"); + cur_dest_index, fp); + fprintf(fp, "\n"); break; case gpir_codegen_mul_op_select: printed = true; - printf("\tsel.m01 "); - print_dest(instr, unit_mul_0, cur_dest_index); - printf(" "); + fprintf(fp, "\tsel.m01 "); + print_dest(instr, unit_mul_0, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src1, unit_mul_0, 1, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src0, unit_mul_0, 0, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul1_src0, unit_mul_1, 0, instr, prev_instr, - cur_dest_index); - printf("\n"); + cur_dest_index, fp); + fprintf(fp, "\n"); break; default: printed = true; - printf("\tunknown%u.m01 ", instr->mul_op); - print_dest(instr, unit_mul_0, cur_dest_index); - printf(" "); + fprintf(fp, "\tunknown%u.m01 ", instr->mul_op); + print_dest(instr, unit_mul_0, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src0, unit_mul_0, 0, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul0_src1, unit_mul_0, 1, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul1_src0, unit_mul_1, 0, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->mul1_src1, unit_mul_1, 1, instr, prev_instr, - cur_dest_index); - printf("\n"); + cur_dest_index, fp); + fprintf(fp, "\n"); break; } @@ -393,14 +393,14 @@ static const acc_op_info acc_op_infos[8] = { static bool print_acc(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, - unsigned cur_dest_index) + unsigned cur_dest_index, FILE *fp) { bool printed = false; const acc_op_info op = acc_op_infos[instr->acc_op]; if (instr->acc0_src0 != gpir_codegen_src_unused) { printed = true; - printf("\t"); + fprintf(fp, "\t"); acc_op_info acc0_op = op; if (instr->acc0_src1 == gpir_codegen_src_ident && instr->acc0_src1_neg) { @@ -410,30 +410,30 @@ print_acc(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, } if (acc0_op.name) - printf("%s.a0 ", acc0_op.name); + fprintf(fp, "%s.a0 ", acc0_op.name); else - printf("op%u.a0 ", instr->acc_op); + fprintf(fp, "op%u.a0 ", instr->acc_op); - print_dest(instr, unit_acc_0, cur_dest_index); - printf(" "); + print_dest(instr, unit_acc_0, cur_dest_index, fp); + fprintf(fp, " "); if (instr->acc0_src0_neg) - printf("-"); + fprintf(fp, "-"); print_src(instr->acc0_src0, unit_acc_0, 0, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); if (acc0_op.srcs > 1) { - printf(" "); + fprintf(fp, " "); if (instr->acc0_src1_neg) - printf("-"); + fprintf(fp, "-"); print_src(instr->acc0_src1, unit_acc_0, 1, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); } - printf("\n"); + fprintf(fp, "\n"); } if (instr->acc1_src0 != gpir_codegen_src_unused) { printed = true; - printf("\t"); + fprintf(fp, "\t"); acc_op_info acc1_op = op; if (instr->acc1_src1 == gpir_codegen_src_ident && instr->acc1_src1_neg) { @@ -443,25 +443,25 @@ print_acc(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, } if (acc1_op.name) - printf("%s.a1 ", 
acc1_op.name); + fprintf(fp, "%s.a1 ", acc1_op.name); else - printf("op%u.a1 ", instr->acc_op); + fprintf(fp, "op%u.a1 ", instr->acc_op); - print_dest(instr, unit_acc_1, cur_dest_index); - printf(" "); + print_dest(instr, unit_acc_1, cur_dest_index, fp); + fprintf(fp, " "); if (instr->acc1_src0_neg) - printf("-"); + fprintf(fp, "-"); print_src(instr->acc1_src0, unit_acc_1, 0, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); if (acc1_op.srcs > 1) { - printf(" "); + fprintf(fp, " "); if (instr->acc1_src1_neg) - printf("-"); + fprintf(fp, "-"); print_src(instr->acc1_src1, unit_acc_1, 1, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); } - printf("\n"); + fprintf(fp, "\n"); } return printed; @@ -469,131 +469,129 @@ print_acc(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, static bool print_pass(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, - unsigned cur_dest_index) + unsigned cur_dest_index, FILE *fp) { if (instr->pass_src == gpir_codegen_src_unused) return false; - printf("\t"); + fprintf(fp, "\t"); switch (instr->pass_op) { case gpir_codegen_pass_op_pass: - printf("mov.p "); + fprintf(fp, "mov.p "); break; case gpir_codegen_pass_op_preexp2: - printf("preexp2.p "); + fprintf(fp, "preexp2.p "); break; case gpir_codegen_pass_op_postlog2: - printf("postlog2.p "); + fprintf(fp, "postlog2.p "); break; case gpir_codegen_pass_op_clamp: - printf("clamp.p "); + fprintf(fp, "clamp.p "); break; default: - printf("unk%u.p ", instr->pass_op); + fprintf(fp, "unk%u.p ", instr->pass_op); } - print_dest(instr, unit_pass, cur_dest_index); - printf(" "); + print_dest(instr, unit_pass, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->pass_src, unit_pass, 0, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); if (instr->pass_op == gpir_codegen_pass_op_clamp) { - printf(" "); + fprintf(fp, " "); print_src(gpir_codegen_src_load_x, unit_pass, 1, instr, prev_instr, - cur_dest_index); - printf(" "); + cur_dest_index, fp); + fprintf(fp, " "); print_src(gpir_codegen_src_load_y, unit_pass, 2, instr, prev_instr, - cur_dest_index); + cur_dest_index, fp); } - printf("\n"); + fprintf(fp, "\n"); return true; } static bool print_complex(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, - unsigned cur_dest_index) + unsigned cur_dest_index, FILE *fp) { if (instr->complex_src == gpir_codegen_src_unused) return false; - printf("\t"); + fprintf(fp, "\t"); switch (instr->complex_op) { case gpir_codegen_complex_op_nop: return false; case gpir_codegen_complex_op_exp2: - printf("exp2.c "); + fprintf(fp, "exp2.c "); break; case gpir_codegen_complex_op_log2: - printf("log2.c "); + fprintf(fp, "log2.c "); break; case gpir_codegen_complex_op_rsqrt: - printf("rsqrt.c "); + fprintf(fp, "rsqrt.c "); break; case gpir_codegen_complex_op_rcp: - printf("rcp.c "); + fprintf(fp, "rcp.c "); break; case gpir_codegen_complex_op_pass: case gpir_codegen_complex_op_temp_store_addr: case gpir_codegen_complex_op_temp_load_addr_0: case gpir_codegen_complex_op_temp_load_addr_1: case gpir_codegen_complex_op_temp_load_addr_2: - printf("mov.c "); + fprintf(fp, "mov.c "); break; default: - printf("unk%u.c ", instr->complex_op); + fprintf(fp, "unk%u.c ", instr->complex_op); } - print_dest(instr, unit_complex, cur_dest_index); - printf(" "); + print_dest(instr, unit_complex, cur_dest_index, fp); + fprintf(fp, " "); print_src(instr->complex_src, unit_complex, 0, instr, prev_instr, - cur_dest_index); - printf("\n"); + cur_dest_index, fp); + fprintf(fp, "\n"); return true; } static void 
print_instr(gpir_codegen_instr *instr, gpir_codegen_instr *prev_instr, - unsigned instr_number, unsigned cur_dest_index) + unsigned instr_number, unsigned cur_dest_index, FILE *fp) { bool printed = false; - printf("%03d:", instr_number); - printed |= print_acc(instr, prev_instr, cur_dest_index); - printed |= print_mul(instr, prev_instr, cur_dest_index); - printed |= print_complex(instr, prev_instr, cur_dest_index); - printed |= print_pass(instr, prev_instr, cur_dest_index); + fprintf(fp, "%03d:", instr_number); + printed |= print_acc(instr, prev_instr, cur_dest_index, fp); + printed |= print_mul(instr, prev_instr, cur_dest_index, fp); + printed |= print_complex(instr, prev_instr, cur_dest_index, fp); + printed |= print_pass(instr, prev_instr, cur_dest_index, fp); if (instr->branch) { printed = true; /* The branch condition is taken from the current pass unit result */ - printf("\tbranch ^%d %03d\n", cur_dest_index + unit_pass, + fprintf(fp, "\tbranch ^%d %03d\n", cur_dest_index + unit_pass, instr->branch_target + (instr->branch_target_lo ? 0 : 0x100)); } if (instr->unknown_1 != 0) { printed = true; - printf("\tunknown_1 %u\n", instr->unknown_1); + fprintf(fp, "\tunknown_1 %u\n", instr->unknown_1); } if (!printed) - printf("\tnop\n"); + fprintf(fp, "\tnop\n"); } void -gpir_disassemble_program(gpir_codegen_instr *code, unsigned num_instr) +gpir_disassemble_program(gpir_codegen_instr *code, unsigned num_instr, FILE *fp) { - printf("=======disassembly:=======\n"); - unsigned cur_dest_index = 0; unsigned cur_instr = 0; for (gpir_codegen_instr *instr = code; cur_instr < num_instr; instr++, cur_instr++, cur_dest_index += num_units) { - print_instr(instr, instr - 1, cur_instr, cur_dest_index); + print_instr(instr, instr - 1, cur_instr, cur_dest_index, fp); } } diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/nir.c b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/nir.c index 6334f6f4f4..4b1479a68f 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/nir.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/nir.c @@ -315,13 +315,13 @@ static bool gpir_emit_load_const(gpir_block *block, nir_instr *ni) static bool gpir_emit_ssa_undef(gpir_block *block, nir_instr *ni) { - gpir_error("nir_ssa_undef_instr not support\n"); + gpir_error("nir_ssa_undef_instr is not supported\n"); return false; } static bool gpir_emit_tex(gpir_block *block, nir_instr *ni) { - gpir_error("nir_jump_instr not support\n"); + gpir_error("texture operations are not supported\n"); return false; } diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/regalloc.c b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/regalloc.c index de519b210b..8526d1e9e7 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/regalloc.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/regalloc.c @@ -31,14 +31,7 @@ struct reg_info { BITSET_WORD *conflicts; struct util_dynarray conflict_list; - /* Number of conflicts that must be allocated to physical registers. - */ - unsigned phys_conflicts; - - unsigned node_conflicts; - - /* Number of conflicts that can be allocated to either. 
*/ - unsigned total_conflicts; + unsigned num_conflicts; int assigned_color; @@ -46,7 +39,7 @@ struct reg_info { }; struct regalloc_ctx { - unsigned bitset_words, num_nodes_and_regs; + unsigned bitset_words; struct reg_info *registers; /* Reusable scratch liveness array */ @@ -64,8 +57,8 @@ struct regalloc_ctx { /* Liveness analysis */ -static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live, - gpir_compiler *comp) +static void propagate_liveness_node(gpir_node *node, BITSET_WORD *live, + gpir_compiler *comp) { /* KILL */ if (node->type == gpir_node_type_store) { @@ -96,7 +89,7 @@ static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD)); list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { - propagate_liveness_instr(node, ctx->live, block->comp); + propagate_liveness_node(node, ctx->live, block->comp); } bool changed = false; @@ -166,18 +159,8 @@ static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j) BITSET_SET(a->conflicts, j); BITSET_SET(b->conflicts, i); - a->total_conflicts++; - b->total_conflicts++; - if (j < ctx->comp->cur_reg) - a->phys_conflicts++; - else - a->node_conflicts++; - - if (i < ctx->comp->cur_reg) - b->phys_conflicts++; - else - b->node_conflicts++; - + a->num_conflicts++; + b->num_conflicts++; util_dynarray_append(&a->conflict_list, unsigned, j); util_dynarray_append(&b->conflict_list, unsigned, i); } @@ -187,24 +170,17 @@ static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j) */ static void add_all_interferences(struct regalloc_ctx *ctx, unsigned i, - BITSET_WORD *live_nodes, BITSET_WORD *live_regs) { - int live_node; - BITSET_FOREACH_SET(live_node, live_nodes, ctx->comp->cur_index) { - add_interference(ctx, i, - live_node + ctx->comp->cur_reg); - } - int live_reg; - BITSET_FOREACH_SET(live_reg, ctx->live, ctx->comp->cur_index) { + BITSET_FOREACH_SET(live_reg, ctx->live, ctx->comp->cur_reg) { add_interference(ctx, i, live_reg); } } static void print_liveness(struct regalloc_ctx *ctx, - BITSET_WORD *live_reg, BITSET_WORD *live_val) + BITSET_WORD *live_reg) { if (!(lima_debug & LIMA_DEBUG_GP)) return; @@ -213,17 +189,11 @@ static void print_liveness(struct regalloc_ctx *ctx, BITSET_FOREACH_SET(live_idx, live_reg, ctx->comp->cur_reg) { printf("reg%d ", live_idx); } - BITSET_FOREACH_SET(live_idx, live_val, ctx->comp->cur_index) { - printf("%d ", live_idx); - } printf("\n"); } static void calc_interference(struct regalloc_ctx *ctx) { - BITSET_WORD *live_nodes = - rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index); - list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { /* Initialize liveness at the end of the block, but exclude values that * definitely aren't defined by the end. 
This helps out with @@ -247,35 +217,15 @@ static void calc_interference(struct regalloc_ctx *ctx) list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { gpir_debug("processing node %d\n", node->index); - print_liveness(ctx, ctx->live, live_nodes); - if (node->type != gpir_node_type_store && - node->type != gpir_node_type_branch) { - add_all_interferences(ctx, node->index + ctx->comp->cur_reg, - live_nodes, ctx->live); - - /* KILL */ - BITSET_CLEAR(live_nodes, node->index); - } else if (node->op == gpir_op_store_reg) { + print_liveness(ctx, ctx->live); + if (node->op == gpir_op_store_reg) { gpir_store_node *store = gpir_node_to_store(node); - add_all_interferences(ctx, store->reg->index, - live_nodes, ctx->live); + add_all_interferences(ctx, store->reg->index, ctx->live); /* KILL */ BITSET_CLEAR(ctx->live, store->reg->index); - } - - /* GEN */ - if (node->type == gpir_node_type_store) { - gpir_store_node *store = gpir_node_to_store(node); - BITSET_SET(live_nodes, store->child->index); - } else if (node->type == gpir_node_type_alu) { - gpir_alu_node *alu = gpir_node_to_alu(node); - for (int i = 0; i < alu->num_child; i++) - BITSET_SET(live_nodes, alu->children[i]->index); - } else if (node->type == gpir_node_type_branch) { - gpir_branch_node *branch = gpir_node_to_branch(node); - BITSET_SET(live_nodes, branch->cond->index); } else if (node->op == gpir_op_load_reg) { + /* GEN */ gpir_load_node *load = gpir_node_to_load(node); BITSET_SET(ctx->live, load->reg->index); } @@ -288,39 +238,21 @@ static void calc_interference(struct regalloc_ctx *ctx) static bool can_simplify(struct regalloc_ctx *ctx, unsigned i) { struct reg_info *info = &ctx->registers[i]; - if (i < ctx->comp->cur_reg) { - /* Physical regs. */ - return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM; - } else { - /* Nodes: if we manage to allocate all of its conflicting physical - * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so - * we can ignore any more than that. 
- */ - return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) + - info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM; - } + return info->num_conflicts < GPIR_PHYSICAL_REG_NUM; } static void push_stack(struct regalloc_ctx *ctx, unsigned i) { ctx->stack[ctx->stack_size++] = i; - if (i < ctx->comp->cur_reg) - gpir_debug("pushing reg%u\n", i); - else - gpir_debug("pushing %d\n", i - ctx->comp->cur_reg); + gpir_debug("pushing reg%u\n", i); struct reg_info *info = &ctx->registers[i]; assert(info->visited); util_dynarray_foreach(&info->conflict_list, unsigned, conflict) { struct reg_info *conflict_info = &ctx->registers[*conflict]; - if (i < ctx->comp->cur_reg) { - assert(conflict_info->phys_conflicts > 0); - conflict_info->phys_conflicts--; - } else { - assert(conflict_info->node_conflicts > 0); - conflict_info->node_conflicts--; - } + assert(conflict_info->num_conflicts > 0); + conflict_info->num_conflicts--; if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) { ctx->worklist[ctx->worklist_end++] = *conflict; ctx->registers[*conflict].visited = true; @@ -335,7 +267,7 @@ static bool do_regalloc(struct regalloc_ctx *ctx) ctx->stack_size = 0; /* Step 1: find the initially simplifiable registers */ - for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) { + for (int i = 0; i < ctx->comp->cur_reg; i++) { if (can_simplify(ctx, i)) { ctx->worklist[ctx->worklist_end++] = i; ctx->registers[i].visited = true; @@ -348,7 +280,7 @@ static bool do_regalloc(struct regalloc_ctx *ctx) push_stack(ctx, ctx->worklist[ctx->worklist_start++]); } - if (ctx->stack_size < ctx->num_nodes_and_regs) { + if (ctx->stack_size < ctx->comp->cur_reg) { /* If there are still unsimplifiable nodes left, we need to * optimistically push a node onto the stack. 
Choose the one with * the smallest number of current neighbors, since that's the most @@ -356,13 +288,13 @@ static bool do_regalloc(struct regalloc_ctx *ctx) */ unsigned min_conflicts = UINT_MAX; unsigned best_reg = 0; - for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) { + for (int reg = 0; reg < ctx->comp->cur_reg; reg++) { struct reg_info *info = &ctx->registers[reg]; if (info->visited) continue; - if (info->phys_conflicts + info->node_conflicts < min_conflicts) { + if (info->num_conflicts < min_conflicts) { best_reg = reg; - min_conflicts = info->phys_conflicts + info->node_conflicts; + min_conflicts = info->num_conflicts; } } gpir_debug("optimistic triggered\n"); @@ -374,21 +306,14 @@ static bool do_regalloc(struct regalloc_ctx *ctx) } /* Step 4: pop off the stack and assign colors */ - for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) { + for (int i = ctx->comp->cur_reg - 1; i >= 0; i--) { unsigned idx = ctx->stack[i]; struct reg_info *reg = &ctx->registers[idx]; - unsigned num_available_regs; - if (idx < ctx->comp->cur_reg) { - num_available_regs = GPIR_PHYSICAL_REG_NUM; - } else { - num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; - } - bool found = false; - unsigned start = i % num_available_regs; - for (unsigned j = 0; j < num_available_regs; j++) { - unsigned candidate = (j + start) % num_available_regs; + unsigned start = i % GPIR_PHYSICAL_REG_NUM; + for (unsigned j = 0; j < GPIR_PHYSICAL_REG_NUM; j++) { + unsigned candidate = (j + start) % GPIR_PHYSICAL_REG_NUM; bool available = true; util_dynarray_foreach(®->conflict_list, unsigned, conflict_idx) { struct reg_info *conflict = &ctx->registers[*conflict_idx]; @@ -420,11 +345,6 @@ static void assign_regs(struct regalloc_ctx *ctx) { list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { list_for_each_entry(gpir_node, node, &block->node_list, list) { - if (node->index >= 0) { - node->value_reg = - ctx->registers[ctx->comp->cur_reg + node->index].assigned_color; - } - if (node->op == gpir_op_load_reg) { gpir_load_node *load = gpir_node_to_load(node); unsigned color = ctx->registers[load->reg->index].assigned_color; @@ -452,6 +372,195 @@ static void assign_regs(struct regalloc_ctx *ctx) } } +/* Value register allocation */ + +/* Define a special token for when the register is occupied by a preallocated + * physical register (i.e. load_reg/store_reg). Normally entries in the "live" + * array points to the definition of the value, but there may be multiple + * definitions in this case, and they will certainly come from other basic + * blocks, so it doesn't make sense to do that here. 
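+ * A sentinel is enough for such slots: the allocator only needs to know + * that they are occupied, never which node defined them.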
+ */ +static gpir_node __physreg_live; +#define PHYSREG_LIVE (&__physreg_live) + +struct value_regalloc_ctx { + gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM]; + gpir_node *complex1_last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM]; + gpir_node *live[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM]; + gpir_node *last_complex1; + unsigned alloc_start; +}; + +static unsigned find_free_value_reg(struct value_regalloc_ctx *ctx) +{ + /* Implement round-robin allocation */ + unsigned reg_offset = ctx->alloc_start++; + if (ctx->alloc_start == GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM) + ctx->alloc_start = 0; + + unsigned reg = UINT_MAX; + for (unsigned reg_base = 0; + reg_base < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM; + reg_base++) { + unsigned cur_reg = (reg_base + reg_offset) % (GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM); + if (!ctx->live[cur_reg]) { + reg = cur_reg; + break; + } + } + + return reg; +} + +static void add_fake_dep(gpir_node *node, gpir_node *src, + struct value_regalloc_ctx *ctx) +{ + assert(src->value_reg >= 0); + if (ctx->last_written[src->value_reg] && + ctx->last_written[src->value_reg] != node) { + gpir_node_add_dep(ctx->last_written[src->value_reg], node, + GPIR_DEP_WRITE_AFTER_READ); + } + + /* For a sequence of schedule_first nodes right before a complex1 + * node, add any extra fake dependencies necessary so that the + * schedule_first nodes can be scheduled right after the complex1 is + * scheduled. We have to save the last_written before complex1 happens to + * avoid adding dependencies to children of the complex1 node which would + * create a cycle. + */ + + if (gpir_op_infos[node->op].schedule_first && + ctx->last_complex1 && + ctx->complex1_last_written[src->value_reg]) { + gpir_node_add_dep(ctx->complex1_last_written[src->value_reg], + ctx->last_complex1, + GPIR_DEP_WRITE_AFTER_READ); + } +} + +static bool handle_value_read(gpir_node *node, gpir_node *src, + struct value_regalloc_ctx *ctx) +{ + /* If already allocated, don't allocate it */ + if (src->value_reg < 0) { + unsigned reg = find_free_value_reg(ctx); + if (reg == UINT_MAX) + return false; + + src->value_reg = reg; + ctx->live[reg] = src; + } + + /* Add any fake dependencies. Note: this is the actual result of value + * register allocation. We throw away node->value_reg afterwards, since + * it's really the fake dependencies which constrain the post-RA scheduler + * enough to make sure it never needs to spill to temporaries. + */ + add_fake_dep(node, src, ctx); + + return true; +} + +static bool handle_reg_read(gpir_load_node *load, + struct value_regalloc_ctx *ctx) +{ + unsigned idx = load->index * 4 + load->component; + if (!ctx->live[idx]) { + ctx->live[idx] = PHYSREG_LIVE; + } else if (ctx->live[idx] != PHYSREG_LIVE) { + /* This slot is occupied by some other value register, so we need to + * evict it. This effectively splits the live range of the value + * register. NB: since we create fake dependencies on the fly, and the + * fake dependencies are the only output of this pass, we don't actually + * have to record where the split happened or that this value was + * assigned to two different registers. Any actual live range splitting + * happens in the post-RA scheduler, which moves the value to and from + * the register file. This will just cause some reads of the value + * register to have different fake dependencies. 
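+ * Concretely, the code below just moves the still-live value into any + * free slot and hands the vacated slot over to the physical register.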
+ */ + unsigned new_reg = find_free_value_reg(ctx); + if (new_reg == UINT_MAX) + return false; + ctx->live[new_reg] = ctx->live[idx]; + ctx->live[new_reg]->value_reg = new_reg; + ctx->live[idx] = PHYSREG_LIVE; + } + + if (ctx->last_written[idx]) { + gpir_node_add_dep(ctx->last_written[idx], &load->node, + GPIR_DEP_WRITE_AFTER_READ); + } + + return true; +} + +static void handle_reg_write(gpir_store_node *store, + struct value_regalloc_ctx *ctx) +{ + unsigned idx = store->index * 4 + store->component; + store->node.value_reg = idx; + ctx->last_written[idx] = &store->node; + ctx->live[idx] = NULL; +} + +static void handle_value_write(gpir_node *node, + struct value_regalloc_ctx *ctx) +{ + ctx->last_written[node->value_reg] = node; + ctx->live[node->value_reg] = NULL; +} + +static bool regalloc_value_regs(gpir_block *block) +{ + struct value_regalloc_ctx ctx = { { 0 } }; + + list_for_each_entry(gpir_node, node, &block->node_list, list) { + node->value_reg = -1; + } + + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + if (node->op == gpir_op_complex1) { + ctx.last_complex1 = node; + memcpy(ctx.complex1_last_written, ctx.last_written, + sizeof(ctx.complex1_last_written)); + } + + if (node->type != gpir_node_type_store && + node->type != gpir_node_type_branch) { + handle_value_write(node, &ctx); + } else if (node->op == gpir_op_store_reg) { + handle_reg_write(gpir_node_to_store(node), &ctx); + } + + if (node->type == gpir_node_type_store) { + gpir_store_node *store = gpir_node_to_store(node); + if (!handle_value_read(&store->node, store->child, &ctx)) + return false; + } else if (node->type == gpir_node_type_alu) { + gpir_alu_node *alu = gpir_node_to_alu(node); + for (int i = 0; i < alu->num_child; i++) { + if (!handle_value_read(&alu->node, alu->children[i], &ctx)) + return false; + } + } else if (node->type == gpir_node_type_branch) { + /* At the end of a block the top 11 values are always free, so + * branches should always succeed. 
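+ * (The reverse walk reaches the branch, the last node of the block, + * before anything else, while every value-register slot is still free.)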
+ */ + gpir_branch_node *branch = gpir_node_to_branch(node); + ASSERTED bool result = handle_value_read(&branch->node, + branch->cond, &ctx); + assert(result); + } else if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + if (!handle_reg_read(load, &ctx)) + return false; + } + } + + return true; +} + static void regalloc_print_result(gpir_compiler *comp) { if (!(lima_debug & LIMA_DEBUG_GP)) @@ -486,15 +595,14 @@ bool gpir_regalloc_prog(gpir_compiler *comp) struct regalloc_ctx ctx; ctx.mem_ctx = ralloc_context(NULL); - ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index; - ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs); + ctx.bitset_words = BITSET_WORDS(comp->cur_reg); ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); - ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); - ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); + ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg); + ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg); ctx.comp = comp; - ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs); - for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) { + ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, comp->cur_reg); + for (int i = 0; i < comp->cur_reg; i++) { ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx); @@ -514,6 +622,11 @@ bool gpir_regalloc_prog(gpir_compiler *comp) } assign_regs(&ctx); + list_for_each_entry(gpir_block, block, &comp->block_list, list) { + if (!regalloc_value_regs(block)) + return false; + } + regalloc_print_result(comp); ralloc_free(ctx.mem_ctx); return true; diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/scheduler.c b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/scheduler.c index 90a830f056..78128bd891 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/gp/scheduler.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/gp/scheduler.c @@ -1612,33 +1612,8 @@ static bool schedule_block(gpir_block *block) return true; } -static void add_fake_dep(gpir_node *node, gpir_node *dep_node, - gpir_node *last_written[]) -{ - gpir_node_foreach_pred(node, dep) { - if (dep->type == GPIR_DEP_INPUT) { - int index = dep->pred->value_reg; - if (index >= 0 && last_written[index]) { - gpir_node_add_dep(last_written[index], dep_node, - GPIR_DEP_WRITE_AFTER_READ); - } - if (gpir_op_infos[dep->pred->op].schedule_first) { - /* Insert fake dependencies for any schedule_first children on - * this node as well. This guarantees that as soon as - * "dep_node" is ready to schedule, all of its schedule_first - * children, grandchildren, etc. are ready so that they can be - * scheduled as soon as possible. - */ - add_fake_dep(dep->pred, dep_node, last_written); - } - } - } -} - static void schedule_build_dependency(gpir_block *block) { - gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; - /* merge dummy_f/m to the node created from */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { if (node->op == gpir_op_dummy_m) { @@ -1658,30 +1633,6 @@ static void schedule_build_dependency(gpir_block *block) gpir_node_delete(node); } } - - memset(last_written, 0, sizeof(last_written)); - - /* False dependencies. For value registers, these exist only to make sure - * that the maximum pressure isn't exceeded and are hence "fake". 
- */ - list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { - if (node->op == gpir_op_load_reg) { - gpir_load_node *load = gpir_node_to_load(node); - unsigned index = 4 * load->index + load->component; - if (last_written[index]) { - gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ); - } - } else if (node->op == gpir_op_store_reg) { - gpir_store_node *store = gpir_node_to_store(node); - unsigned index = 4 * store->index + store->component; - last_written[index] = node; - } else { - add_fake_dep(node, node, last_written); - } - - if (node->value_reg >= 0) - last_written[node->value_reg] = node; - } } static void print_statistic(gpir_compiler *comp, int save_index) diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/lima_ir.h b/mesa 3D driver/src/gallium/drivers/lima/ir/lima_ir.h index 60d574a640..41d363a555 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/lima_ir.h +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/lima_ir.h @@ -67,6 +67,7 @@ void lima_nir_lower_uniform_to_scalar(nir_shader *shader); bool lima_nir_scale_trig(nir_shader *shader); bool lima_nir_lower_ftrunc(nir_shader *shader); bool lima_nir_split_load_input(nir_shader *shader); +bool lima_nir_split_loads(nir_shader *shader); void lima_nir_duplicate_load_consts(nir_shader *shader); void lima_nir_duplicate_load_inputs(nir_shader *shader); diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c b/mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c index b1ec9aef39..4cd37bdd24 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_load_input.c @@ -27,86 +27,71 @@ #include "lima_ir.h" static bool -lima_nir_split_load_input_block(nir_block *block, nir_builder *b) +lima_nir_split_load_input_instr(nir_builder *b, + nir_instr *instr, + UNUSED void *cb_data) { - bool progress = false; + if (instr->type != nir_instr_type_alu) + return false; - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_alu) - continue; + nir_alu_instr *alu = nir_instr_as_alu(instr); + if (alu->op != nir_op_mov) + return false; - nir_alu_instr *alu = nir_instr_as_alu(instr); - if (alu->op != nir_op_mov) - continue; + if (!alu->dest.dest.is_ssa) + return false; - if (!alu->dest.dest.is_ssa) - continue; + if (!alu->src[0].src.is_ssa) + return false; - if (!alu->src[0].src.is_ssa) - continue; + nir_ssa_def *ssa = alu->src[0].src.ssa; + if (ssa->parent_instr->type != nir_instr_type_intrinsic) + return false; - nir_ssa_def *ssa = alu->src[0].src.ssa; - if (ssa->parent_instr->type != nir_instr_type_intrinsic) - continue; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(ssa->parent_instr); + if (intrin->intrinsic != nir_intrinsic_load_input) + return false; - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(ssa->parent_instr); - if (intrin->intrinsic != nir_intrinsic_load_input) - continue; + uint8_t swizzle = alu->src[0].swizzle[0]; + int i; - uint8_t swizzle = alu->src[0].swizzle[0]; - int i; + for (i = 1; i < nir_dest_num_components(alu->dest.dest); i++) + if (alu->src[0].swizzle[i] != (swizzle + i)) + break; - for (i = 1; i < nir_dest_num_components(alu->dest.dest); i++) - if (alu->src[0].swizzle[i] != (swizzle + i)) - break; + if (i != nir_dest_num_components(alu->dest.dest)) + return false; - if (i != nir_dest_num_components(alu->dest.dest)) - continue; + /* mali4xx can't access unaligned vec3, don't split load input */ + if 
(nir_dest_num_components(alu->dest.dest) == 3 && swizzle > 0) + return false; - /* mali4xx can't access unaligned vec3, don't split load input */ - if (nir_dest_num_components(alu->dest.dest) == 3 && swizzle > 0) - continue; + /* mali4xx can't access unaligned vec2, don't split load input */ + if (nir_dest_num_components(alu->dest.dest) == 2 && + swizzle != 0 && swizzle != 2) + return false; - b->cursor = nir_before_instr(&intrin->instr); - nir_intrinsic_instr *new_intrin = nir_intrinsic_instr_create( - b->shader, - intrin->intrinsic); - nir_ssa_dest_init(&new_intrin->instr, &new_intrin->dest, - nir_dest_num_components(alu->dest.dest), - ssa->bit_size, - NULL); - new_intrin->num_components = nir_dest_num_components(alu->dest.dest); - nir_intrinsic_set_base(new_intrin, nir_intrinsic_base(intrin)); - nir_intrinsic_set_component(new_intrin, nir_intrinsic_component(intrin) + swizzle); - nir_intrinsic_set_dest_type(new_intrin, nir_intrinsic_dest_type(intrin)); + b->cursor = nir_before_instr(&intrin->instr); + nir_intrinsic_instr *new_intrin = nir_intrinsic_instr_create( + b->shader, + intrin->intrinsic); + nir_ssa_dest_init(&new_intrin->instr, &new_intrin->dest, + nir_dest_num_components(alu->dest.dest), + ssa->bit_size, + NULL); + new_intrin->num_components = nir_dest_num_components(alu->dest.dest); + nir_intrinsic_set_base(new_intrin, nir_intrinsic_base(intrin)); + nir_intrinsic_set_component(new_intrin, nir_intrinsic_component(intrin) + swizzle); + nir_intrinsic_set_dest_type(new_intrin, nir_intrinsic_dest_type(intrin)); - /* offset */ - nir_src_copy(&new_intrin->src[0], &intrin->src[0], new_intrin); + /* offset */ + nir_src_copy(&new_intrin->src[0], &intrin->src[0]); - nir_builder_instr_insert(b, &new_intrin->instr); - nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, - &new_intrin->dest.ssa); - nir_instr_remove(&alu->instr); - progress = true; - } - - return progress; -} - -static bool -lima_nir_split_load_input_impl(nir_function_impl *impl) -{ - bool progress = false; - nir_builder builder; - nir_builder_init(&builder, impl); - - nir_foreach_block(block, impl) { - progress |= lima_nir_split_load_input_block(block, &builder); - } - - nir_metadata_preserve(impl, nir_metadata_block_index | - nir_metadata_dominance); - return progress; + nir_builder_instr_insert(b, &new_intrin->instr); + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, + &new_intrin->dest.ssa); + nir_instr_remove(&alu->instr); + return true; } /* Replaces a single load of several packed varyings and number of movs with @@ -115,13 +100,8 @@ lima_nir_split_load_input_impl(nir_function_impl *impl) bool lima_nir_split_load_input(nir_shader *shader) { - bool progress = false; - - nir_foreach_function(function, shader) { - if (function->impl) - progress |= lima_nir_split_load_input_impl(function->impl); - } - - return progress; + return nir_shader_instructions_pass(shader, lima_nir_split_load_input_instr, + nir_metadata_block_index | + nir_metadata_dominance, + NULL); } - diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_loads.c b/mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_loads.c new file mode 100644 index 0000000000..75707280c7 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/lima_nir_split_loads.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2019 Connor Abbott + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without 
limitation + * the rights to use, copy, modify, merge, publish, distribute, sub license, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + */ + +#include "nir.h" +#include "nir_builder.h" +#include "lima_ir.h" + +/* This pass clones certain input intrinsics, creating a copy for each user. + * Inputs are relatively cheap, since in both PP and GP one input can be + * loaded "for free" in each instruction bundle. In GP especially, if there is + * a load instruction with multiple uses in different basic blocks, we need to + * split it in NIR so that we don't generate a register write and reads for + * it, which is almost certainly more expensive than splitting. Hence this + * pass is more aggressive than nir_opt_move, which just moves the intrinsic + * down but won't split it. + */ + +static nir_ssa_def * +clone_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin) +{ + nir_intrinsic_instr *new_intrin = + nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intrin->instr)); + + assert(new_intrin->dest.is_ssa); + + unsigned num_srcs = nir_intrinsic_infos[new_intrin->intrinsic].num_srcs; + for (unsigned i = 0; i < num_srcs; i++) { + assert(new_intrin->src[i].is_ssa); + } + + nir_builder_instr_insert(b, &new_intrin->instr); + + return &new_intrin->dest.ssa; +} + +static bool +replace_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin) +{ + if (!intrin->dest.is_ssa) + return false; + + if (intrin->intrinsic != nir_intrinsic_load_input && + intrin->intrinsic != nir_intrinsic_load_uniform) + return false; + + if (!intrin->src[0].is_ssa) + return false; + + if (intrin->src[0].ssa->parent_instr->type == nir_instr_type_load_const) + return false; + + struct hash_table *visited_instrs = _mesa_pointer_hash_table_create(NULL); + + nir_foreach_use_safe(src, &intrin->dest.ssa) { + struct hash_entry *entry = + _mesa_hash_table_search(visited_instrs, src->parent_instr); + if (entry && (src->parent_instr->type != nir_instr_type_phi)) { + nir_ssa_def *def = entry->data; + nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(def)); + continue; + } + b->cursor = nir_before_src(src, false); + nir_ssa_def *new = clone_intrinsic(b, intrin); + nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new)); + _mesa_hash_table_insert(visited_instrs, src->parent_instr, new); + } + nir_foreach_if_use_safe(src, &intrin->dest.ssa) { + b->cursor = nir_before_src(src, true); + nir_if_rewrite_condition(src->parent_if, + nir_src_for_ssa(clone_intrinsic(b, intrin))); + } + + nir_instr_remove(&intrin->instr); + _mesa_hash_table_destroy(visited_instrs, NULL); + return true; +} + +static void +replace_load_const(nir_builder *b, nir_load_const_instr *load_const) +{ + struct hash_table *visited_instrs = _mesa_pointer_hash_table_create(NULL); + + 
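+   /* Walk each use of the constant.  A non-phi user that was already
+    * visited reuses the clone created for it; phi sources always get a
+    * fresh clone, because nir_before_src points at the end of the phi's
+    * predecessor block and every predecessor needs its own copy.
+    */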
nir_foreach_use_safe(src, &load_const->def) { + struct hash_entry *entry = + _mesa_hash_table_search(visited_instrs, src->parent_instr); + if (entry && (src->parent_instr->type != nir_instr_type_phi)) { + nir_ssa_def *def = entry->data; + nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(def)); + continue; + } + b->cursor = nir_before_src(src, false); + nir_ssa_def *new = nir_build_imm(b, load_const->def.num_components, + load_const->def.bit_size, + load_const->value); + nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(new)); + _mesa_hash_table_insert(visited_instrs, src->parent_instr, new); + } + + nir_instr_remove(&load_const->instr); + _mesa_hash_table_destroy(visited_instrs, NULL); +} + +bool +lima_nir_split_loads(nir_shader *shader) +{ + bool progress = false; + + nir_foreach_function(function, shader) { + if (function->impl) { + nir_builder b; + nir_builder_init(&b, function->impl); + + nir_foreach_block_reverse(block, function->impl) { + nir_foreach_instr_reverse_safe(instr, block) { + if (instr->type == nir_instr_type_load_const) { + replace_load_const(&b, nir_instr_as_load_const(instr)); + progress = true; + } else if (instr->type == nir_instr_type_intrinsic) { + progress |= replace_intrinsic(&b, nir_instr_as_intrinsic(instr)); + } + } + } + } + } + + return progress; +} + diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.c b/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.c index 47ceb18367..63ac9c0935 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.c @@ -778,7 +778,7 @@ static void ppir_codegen_print_prog(ppir_compiler *comp) printf("%08x ", prog[i]); } printf("\n"); - ppir_disassemble_instr(prog, offset); + ppir_disassemble_instr(prog, offset, stdout); prog += n; offset += n; } diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.h b/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.h index bf2541f1a8..198517205c 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.h +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/pp/codegen.h @@ -355,6 +355,6 @@ typedef union __attribute__((__packed__)) { } discard; } ppir_codegen_field_branch; -void ppir_disassemble_instr(uint32_t *instr, unsigned offset); +void ppir_disassemble_instr(uint32_t *instr, unsigned offset, FILE *fp); #endif diff --git a/mesa 3D driver/src/gallium/drivers/lima/ir/pp/disasm.c b/mesa 3D driver/src/gallium/drivers/lima/ir/pp/disasm.c index 50aa4cbb85..8f34d7c3c3 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/ir/pp/disasm.c +++ b/mesa 3D driver/src/gallium/drivers/lima/ir/pp/disasm.c @@ -35,51 +35,51 @@ typedef struct { } asm_op; static void -print_swizzle(uint8_t swizzle) +print_swizzle(uint8_t swizzle, FILE *fp) { if (swizzle == 0xE4) return; - printf("."); + fprintf(fp, "."); for (unsigned i = 0; i < 4; i++, swizzle >>= 2) - printf("%c", "xyzw"[swizzle & 3]); + fprintf(fp, "%c", "xyzw"[swizzle & 3]); } static void -print_mask(uint8_t mask) +print_mask(uint8_t mask, FILE *fp) { if (mask == 0xF) return; - printf("."); - if (mask & 1) printf("x"); - if (mask & 2) printf("y"); - if (mask & 4) printf("z"); - if (mask & 8) printf("w"); + fprintf(fp, "."); + if (mask & 1) fprintf(fp, "x"); + if (mask & 2) fprintf(fp, "y"); + if (mask & 4) fprintf(fp, "z"); + if (mask & 8) fprintf(fp, "w"); } static void -print_reg(ppir_codegen_vec4_reg reg, const char *special) +print_reg(ppir_codegen_vec4_reg reg, const char *special, FILE *fp) { if (special) { - 
printf("%s", special); + fprintf(fp, "%s", special); } else { switch (reg) { case ppir_codegen_vec4_reg_constant0: - printf("^const0"); + fprintf(fp, "^const0"); break; case ppir_codegen_vec4_reg_constant1: - printf("^const1"); + fprintf(fp, "^const1"); break; case ppir_codegen_vec4_reg_texture: - printf("^texture"); + fprintf(fp, "^texture"); break; case ppir_codegen_vec4_reg_uniform: - printf("^uniform"); + fprintf(fp, "^uniform"); break; default: - printf("$%u", reg); + fprintf(fp, "$%u", reg); break; } } @@ -87,75 +87,75 @@ print_reg(ppir_codegen_vec4_reg reg, const char *special) static void print_vector_source(ppir_codegen_vec4_reg reg, const char *special, - uint8_t swizzle, bool abs, bool neg) + uint8_t swizzle, bool abs, bool neg, FILE *fp) { if (neg) - printf("-"); + fprintf(fp, "-"); if (abs) - printf("abs("); + fprintf(fp, "abs("); - print_reg(reg, special); - print_swizzle(swizzle); + print_reg(reg, special, fp); + print_swizzle(swizzle, fp); if (abs) - printf(")"); + fprintf(fp, ")"); } static void -print_source_scalar(unsigned reg, const char *special, bool abs, bool neg) +print_source_scalar(unsigned reg, const char *special, bool abs, bool neg, FILE *fp) { if (neg) - printf("-"); + fprintf(fp, "-"); if (abs) - printf("abs("); + fprintf(fp, "abs("); - print_reg(reg >> 2, special); + print_reg(reg >> 2, special, fp); if (!special) - printf(".%c", "xyzw"[reg & 3]); + fprintf(fp, ".%c", "xyzw"[reg & 3]); if (abs) - printf(")"); + fprintf(fp, ")"); } static void -print_varying_source(ppir_codegen_field_varying *varying) +print_varying_source(ppir_codegen_field_varying *varying, FILE *fp) { switch (varying->imm.alignment) { case 0: - printf("%u.%c", varying->imm.index >> 2, + fprintf(fp, "%u.%c", varying->imm.index >> 2, "xyzw"[varying->imm.index & 3]); break; case 1: { const char *c[2] = {"xy", "zw"}; - printf("%u.%s", varying->imm.index >> 1, c[varying->imm.index & 1]); + fprintf(fp, "%u.%s", varying->imm.index >> 1, c[varying->imm.index & 1]); break; } default: - printf("%u", varying->imm.index); + fprintf(fp, "%u", varying->imm.index); break; } if (varying->imm.offset_vector != 15) { unsigned reg = (varying->imm.offset_vector << 2) + varying->imm.offset_scalar; - printf("+"); - print_source_scalar(reg, NULL, false, false); + fprintf(fp, "+"); + print_source_scalar(reg, NULL, false, false, fp); } } static void -print_outmod(ppir_codegen_outmod modifier) +print_outmod(ppir_codegen_outmod modifier, FILE *fp) { switch (modifier) { case ppir_codegen_outmod_clamp_fraction: - printf(".sat"); + fprintf(fp, ".sat"); break; case ppir_codegen_outmod_clamp_positive: - printf(".pos"); + fprintf(fp, ".pos"); break; case ppir_codegen_outmod_round: - printf(".int"); + fprintf(fp, ".int"); break; default: break; @@ -163,190 +163,190 @@ print_outmod(ppir_codegen_outmod modifier) } static void -print_dest_scalar(unsigned reg) +print_dest_scalar(unsigned reg, FILE *fp) { - printf("$%u", reg >> 2); - printf(".%c ", "xyzw"[reg & 3]); + fprintf(fp, "$%u", reg >> 2); + fprintf(fp, ".%c ", "xyzw"[reg & 3]); } static void -print_const(unsigned const_num, uint16_t *val) +print_const(unsigned const_num, uint16_t *val, FILE *fp) { - printf("const%u", const_num); + fprintf(fp, "const%u", const_num); for (unsigned i = 0; i < 4; i++) - printf(" %f", _mesa_half_to_float(val[i])); + fprintf(fp, " %f", _mesa_half_to_float(val[i])); } static void -print_const0(void *code, unsigned offset) +print_const0(void *code, unsigned offset, FILE *fp) { (void) offset; - print_const(0, code); + print_const(0, code, fp); } 
static void -print_const1(void *code, unsigned offset) +print_const1(void *code, unsigned offset, FILE *fp) { (void) offset; - print_const(1, code); + print_const(1, code, fp); } static void -print_varying(void *code, unsigned offset) +print_varying(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_varying *varying = code; - printf("load"); + fprintf(fp, "load"); bool perspective = varying->imm.source_type < 2 && varying->imm.perspective; if (perspective) { - printf(".perspective"); + fprintf(fp, ".perspective"); switch (varying->imm.perspective) { case 2: - printf(".z"); + fprintf(fp, ".z"); break; case 3: - printf(".w"); + fprintf(fp, ".w"); break; default: - printf(".unknown"); + fprintf(fp, ".unknown"); break; } } - printf(".v "); + fprintf(fp, ".v "); switch (varying->imm.dest) { case ppir_codegen_vec4_reg_discard: - printf("^discard"); + fprintf(fp, "^discard"); break; default: - printf("$%u", varying->imm.dest); + fprintf(fp, "$%u", varying->imm.dest); break; } - print_mask(varying->imm.mask); - printf(" "); + print_mask(varying->imm.mask, fp); + fprintf(fp, " "); switch (varying->imm.source_type) { case 1: print_vector_source(varying->reg.source, NULL, varying->reg.swizzle, - varying->reg.absolute, varying->reg.negate); + varying->reg.absolute, varying->reg.negate, fp); break; case 2: switch (varying->imm.perspective) { case 0: - printf("cube("); - print_varying_source(varying); - printf(")"); + fprintf(fp, "cube("); + print_varying_source(varying, fp); + fprintf(fp, ")"); break; case 1: - printf("cube("); + fprintf(fp, "cube("); print_vector_source(varying->reg.source, NULL, varying->reg.swizzle, - varying->reg.absolute, varying->reg.negate); - printf(")"); + varying->reg.absolute, varying->reg.negate, fp); + fprintf(fp, ")"); break; case 2: - printf("normalize("); + fprintf(fp, "normalize("); print_vector_source(varying->reg.source, NULL, varying->reg.swizzle, - varying->reg.absolute, varying->reg.negate); - printf(")"); + varying->reg.absolute, varying->reg.negate, fp); + fprintf(fp, ")"); break; default: - printf("gl_FragCoord"); + fprintf(fp, "gl_FragCoord"); break; } break; case 3: if (varying->imm.perspective) - printf("gl_FrontFacing"); + fprintf(fp, "gl_FrontFacing"); else - printf("gl_PointCoord"); + fprintf(fp, "gl_PointCoord"); break; default: - print_varying_source(varying); + print_varying_source(varying, fp); break; } } static void -print_sampler(void *code, unsigned offset) +print_sampler(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_sampler *sampler = code; - printf("texld"); + fprintf(fp, "texld"); if (sampler->lod_bias_en) - printf(".b"); + fprintf(fp, ".b"); switch (sampler->type) { case ppir_codegen_sampler_type_2d: - printf(".2d"); + fprintf(fp, ".2d"); break; case ppir_codegen_sampler_type_cube: - printf(".cube"); + fprintf(fp, ".cube"); break; default: - printf("_t%u", sampler->type); + fprintf(fp, "_t%u", sampler->type); break; } - printf(" %u", sampler->index); + fprintf(fp, " %u", sampler->index); if (sampler->offset_en) { - printf("+"); - print_source_scalar(sampler->index_offset, NULL, false, false); + fprintf(fp, "+"); + print_source_scalar(sampler->index_offset, NULL, false, false, fp); } if (sampler->lod_bias_en) { - printf(" "); - print_source_scalar(sampler->lod_bias, NULL, false, false); + fprintf(fp, " "); + print_source_scalar(sampler->lod_bias, NULL, false, false, fp); } } static void -print_uniform(void *code, unsigned offset) +print_uniform(void *code, unsigned offset, FILE *fp) { (void) 
offset; ppir_codegen_field_uniform *uniform = code; - printf("load."); + fprintf(fp, "load."); switch (uniform->source) { case ppir_codegen_uniform_src_uniform: - printf("u"); + fprintf(fp, "u"); break; case ppir_codegen_uniform_src_temporary: - printf("t"); + fprintf(fp, "t"); break; default: - printf(".u%u", uniform->source); + fprintf(fp, ".u%u", uniform->source); break; } int16_t index = uniform->index; switch (uniform->alignment) { case 2: - printf(" %d", index); + fprintf(fp, " %d", index); break; case 1: - printf(" %d.%s", index / 2, (index & 1) ? "zw" : "xy"); + fprintf(fp, " %d.%s", index / 2, (index & 1) ? "zw" : "xy"); break; default: - printf(" %d.%c", index / 4, "xyzw"[index & 3]); + fprintf(fp, " %d.%c", index / 4, "xyzw"[index & 3]); break; } if (uniform->offset_en) { - printf("+"); - print_source_scalar(uniform->offset_reg, NULL, false, false); + fprintf(fp, "+"); + print_source_scalar(uniform->offset_reg, NULL, false, false, fp); } } @@ -377,7 +377,7 @@ static const asm_op vec4_mul_ops[] = { #undef CASE static void -print_vec4_mul(void *code, unsigned offset) +print_vec4_mul(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_vec4_mul *vec4_mul = code; @@ -385,34 +385,34 @@ print_vec4_mul(void *code, unsigned offset) asm_op op = vec4_mul_ops[vec4_mul->op]; if (op.name) - printf("%s", op.name); + fprintf(fp, "%s", op.name); else - printf("op%u", vec4_mul->op); - print_outmod(vec4_mul->dest_modifier); - printf(".v0 "); + fprintf(fp, "op%u", vec4_mul->op); + print_outmod(vec4_mul->dest_modifier, fp); + fprintf(fp, ".v0 "); if (vec4_mul->mask) { - printf("$%u", vec4_mul->dest); - print_mask(vec4_mul->mask); - printf(" "); + fprintf(fp, "$%u", vec4_mul->dest); + print_mask(vec4_mul->mask, fp); + fprintf(fp, " "); } print_vector_source(vec4_mul->arg0_source, NULL, vec4_mul->arg0_swizzle, vec4_mul->arg0_absolute, - vec4_mul->arg0_negate); + vec4_mul->arg0_negate, fp); if (vec4_mul->op < 8 && vec4_mul->op != 0) { - printf("<<%u", vec4_mul->op); + fprintf(fp, "<<%u", vec4_mul->op); } - printf(" "); + fprintf(fp, " "); if (op.srcs > 1) { print_vector_source(vec4_mul->arg1_source, NULL, vec4_mul->arg1_swizzle, vec4_mul->arg1_absolute, - vec4_mul->arg1_negate); + vec4_mul->arg1_negate, fp); } } @@ -444,7 +444,7 @@ static const asm_op vec4_acc_ops[] = { #undef CASE static void -print_vec4_acc(void *code, unsigned offset) +print_vec4_acc(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_vec4_acc *vec4_acc = code; @@ -452,29 +452,29 @@ print_vec4_acc(void *code, unsigned offset) asm_op op = vec4_acc_ops[vec4_acc->op]; if (op.name) - printf("%s", op.name); + fprintf(fp, "%s", op.name); else - printf("op%u", vec4_acc->op); - print_outmod(vec4_acc->dest_modifier); - printf(".v1 "); + fprintf(fp, "op%u", vec4_acc->op); + print_outmod(vec4_acc->dest_modifier, fp); + fprintf(fp, ".v1 "); if (vec4_acc->mask) { - printf("$%u", vec4_acc->dest); - print_mask(vec4_acc->mask); - printf(" "); + fprintf(fp, "$%u", vec4_acc->dest); + print_mask(vec4_acc->mask, fp); + fprintf(fp, " "); } print_vector_source(vec4_acc->arg0_source, vec4_acc->mul_in ? 
"^v0" : NULL, vec4_acc->arg0_swizzle, vec4_acc->arg0_absolute, - vec4_acc->arg0_negate); + vec4_acc->arg0_negate, fp); if (op.srcs > 1) { - printf(" "); + fprintf(fp, " "); print_vector_source(vec4_acc->arg1_source, NULL, vec4_acc->arg1_swizzle, vec4_acc->arg1_absolute, - vec4_acc->arg1_negate); + vec4_acc->arg1_negate, fp); } } @@ -505,7 +505,7 @@ static const asm_op float_mul_ops[] = { #undef CASE static void -print_float_mul(void *code, unsigned offset) +print_float_mul(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_float_mul *float_mul = code; @@ -513,29 +513,29 @@ print_float_mul(void *code, unsigned offset) asm_op op = float_mul_ops[float_mul->op]; if (op.name) - printf("%s", op.name); + fprintf(fp, "%s", op.name); else - printf("op%u", float_mul->op); - print_outmod(float_mul->dest_modifier); - printf(".s0 "); + fprintf(fp, "op%u", float_mul->op); + print_outmod(float_mul->dest_modifier, fp); + fprintf(fp, ".s0 "); if (float_mul->output_en) - print_dest_scalar(float_mul->dest); + print_dest_scalar(float_mul->dest, fp); print_source_scalar(float_mul->arg0_source, NULL, float_mul->arg0_absolute, - float_mul->arg0_negate); + float_mul->arg0_negate, fp); if (float_mul->op < 8 && float_mul->op != 0) { - printf("<<%u", float_mul->op); + fprintf(fp, "<<%u", float_mul->op); } if (op.srcs > 1) { - printf(" "); + fprintf(fp, " "); print_source_scalar(float_mul->arg1_source, NULL, float_mul->arg1_absolute, - float_mul->arg1_negate); + float_mul->arg1_negate, fp); } } @@ -565,7 +565,7 @@ static const asm_op float_acc_ops[] = { #undef CASE static void -print_float_acc(void *code, unsigned offset) +print_float_acc(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_float_acc *float_acc = code; @@ -573,24 +573,24 @@ print_float_acc(void *code, unsigned offset) asm_op op = float_acc_ops[float_acc->op]; if (op.name) - printf("%s", op.name); + fprintf(fp, "%s", op.name); else - printf("op%u", float_acc->op); - print_outmod(float_acc->dest_modifier); - printf(".s1 "); + fprintf(fp, "op%u", float_acc->op); + print_outmod(float_acc->dest_modifier, fp); + fprintf(fp, ".s1 "); if (float_acc->output_en) - print_dest_scalar(float_acc->dest); + print_dest_scalar(float_acc->dest, fp); print_source_scalar(float_acc->arg0_source, float_acc->mul_in ? "^s0" : NULL, float_acc->arg0_absolute, - float_acc->arg0_negate); + float_acc->arg0_negate, fp); if (op.srcs > 1) { - printf(" "); + fprintf(fp, " "); print_source_scalar(float_acc->arg1_source, NULL, float_acc->arg1_absolute, - float_acc->arg1_negate); + float_acc->arg1_negate, fp); } } @@ -616,7 +616,7 @@ static const asm_op combine_ops[] = { #undef CASE static void -print_combine(void *code, unsigned offset) +print_combine(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_combine *combine = code; @@ -626,105 +626,104 @@ print_combine(void *code, unsigned offset) /* This particular combination can only be valid for scalar * vector * multiplies, and the opcode field is reused for something else. 
*/ - printf("mul"); + fprintf(fp, "mul"); } else { asm_op op = combine_ops[combine->scalar.op]; if (op.name) - printf("%s", op.name); + fprintf(fp, "%s", op.name); else - printf("op%u", combine->scalar.op); + fprintf(fp, "op%u", combine->scalar.op); } if (!combine->scalar.dest_vec) - print_outmod(combine->scalar.dest_modifier); - printf(".s2 "); + print_outmod(combine->scalar.dest_modifier, fp); + fprintf(fp, ".s2 "); if (combine->scalar.dest_vec) { - printf("$%u", combine->vector.dest); - print_mask(combine->vector.mask); + fprintf(fp, "$%u", combine->vector.dest); + print_mask(combine->vector.mask, fp); } else { - print_dest_scalar(combine->scalar.dest); + print_dest_scalar(combine->scalar.dest, fp); } - printf(" "); + fprintf(fp, " "); print_source_scalar(combine->scalar.arg0_src, NULL, combine->scalar.arg0_absolute, - combine->scalar.arg0_negate); - printf(" "); + combine->scalar.arg0_negate, fp); + fprintf(fp, " "); if (combine->scalar.arg1_en) { if (combine->scalar.dest_vec) { print_vector_source(combine->vector.arg1_source, NULL, combine->vector.arg1_swizzle, - false, false); + false, false, fp); } else { print_source_scalar(combine->scalar.arg1_src, NULL, combine->scalar.arg1_absolute, - combine->scalar.arg1_negate); + combine->scalar.arg1_negate, fp); } } } static void -print_temp_write(void *code, unsigned offset) +print_temp_write(void *code, unsigned offset, FILE *fp) { (void) offset; ppir_codegen_field_temp_write *temp_write = code; if (temp_write->fb_read.unknown_0 == 0x7) { if (temp_write->fb_read.source) - printf("fb_color"); + fprintf(fp, "fb_color"); else - printf("fb_depth"); - printf(" $%u", temp_write->fb_read.dest); + fprintf(fp, "fb_depth"); + fprintf(fp, " $%u", temp_write->fb_read.dest); return; } - printf("store.t"); + fprintf(fp, "store.t"); int16_t index = temp_write->temp_write.index; switch (temp_write->temp_write.alignment) { case 2: - printf(" %d", index); + fprintf(fp, " %d", index); break; case 1: - printf(" %d.%s", index / 2, (index & 1) ? "zw" : "xy"); + fprintf(fp, " %d.%s", index / 2, (index & 1) ? "zw" : "xy"); break; default: - printf(" %d.%c", index / 4, "xyzw"[index & 3]); + fprintf(fp, " %d.%c", index / 4, "xyzw"[index & 3]); break; } if (temp_write->temp_write.offset_en) { - printf("+"); + fprintf(fp, "+"); print_source_scalar(temp_write->temp_write.offset_reg, - NULL, false, false); + NULL, false, false, fp); } - printf(" "); + fprintf(fp, " "); if (temp_write->temp_write.alignment) { - print_reg(temp_write->temp_write.source >> 2, NULL); + print_reg(temp_write->temp_write.source >> 2, NULL, fp); } else { - print_source_scalar(temp_write->temp_write.source, NULL, false, false); + print_source_scalar(temp_write->temp_write.source, NULL, false, false, fp); } } static void -print_branch(void *code, unsigned offset) -{ +print_branch(void *code, unsigned offset, FILE *fp) +{ ppir_codegen_field_branch *branch = code; if (branch->discard.word0 == PPIR_CODEGEN_DISCARD_WORD0 && branch->discard.word1 == PPIR_CODEGEN_DISCARD_WORD1 && branch->discard.word2 == PPIR_CODEGEN_DISCARD_WORD2) { - printf("discard"); + fprintf(fp, "discard"); return; } - const char* cond[] = { "nv", "lt", "eq", "le", "gt", "ne", "ge", "" , @@ -734,18 +733,18 @@ print_branch(void *code, unsigned offset) cond_mask |= (branch->branch.cond_lt ? 1 : 0); cond_mask |= (branch->branch.cond_eq ? 2 : 0); cond_mask |= (branch->branch.cond_gt ? 
4 : 0); - printf("branch"); + fprintf(fp, "branch"); if (cond_mask != 0x7) { - printf(".%s ", cond[cond_mask]); - print_source_scalar(branch->branch.arg0_source, NULL, false, false); - printf(" "); - print_source_scalar(branch->branch.arg1_source, NULL, false, false); + fprintf(fp, ".%s ", cond[cond_mask]); + print_source_scalar(branch->branch.arg0_source, NULL, false, false, fp); + fprintf(fp, " "); + print_source_scalar(branch->branch.arg1_source, NULL, false, false, fp); } - printf(" %d", branch->branch.target + offset); + fprintf(fp, " %d", branch->branch.target + offset); } -typedef void (*print_field_func)(void *, unsigned); +typedef void (*print_field_func)(void *, unsigned, FILE *); static const print_field_func print_field[ppir_codegen_field_shift_count] = { [ppir_codegen_field_shift_varying] = print_varying, @@ -781,7 +780,7 @@ bitcopy(char *src, char *dst, unsigned bits, unsigned src_offset) } void -ppir_disassemble_instr(uint32_t *instr, unsigned offset) +ppir_disassemble_instr(uint32_t *instr, unsigned offset, FILE *fp) { ppir_codegen_ctrl *ctrl = (ppir_codegen_ctrl *) instr; @@ -800,18 +799,18 @@ ppir_disassemble_instr(uint32_t *instr, unsigned offset) if (first) first = false; else - printf(", "); + fprintf(fp, ", "); - print_field[i](code, offset); + print_field[i](code, offset, fp); bit_offset += bits; } if (ctrl->sync) - printf(", sync"); + fprintf(fp, ", sync"); if (ctrl->stop) - printf(", stop"); + fprintf(fp, ", stop"); - printf("\n"); + fprintf(fp, "\n"); } diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_draw.c b/mesa 3D driver/src/gallium/drivers/lima/lima_draw.c index 161fc7288a..c60956629b 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_draw.c +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_draw.c @@ -600,8 +600,7 @@ lima_calculate_depth_test(struct pipe_depth_stencil_alpha_state *depth, return (depth->depth_enabled && depth->depth_writemask) | ((int)func << 1) | (offset_scale << 16) | - (offset_units << 24) | - 0x30; /* find out what is this */ + (offset_units << 24); } static void @@ -647,20 +646,16 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in struct pipe_rasterizer_state *rst = &ctx->rasterizer->base; render->depth_test = lima_calculate_depth_test(&ctx->zsa->base, rst); + if (!rst->depth_clip_near || ctx->viewport.near == 0.0f) + render->depth_test |= 0x10; /* don't clip depth near */ + if (!rst->depth_clip_far || ctx->viewport.far == 1.0f) + render->depth_test |= 0x20; /* don't clip depth far */ + ushort far, near; near = float_to_ushort(ctx->viewport.near); far = float_to_ushort(ctx->viewport.far); - /* Insert a small 'epsilon' difference between 'near' and 'far' when - * they are equal, to avoid application bugs. */ - if (far == near) { - if (near > 0) - near--; - if (far < USHRT_MAX) - far++; - } - /* overlap with plbu? any place can remove one? 
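 * Layout: near fills the low 16 bits and far the high 16; assuming
 * float_to_ushort() is the usual [0,1] -> [0,0xffff] unorm conversion,
 * the default 0.0/1.0 range packs to 0xffff0000.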
*/ render->depth_range = near | (far << 16); @@ -726,7 +721,10 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in render->textures_address = 0x00000000; render->aux0 = (ctx->vs->state.varying_stride >> 3); - render->aux1 = 0x00001000; + render->aux1 = 0x00000000; + if (ctx->rasterizer->base.front_ccw) + render->aux1 = 0x00001000; + if (ctx->blend->base.dither) render->aux1 |= 0x00002000; @@ -1182,10 +1180,12 @@ lima_draw_vbo(struct pipe_context *pctx, lima_dump_command_stream_print( job->dump, ctx->vs->bo->map, ctx->vs->state.shader_size, false, "add vs at va %x\n", ctx->vs->bo->va); + lima_dump_shader(job->dump, ctx->vs->bo->map, ctx->vs->state.shader_size, false); lima_dump_command_stream_print( job->dump, ctx->fs->bo->map, ctx->fs->state.shader_size, false, "add fs at va %x\n", ctx->fs->bo->va); + lima_dump_shader(job->dump, ctx->fs->bo->map, ctx->fs->state.shader_size, true); lima_job_add_bo(job, LIMA_PIPE_GP, ctx->vs->bo, LIMA_SUBMIT_BO_READ); lima_job_add_bo(job, LIMA_PIPE_PP, ctx->fs->bo, LIMA_SUBMIT_BO_READ); diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_job.c b/mesa 3D driver/src/gallium/drivers/lima/lima_job.c index 9503075931..ef8a6444cb 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_job.c +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_job.c @@ -552,8 +552,8 @@ lima_generate_pp_stream(struct lima_job *job, int off_x, int off_y, */ int max = MAX2(tiled_w, tiled_h); int index = 0; - uint32_t *stream[4]; - int si[4] = {0}; + uint32_t *stream[8]; + int si[8] = {0}; int dim = 0; int count = 0; diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_parser.c b/mesa 3D driver/src/gallium/drivers/lima/lima_parser.c index b0a6c86a9e..c3780d94f1 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_parser.c +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_parser.c @@ -32,6 +32,9 @@ #include "lima_parser.h" #include "lima_texture.h" +#include "lima/ir/gp/codegen.h" +#include "lima/ir/pp/codegen.h" + typedef struct { char *info; } render_state_info; @@ -433,6 +436,35 @@ lima_parse_plbu(FILE *fp, uint32_t *data, int size, uint32_t start) fprintf(fp, "\n"); } +void +lima_parse_shader(FILE *fp, uint32_t *data, int size, bool is_frag) +{ + uint32_t *value = &data[0]; + + if (is_frag) { + uint32_t *bin = value; + uint32_t offt = 0; + uint32_t next_instr_length = 0; + + fprintf(fp, "/* ============ FS DISASSEMBLY BEGIN ============== */\n"); + + do { + ppir_codegen_ctrl *ctrl = (ppir_codegen_ctrl *)bin; + fprintf(fp, "@%6d: ", offt); + ppir_disassemble_instr(bin, offt, fp); + bin += ctrl->count; + offt += ctrl->count; + next_instr_length = ctrl->next_count; + } while (next_instr_length); + + fprintf(fp, "/* ============ FS DISASSEMBLY END ================= */\n"); + } else { + fprintf(fp, "/* ============ VS DISASSEMBLY BEGIN ============== */\n"); + gpir_disassemble_program((gpir_codegen_instr *)value, size / sizeof(gpir_codegen_instr), fp); + fprintf(fp, "/* ============ VS DISASSEMBLY END ================= */\n"); + } +} + static void parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper) { @@ -489,7 +521,11 @@ parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper) if (*value & 0x1000) fprintf(fp, ", shader writes stencil"); fprintf(fp, " */\n\t\t\t\t\t\t/* %s(3)", render_state_infos[i].info); - fprintf(fp, ": unknown bits 4-9: 0x%08x", *value & 0x000003f0); + if ((*value & 0x00000010) == 0x00000010) + fprintf(fp, ": ignore depth clip near"); + if ((*value & 0x00000020) == 0x00000020) + fprintf(fp, ", ignore 
depth clip far"); + fprintf(fp, ", unknown bits 6-9: 0x%08x", *value & 0x000003c0); fprintf(fp, ", unknown bits 13-15: 0x%08x */\n", *value & 0x00000e000); break; case 4: /* DEPTH RANGE */ @@ -637,6 +673,12 @@ parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper) fprintf(fp, ": "); if ((*value & 0x00002000) == 0x00002000) fprintf(fp, "blend->base.dither true, "); + + if ((*value & 0x00001000) == 0x00001000) + fprintf(fp, "glFrontFace(GL_CCW), "); + else + fprintf(fp, "glFrontFace(GL_CW), "); + if ((*value & 0x00010000) == 0x00010000) fprintf(fp, "ctx->const_buffer[PIPE_SHADER_FRAGMENT].buffer true "); fprintf(fp, "*/\n"); diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_parser.h b/mesa 3D driver/src/gallium/drivers/lima/lima_parser.h index 2378cfc02d..d592c9f861 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_parser.h +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_parser.h @@ -118,6 +118,7 @@ static inline const char return "UNKNOWN"; } +void lima_parse_shader(FILE *fp, uint32_t *data, int size, bool is_frag); void lima_parse_vs(FILE *fp, uint32_t *data, int size, uint32_t start); void lima_parse_plbu(FILE *fp, uint32_t *data, int size, uint32_t start); void lima_parse_render_state(FILE *fp, uint32_t *data, int size, uint32_t start); diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_program.c b/mesa 3D driver/src/gallium/drivers/lima/lima_program.c index e43fa9e629..a4d9b8a0ef 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_program.c +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_program.c @@ -143,6 +143,7 @@ lima_program_optimize_vs_nir(struct nir_shader *s) NIR_PASS_V(s, nir_copy_prop); NIR_PASS_V(s, nir_opt_dce); + NIR_PASS_V(s, lima_nir_split_loads); NIR_PASS_V(s, nir_lower_locals_to_regs); NIR_PASS_V(s, nir_convert_from_ssa, true); NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL); diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_screen.c b/mesa 3D driver/src/gallium/drivers/lima/lima_screen.c index 4795058509..adeb3b9c84 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_screen.c +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_screen.c @@ -101,9 +101,11 @@ lima_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_BLEND_EQUATION_SEPARATE: case PIPE_CAP_ACCELERATED: case PIPE_CAP_UMA: + case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_NATIVE_FENCE_FD: case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: return 1; /* Unimplemented, but for exporting OpenGL 2.0 */ diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_state.c b/mesa 3D driver/src/gallium/drivers/lima/lima_state.c index fe6c493016..778c793fec 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_state.c +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_state.c @@ -29,6 +29,7 @@ #include "util/u_helpers.h" #include "util/u_debug.h" #include "util/u_framebuffer.h" +#include "util/u_viewport.h" #include "pipe/p_state.h" @@ -218,11 +219,11 @@ lima_set_viewport_states(struct pipe_context *pctx, /* reverse calculate the parameter of glDepthRange */ float near, far; - near = viewport->translate[2] - viewport->scale[2]; - far = viewport->translate[2] + viewport->scale[2]; + bool halfz = ctx->rasterizer && ctx->rasterizer->base.clip_halfz; + util_viewport_zmin_zmax(viewport, halfz, &near, &far); - ctx->viewport.near = MIN2(near, far); - ctx->viewport.far = MAX2(near, far); + ctx->viewport.near = ctx->rasterizer && ctx->rasterizer->base.depth_clip_near ? 
near : 0.0f; + ctx->viewport.far = ctx->rasterizer && ctx->rasterizer->base.depth_clip_far ? far : 1.0f; ctx->viewport.transform = *viewport; ctx->dirty |= LIMA_CONTEXT_DIRTY_VIEWPORT; @@ -381,6 +382,7 @@ lima_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct lima_context *ctx = lima_context(pctx); @@ -393,7 +395,13 @@ lima_set_sampler_views(struct pipe_context *pctx, for (i = 0; i < nr; i++) { if (views[i]) new_nr = i + 1; - pipe_sampler_view_reference(&lima_tex->textures[i], views[i]); + + if (take_ownership) { + pipe_sampler_view_reference(&lima_tex->textures[i], NULL); + lima_tex->textures[i] = views[i]; + } else { + pipe_sampler_view_reference(&lima_tex->textures[i], views[i]); + } } for (; i < lima_tex->num_textures; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_util.c b/mesa 3D driver/src/gallium/drivers/lima/lima_util.c index dca9307c99..832fbf68ec 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_util.c +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_util.c @@ -78,6 +78,13 @@ lima_dump_blob(FILE *fp, void *data, int size, bool is_float) fprintf(fp, "}\n"); } +void +lima_dump_shader(struct lima_dump *dump, void *data, int size, bool is_frag) +{ + if (dump) + lima_parse_shader(dump->fp, (uint32_t *)data, size, is_frag); +} + void lima_dump_vs_command_stream_print(struct lima_dump *dump, void *data, int size, uint32_t start) diff --git a/mesa 3D driver/src/gallium/drivers/lima/lima_util.h b/mesa 3D driver/src/gallium/drivers/lima/lima_util.h index 3749523f3a..ebab8a40e6 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/lima_util.h +++ b/mesa 3D driver/src/gallium/drivers/lima/lima_util.h @@ -37,6 +37,7 @@ struct lima_dump *lima_dump_create(void); struct lima_dump *lima_dump_next(struct lima_dump *dump); void lima_dump_free(struct lima_dump *dump); +void lima_dump_shader(struct lima_dump *dump, void *data, int size, bool is_frag); void lima_dump_vs_command_stream_print(struct lima_dump *dump, void *data, int size, uint32_t start); void lima_dump_plbu_command_stream_print(struct lima_dump *dump, void *data, diff --git a/mesa 3D driver/src/gallium/drivers/lima/meson.build b/mesa 3D driver/src/gallium/drivers/lima/meson.build index 08533fe563..5bc6fbbf86 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/meson.build +++ b/mesa 3D driver/src/gallium/drivers/lima/meson.build @@ -50,6 +50,7 @@ files_lima = files( 'ir/lima_nir_duplicate_intrinsic.c', 'ir/lima_nir_lower_uniform_to_scalar.c', 'ir/lima_nir_split_load_input.c', + 'ir/lima_nir_split_loads.c', 'ir/lima_ir.h', @@ -152,3 +153,7 @@ lima_disasm = executable( build_by_default : with_tools.contains('lima'), install : with_tools.contains('lima'), ) + +if with_tools.contains('drm-shim') + subdir('drm-shim') +endif diff --git a/mesa 3D driver/src/gallium/drivers/lima/standalone/lima_disasm.c b/mesa 3D driver/src/gallium/drivers/lima/standalone/lima_disasm.c index ee4460d5fc..f54bfc3599 100644 --- a/mesa 3D driver/src/gallium/drivers/lima/standalone/lima_disasm.c +++ b/mesa 3D driver/src/gallium/drivers/lima/standalone/lima_disasm.c @@ -183,13 +183,13 @@ main(int argc, char **argv) do { ppir_codegen_ctrl *ctrl = (ppir_codegen_ctrl *)bin; printf("@%6d: ", offset); - ppir_disassemble_instr(bin, offset); + ppir_disassemble_instr(bin, offset, stdout); bin += ctrl->count; offset += ctrl->count; size -= ctrl->count; } while (size); } else { - 
gpir_disassemble_program((gpir_codegen_instr *)prog, size / (sizeof(gpir_codegen_instr))); + gpir_disassemble_program((gpir_codegen_instr *)prog, size / (sizeof(gpir_codegen_instr)), stdout); } ralloc_free(prog); diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-asan.toml b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-asan.toml new file mode 100644 index 0000000000..a8d09ed162 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-asan.toml @@ -0,0 +1,14 @@ + +[[deqp]] +deqp = "/deqp/modules/gles31/deqp-gles31" +caselists = ["/deqp/mustpass/gles31-master.txt"] +skips = ["install/deqp-llvmpipe-skips.txt"] +deqp_args = [ + "--deqp-surface-width=256", "--deqp-surface-height=256", + "--deqp-surface-type=pbuffer", "--deqp-gl-config-name=rgba8888d24s8ms0", "--deqp-visibility=hidden" +] +fraction = 10 +version_check = "GL ES 3.2.*git" +renderer_check = "llvmpipe" + [deqp.env] + LD_PRELOAD = "libasan.so.6" diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-fails.txt b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-fails.txt index 4bbd9a67ee..ce277b6bce 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe-fails.txt @@ -10,17 +10,8 @@ dEQP-EGL.functional.robustness.reset_context.shaders.infinite_loop.sync_status.v dEQP-EGL.functional.robustness.reset_context.shaders.infinite_loop.sync_status.vertex_and_fragment,Fail dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail -dEQP-GLES2.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_center,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_corner,Fail -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_dst_x,Fail @@ -31,17 +22,6 @@ dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_src_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_src_dst_y,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_src_x,Fail -dEQP-GLES3.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES3.functional.polygon_offset.fixed24_displacement_with_units,Fail -dEQP-GLES3.functional.polygon_offset.float32_displacement_with_units,Fail -dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide,Fail -dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide,Fail 
-dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail dEQP-GLES31.functional.primitive_bounding_box.wide_points.global_state.vertex_tessellation_fragment.default_framebuffer_bbox_equal,Fail dEQP-GLES31.functional.primitive_bounding_box.wide_points.global_state.vertex_tessellation_fragment.default_framebuffer_bbox_larger,Fail dEQP-GLES31.functional.primitive_bounding_box.wide_points.global_state.vertex_tessellation_fragment.fbo_bbox_equal,Fail diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe.toml b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe.toml index 9a5286a5be..b98996b30b 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe.toml +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/deqp-llvmpipe.toml @@ -8,6 +8,8 @@ deqp_args = [ "--deqp-gl-config-name=rgba8888d24s8ms0", "--deqp-visibility=hidden" ] +version_check = "GL ES 3.2.*git" +renderer_check = "llvmpipe" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/gitlab-ci.yml index 8dfd040470..b0eee1aeb6 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/gitlab-ci.yml @@ -84,7 +84,6 @@ llvmpipe-traces: # busy at the deqp-runner level. LP_NUM_THREADS: 0 GPU_VERSION: llvmpipe - DEQP_EXPECTED_RENDERER: llvmpipe extends: - .test-gl - .deqp-test @@ -92,11 +91,20 @@ llvmpipe-traces: llvmpipe-deqp: variables: - DEQP_VER: gles2 # for the renderer check DEQP_SUITE: llvmpipe parallel: 2 extends: .llvmpipe-deqp-test +llvmpipe-deqp-asan: + variables: + DEQP_SUITE: llvmpipe-asan + GPU_VERSION: llvmpipe-asan + DEQP_FRACTION: 10 + extends: .llvmpipe-deqp-test + needs: + - debian/x86_test-gl + - debian-testing-asan + llvmpipe-egl: variables: DEQP_VER: egl diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-cl.txt b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-cl.txt index dbbcbcde0e..bd4759050c 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-cl.txt +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-cl.txt @@ -15,9 +15,9 @@ program/check predefined preprocessor macros/__opencl_c_version__ must be define program/check predefined preprocessor macros/cl_version_1_2 must be defined for opencl 1.2 and later: skip program/check predefined preprocessor macros/cl_version_2_0 must be defined for opencl 2.0 and later: skip program/execute/amdgcn-callee-saved-registers: skip -program/execute/amdgcn-f16-inline-immediates: skip +program/execute/amdgcn-f16-inline-immediates: fail program/execute/amdgcn-f32-inline-immediates: fail -program/execute/amdgcn-i16-inline-immediates: skip +program/execute/amdgcn-i16-inline-immediates: fail program/execute/atomic_int64_add-global: skip program/execute/atomic_int64_add-global-return: skip program/execute/atomic_int64_add-local: skip @@ -60,8 +60,6 @@ program/execute/builtin/builtin-float-tan-1.0.generated: timeout program/execute/builtin/builtin-int-popcount-1.2.generated: skip 
program/execute/builtin/builtin-long-popcount-1.2.generated: skip program/execute/builtin/builtin-short-popcount-1.2.generated: skip -program/execute/builtin/builtin-shuffle-half-ushort: skip -program/execute/builtin/builtin-shuffle2-half-ushort: skip program/execute/builtin/builtin-uchar-popcount-1.2.generated: skip program/execute/builtin/builtin-uint-popcount-1.2.generated: skip program/execute/builtin/builtin-ulong-popcount-1.2.generated: skip @@ -73,23 +71,10 @@ program/execute/calls-struct/small struct in regs: fail program/execute/fdiv-modifiers-f32: fail program/execute/global-offset/3d, input dependent: fail program/execute/image-attributes: crash -program/execute/image-read-2d/read float from cl_float cl_rgba image.: fail -program/execute/image-read-2d/read signed integer from cl_signed_int8 cl_rgba image.: fail -program/execute/image-read-2d/read unsigned integer from cl_unsigned_int8 cl_rgba image.: fail program/execute/image-write-2d: crash program/execute/load-hi16-generic: skip program/execute/load-lo16-generic: skip -program/execute/mad-mix: skip +program/execute/mad-mix: fail program/execute/program-tester-check-local-size-test-should-skip/this test should skip: skip -program/execute/sampler/read from image using clamp_to_edge addressing mode: fail -program/execute/sampler/read from image using linear filtering and normalized coords: fail -program/execute/sampler/read from image using linear filtering and unnormalized coords: fail program/execute/scalar-logical-float: skip program/execute/store-hi16-generic: skip -program/execute/vload/vload-half-constant: skip -program/execute/vload/vload-half-global: skip -program/execute/vload/vload-half-local: skip -program/execute/vload/vload-half-private: skip -program/execute/vstore/vstore-half-global: skip -program/execute/vstore/vstore-half-local: skip -program/execute/vstore/vstore-half-private: skip diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_gl.txt b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_gl.txt index 838d4e88f1..d13603c0cc 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_gl.txt +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_gl.txt @@ -269,7 +269,6 @@ spec/arb_fragment_program/fp-indirections: skip spec/arb_fragment_shader_interlock/arb_fragment_shader_interlock-image-load-store: skip spec/arb_framebuffer_no_attachments/arb_framebuffer_no_attachments-params/dsa: skip spec/arb_framebuffer_no_attachments/arb_framebuffer_no_attachments-query/ms2: skip -spec/arb_framebuffer_object/fbo-blit-scaled-linear: fail spec/arb_geometry_shader4/arb_geometry_shader4-ignore-adjacent-vertices gl_line_strip_adjacency: skip spec/arb_geometry_shader4/arb_geometry_shader4-ignore-adjacent-vertices gl_lines_adjacency: skip spec/arb_geometry_shader4/arb_geometry_shader4-ignore-adjacent-vertices gl_triangle_strip_adjacency: skip @@ -621,7 +620,6 @@ spec/ext_direct_state_access/named-buffers 15/flushmappednamedbufferrangeext: sk spec/ext_direct_state_access/named-buffers 15/mapnamedbufferext: skip spec/ext_direct_state_access/named-buffers 15/mapnamedbufferrangeext: skip spec/ext_direct_state_access/named-buffers 15/namedcopybuffersubdataext: skip -spec/ext_external_objects/memory-object-api-errors: skip spec/ext_external_objects/semaphore-api-errors: skip spec/ext_external_objects/vk-buf-exchange: skip spec/ext_external_objects/vk-depth-display: skip @@ -635,7 +633,6 @@ spec/ext_external_objects/vk-semaphores-2: skip 
spec/ext_external_objects/vk-stencil-display: skip spec/ext_external_objects/vk-vert-buf-reuse: skip spec/ext_external_objects/vk-vert-buf-update-errors: skip -spec/ext_external_objects_fd/memory-object-api-errors: skip spec/ext_external_objects_fd/semaphore-api-errors: skip spec/ext_framebuffer_blit/fbo-blit-check-limits: fail spec/ext_framebuffer_multisample/accuracy 16 color: skip diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_shader.txt b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_shader.txt index 4dace0da09..9254949774 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_shader.txt +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/llvmpipe-quick_shader.txt @@ -188,6 +188,7 @@ spec/glsl-4.00/execution/conversion/vert-conversion-explicit-dvec2-vec2: fail spec/glsl-4.00/execution/conversion/vert-conversion-explicit-dvec3-vec3: fail spec/glsl-4.00/execution/conversion/vert-conversion-explicit-dvec4-vec4: fail spec/glsl-4.50/execution/ssbo-atomiccompswap-int: fail +spec/glsl-es-1.00/linker/glsl-mismatched-uniform-precision-unused: fail spec/intel_shader_atomic_float_minmax/execution/shared-atomiccompswap-float: skip spec/intel_shader_atomic_float_minmax/execution/shared-atomicexchange-float: skip spec/intel_shader_atomic_float_minmax/execution/shared-atomicmax-float: skip diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/traces-llvmpipe.yml b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/traces-llvmpipe.yml index c2ccf84dc0..99e0385c51 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/traces-llvmpipe.yml +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/ci/traces-llvmpipe.yml @@ -5,11 +5,11 @@ traces: - path: glmark2/desktop:windows=4:effect=blur:blur-radius=5:passes=1:separable=true.trace expectations: - device: gl-vmware-llvmpipe - checksum: 26080879ac8eb63c2c5da3de5fc732f4 + checksum: fa54af7770699dfe78c3e33061312739 - path: glmark2/jellyfish.trace expectations: - device: gl-vmware-llvmpipe - checksum: 91dbe94735a132aeb192ae5c618ddc06 + checksum: 0bba174c99746be068c4960cb6a9dabb - path: glxgears/glxgears-2.trace expectations: - device: gl-vmware-llvmpipe @@ -17,161 +17,161 @@ traces: - path: 0ad/0ad.trace expectations: - device: gl-vmware-llvmpipe - checksum: 7f45f21eefb76b4a35506cd6a47040ad + checksum: b29c740db174350d9be0beaaccd40453 - path: pathfinder/demo.trace expectations: - device: gl-vmware-llvmpipe - checksum: e624d76c70cc3c532f4f54439e13659a + checksum: a053c56658bc830249bc94317a3b3ea8 - path: pathfinder/canvas_moire.trace expectations: - device: gl-vmware-llvmpipe - checksum: c4ee0eef519f1a32ba24b7b392b31b28 + checksum: 2cb5be6a6f62e417f1a89c89180e5728 - path: pathfinder/canvas_text_v2.trace expectations: - device: gl-vmware-llvmpipe - checksum: 7ec3d817091a273223158ab0f4f726b5 + checksum: a1446d0c42a78771240fca6f3b1e10d8 - path: gputest/furmark.trace expectations: - device: gl-vmware-llvmpipe - checksum: 8c9d1267987f838070a74a734751624a + checksum: e2fea90560ce0f65efba5d38610dc7ef - path: gputest/pixmark-piano.trace expectations: - device: gl-vmware-llvmpipe - checksum: f73a68e9814405910c4628a2824f1eae + checksum: 4262587e893cf98c61a8467a15677181 - path: gputest/triangle.trace expectations: - device: gl-vmware-llvmpipe - checksum: 848436d1a2448bdc76fd6220382d8606 + checksum: 7812de00011a3a059892e36cea19c696 - path: humus/Portals.trace expectations: - device: gl-vmware-llvmpipe - checksum: c4a0f928e0cedf91726728e7fd5d9759 + checksum: a55dd3d87a86b3b47121ff67861028c3 - path: 
bgfx/01-cubes.rdc expectations: - device: gl-vmware-llvmpipe - checksum: de0b5f10a091b976021cf1dac2ca8e5d + checksum: a453a832e0e07132bb2c92c3fed7df18 - path: bgfx/02-metaballs.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 21425e0f19bd5e65ff8fd3a5f0b5c62d + checksum: 905b005c6dce1cb54819085bf0c8dbfd - path: bgfx/03-raymarch.rdc expectations: - device: gl-vmware-llvmpipe - checksum: ae1e1558eee7108ba0254d785ac9687b + checksum: 71c0a0fc8a3e2760014efda8c07c623e - path: bgfx/04-mesh.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 9ee7518f354ec4372fff84849416e864 + checksum: 274682ad4bf2ca4fa9cc92b55a7fd20b - path: bgfx/05-instancing.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 79d48af0dee9cb41fba0ef35cd655324 + checksum: 948ec4c353485559163c575e80a01550 - path: bgfx/06-bump.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 75a8ac1b7bbcba03a93159243ade2c83 + checksum: 49b428a9c1a8e72f1ef5f3e91bc278db - path: bgfx/07-callback.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 488d471efb84824f2d4be1c75e455881 + checksum: 702793a6317d16de9f8045128401b31a - path: bgfx/09-hdr.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 4f98347342ec1d349410774292e4f5c5 + checksum: f0e52dff670caa2aad0080a8aa59ad06 - path: bgfx/10-font.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 32e95e7c6fb4deaef63d2f6f9c51f864 + checksum: 0a1673e22adc3dc126c921fe9460b2fe - path: bgfx/11-fontsdf.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 29a66770ae1240fa87dce21ec28323ff + checksum: 65d8ab58c89debcb4b7d3f39e6785d2e - path: bgfx/12-lod.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 5e95c12eecb4a422c8b529e2bac592fd + checksum: a79ccce53c09c2a43a51be2467cb15bc - path: bgfx/13-stencil.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 81266a254b1558ac90e89a6b02077ae1 + checksum: 244919318cc38eed2ca68a31a067f507 - path: bgfx/14-shadowvolumes.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 66f789a7c813945e43e59c187fba1d19 + checksum: a94f05e82d4adc3e31bfcffd37f7b04b - path: bgfx/15-shadowmaps-simple.rdc expectations: - device: gl-vmware-llvmpipe - checksum: d3497adc3a91f2fa89950da384391fe7 + checksum: 607edbe247f0977a011ea673965c376d - path: bgfx/16-shadowmaps.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 4b1942b6c218c83de66687782f2cfd3b + checksum: 87562fb15f341a214765e47adc910cc0 - path: bgfx/18-ibl.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 8b7be0c59b4be560dba4ca7583ed1130 + checksum: 47490275249793f778fc5d14899bf836 - path: bgfx/19-oit.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 8f67d3b8327c0099d75753aed9d792d0 + checksum: 5e5751621add149c9aab1e28e70ccfc7 - path: bgfx/20-nanosvg.rdc expectations: - device: gl-vmware-llvmpipe - checksum: da60dfd9dc21a3e4716f1e19e346d7a3 + checksum: 6b32c5f18a421412c7bbae3c65b5e0f6 - path: bgfx/23-vectordisplay.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 3fe59e4876b86f4929f19e4a2254cf7d + checksum: 7af42ee2a19009fd65e2a0c6aa2c2c8a - path: bgfx/26-occlusion.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 6a29b2cd71b3f831ae0f41d3b7714c89 + checksum: 601965313d5db009067fce901be2be2c - path: bgfx/28-wireframe.rdc expectations: - device: gl-vmware-llvmpipe - checksum: 3aebd4eb89f6e68b84852f7556d5c1d9 + checksum: de5452f4cbc0100d8ecb51459e47cd99 - path: bgfx/29-debugdraw.rdc expectations: - device: gl-vmware-llvmpipe - checksum: c3ea41e60bb61c0dfb941f2203ce5555 + checksum: 164e5226af26b6552506542a45bc6bf5 - path: 
bgfx/31-rsm.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: 559e9fabc82a7979599ab1153700f018
+      checksum: ef42f05c98862167a9eac6c733021e91
 - path: bgfx/32-particles.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: 2093155a8b898e1d800a877a6c2ed446
+      checksum: 018418bdd7f60a186cce532613b0c7ab
 - path: bgfx/33-pom.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: f68403ec95113c4720e1a66e4cf956c8
+      checksum: c2d54a830ada2ff97c7e532b22d858b4
 - path: bgfx/34-mvs.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: b51eae3259861e0d8d791877ccbcdc1b
+      checksum: 6ad9c7d97debb7bf495b0bfca921ba9c
 - path: bgfx/35-dynamic.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: a3feb6e8c1b7cc4de02e0a324fbdc710
+      checksum: 62b390c4a31d7aa073fa4190fcfd0618
 - path: bgfx/36-sky.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: 9971a794fe1b12fc21b8655ccb3b6bac
+      checksum: 06b1a3eb0e4793930502d808939b2386
 - path: bgfx/37-gpudrivenrendering.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: a778dbf1d7c8bff9d68ae53a74cec467
+      checksum: c435c6eedc86530da24856ab3f704681
 - path: bgfx/38-bloom.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: 123d7a210f839e463886b1325c91f6f1
+      checksum: 960ceb01ab2716de810d410c49cf71cf
 - path: bgfx/39-assao.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: 74b3570d73241fa0798afb0869e5b121
+      checksum: bc6f44e63010db07e7ba588b216e38b1
 - path: bgfx/40-svt.rdc
   expectations:
     - device: gl-vmware-llvmpipe
-      checksum: a29f2ca810cba5d2e2d5f62b0385d275
+      checksum: 83bf05971404700b874c4e9d9edd1379
diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 96bf8d4ea5..a93879c938 100644
--- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -413,21 +413,25 @@ attribs_update_simple(struct lp_build_interp_soa_context *bld,
          break;
       }
 
-      if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){
-         /* FIXME: Depth values can exceed 1.0, due to the fact that
-          * setup interpolation coefficients refer to (0,0) which causes
-          * precision loss. So we must clamp to 1.0 here to avoid artifacts.
-          * Note though values outside [0,1] are perfectly valid with
-          * depth clip disabled.
-          * XXX: If depth clip is disabled but we force depth clamp
-          * we may get values larger than 1.0 in the fs (but not in
-          * depth test). Not sure if that's an issue...
-          * Also, on a similar note, it is not obvious if the depth values
-          * appearing in fs (with depth clip disabled) should be clamped
-          * to [0,1], clamped to near/far or not be clamped at all...
-          */
-         a = lp_build_min(coeff_bld, a, coeff_bld->one);
+      if ((attrib == 0) && (chan == 2)) {
+         /* add polygon-offset value, stored in the X component of a0 */
+         LLVMValueRef offset =
+            lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                       coeff_bld->type, bld->a0aos[0],
+                                       lp_build_const_int32(gallivm, 0));
+         a = LLVMBuildFAdd(builder, a, offset, "");
+
+         if (!bld->depth_clamp){
+            /* OpenGL requires clamping z to 0..1 range after polygon offset
+             * is applied if depth-clamping isn't enabled.
+             *
+             * This also fixes the problem that depth values can exceed 1.0,
+             * due to imprecision in the calculations.
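+             * In effect the patched path computes
+             *
+             *    z' = clamp(z + a0.x, 0.0, 1.0)
+             *
+             * where a0.x is the polygon-offset term broadcast above: e.g. an
+             * interpolated z of 0.9998 plus an offset of 0.0007 now lands
+             * exactly on 1.0 instead of escaping the [0,1] depth range.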
+ */ + a = lp_build_clamp(coeff_bld, a, coeff_bld->zero, coeff_bld->one); + } } + bld->attribs[attrib][chan] = a; } } diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.h index f1b07873f3..f77d219225 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.h +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -70,9 +70,8 @@ struct lp_shader_input { uint interp:4; /* enum lp_interp */ uint usage_mask:4; /* bitmask of TGSI_WRITEMASK_x flags */ uint src_index:8; /* where to find values in incoming vertices */ - uint cyl_wrap:4; /* TGSI_CYLINDRICAL_WRAP_x flags */ uint location:2; /* TGSI_INTERPOLOATE_LOC_* */ - uint padding:10; + uint padding:14; }; diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_context.h b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_context.h index 22f177eee2..c42aeca80e 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_context.h +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_context.h @@ -117,6 +117,8 @@ struct llvmpipe_context { /** Vertex format */ struct vertex_info vertex_info; + + uint8_t patch_vertices; /** Which vertex shader output slot contains color */ int8_t color_slot[2]; diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.c index ea28446851..4bf76a16bc 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.c @@ -43,6 +43,7 @@ lp_cs_tpool_worker(void *data) while (!pool->shutdown) { struct lp_cs_tpool_task *task; + unsigned iter_per_thread; while (list_is_empty(&pool->workqueue) && !pool->shutdown) cnd_wait(&pool->new_work, &pool->m); @@ -52,15 +53,28 @@ lp_cs_tpool_worker(void *data) task = list_first_entry(&pool->workqueue, struct lp_cs_tpool_task, list); - unsigned this_iter = task->iter_start++; + + unsigned this_iter = task->iter_start; + + iter_per_thread = task->iter_per_thread; + + if (task->iter_remainder && + task->iter_start + task->iter_remainder == task->iter_total) { + task->iter_remainder--; + iter_per_thread = 1; + } + + task->iter_start += iter_per_thread; if (task->iter_start == task->iter_total) list_del(&task->list); mtx_unlock(&pool->m); - task->work(task->data, this_iter, &lmem); + for (unsigned i = 0; i < iter_per_thread; i++) + task->work(task->data, this_iter + i, &lmem); + mtx_lock(&pool->m); - task->iter_finished++; + task->iter_finished += iter_per_thread; if (task->iter_finished == task->iter_total) cnd_broadcast(&task->finish); } @@ -121,6 +135,7 @@ lp_cs_tpool_queue_task(struct lp_cs_tpool *pool, for (unsigned t = 0; t < num_iters; t++) { work(data, t, &lmem); } + FREE(lmem.local_mem_ptr); return NULL; } task = CALLOC_STRUCT(lp_cs_tpool_task); @@ -131,6 +146,10 @@ lp_cs_tpool_queue_task(struct lp_cs_tpool *pool, task->work = work; task->data = data; task->iter_total = num_iters; + + task->iter_per_thread = num_iters / pool->num_threads; + task->iter_remainder = num_iters % pool->num_threads; + cnd_init(&task->finish); mtx_lock(&pool->m); diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.h b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.h index d32a5e0909..22c0d10e7d 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.h +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_cs_tpool.h @@ -66,6 +66,8 @@ struct lp_cs_tpool_task { unsigned iter_total; unsigned iter_start; unsigned iter_finished; + 
unsigned iter_per_thread; + unsigned iter_remainder; }; struct lp_cs_tpool *lp_cs_tpool_create(unsigned num_threads); diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_draw_arrays.c index 33ab84f950..48d112ac08 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_draw_arrays.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_draw_arrays.c @@ -146,7 +146,8 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, !lp->queries_disabled); /* draw! */ - draw_vbo(draw, info, drawid_offset, indirect, draws, num_draws); + draw_vbo(draw, info, drawid_offset, indirect, draws, num_draws, + lp->patch_vertices); /* * unmap vertex/index buffers diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear.c index 62939a3975..c069f04486 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear.c @@ -175,7 +175,7 @@ lp_fs_linear_run(const struct lp_rast_state *state, if (!lp_linear_init_sampler(&samp[i], tex_info, - &variant->key.samplers[unit], + lp_fs_variant_key_sampler_idx(&variant->key, unit), &state->jit_context.textures[unit], x, y, width, height, a0, dadx, dady)) { @@ -315,7 +315,8 @@ lp_linear_check_variant(struct lp_fragment_shader_variant *variant) goto fail; } - if (!lp_linear_check_sampler(&key->samplers[unit], tex_info)) { + struct lp_sampler_static_state *samp = lp_fs_variant_key_sampler_idx(key, unit); + if (!lp_linear_check_sampler(samp, tex_info)) { if (LP_DEBUG & DEBUG_LINEAR) debug_printf(" -- samp[%d]: check_sampler failed\n", i); goto fail; diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear_fastpath.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear_fastpath.c index b97ea1b88e..84863dc9e1 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear_fastpath.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_linear_fastpath.c @@ -199,11 +199,15 @@ lp_linear_purple(const struct lp_rast_state *state, boolean lp_linear_check_fastpath(struct lp_fragment_shader_variant *variant) { - enum pipe_format tex_format = variant->key.samplers[0].texture_state.format; + struct lp_sampler_static_state *samp0 = lp_fs_variant_key_sampler_idx(&variant->key, 0); + if (!samp0) + return false; + + enum pipe_format tex_format = samp0->texture_state.format; if (variant->shader->kind == LP_FS_KIND_BLIT_RGBA && tex_format == PIPE_FORMAT_B8G8R8A8_UNORM && - is_nearest_clamp_sampler(&variant->key.samplers[0]) && + is_nearest_clamp_sampler(samp0) && variant->opaque) { variant->jit_linear_blit = lp_linear_blit_rgba_blit; } @@ -212,7 +216,7 @@ lp_linear_check_fastpath(struct lp_fragment_shader_variant *variant) variant->opaque && (tex_format == PIPE_FORMAT_B8G8R8A8_UNORM || tex_format == PIPE_FORMAT_B8G8R8X8_UNORM) && - is_nearest_clamp_sampler(&variant->key.samplers[0])) { + is_nearest_clamp_sampler(samp0)) { variant->jit_linear_blit = lp_linear_blit_rgb1_blit; } diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_rast.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_rast.c index f054f4cc95..f67fbda6b0 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_rast.c @@ -1366,7 +1366,12 @@ void lp_rast_destroy( struct lp_rasterizer *rast ) * per https://bugs.freedesktop.org/show_bug.cgi?id=76252 */ for (i = 0; i < rast->num_threads; i++) { #ifdef _WIN32 - 
pipe_semaphore_wait(&rast->tasks[i].work_done); + /* Threads might already be dead - Windows apparently terminates other threads when + * returning from main. + */ + DWORD exit_code = STILL_ACTIVE; + if (GetExitCodeThread(rast->threads[i], &exit_code) && exit_code == STILL_ACTIVE) + pipe_semaphore_wait(&rast->tasks[i].work_done); #else thrd_join(rast->threads[i], NULL); #endif diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_screen.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_screen.c index 9a406d19bd..dc9ffce97e 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_screen.c @@ -170,6 +170,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_DEPTH_CLIP_DISABLE: return 1; + case PIPE_CAP_DEPTH_CLAMP_ENABLE: + return 1; case PIPE_CAP_SHADER_STENCIL_EXPORT: return 1; case PIPE_CAP_TGSI_INSTANCEID: @@ -347,6 +349,10 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: return 1; +#ifdef PIPE_MEMORY_FD + case PIPE_CAP_MEMOBJ: + return 1; +#endif case PIPE_CAP_SAMPLER_REDUCTION_MINMAX: case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_TGSI_VOTE: @@ -566,9 +572,11 @@ static const struct nir_shader_compiler_options gallivm_nir_options = { .lower_ffma16 = true, .lower_ffma32 = true, .lower_ffma64 = true, + .lower_flrp16 = true, .lower_fmod = true, .lower_hadd = true, - .lower_add_sat = true, + .lower_uadd_sat = true, + .lower_iadd_sat = true, .lower_ldexp = true, .lower_pack_snorm_2x16 = true, .lower_pack_snorm_4x8 = true, @@ -600,12 +608,13 @@ static const struct nir_shader_compiler_options gallivm_nir_options = { .lower_fisnormal = true, }; -static void +static char * llvmpipe_finalize_nir(struct pipe_screen *screen, void *nirptr) { struct nir_shader *nir = (struct nir_shader *)nirptr; lp_build_opt_nir(nir); + return NULL; } static inline const void * @@ -719,6 +728,7 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen, case PIPE_FORMAT_R8G8_SNORM: case PIPE_FORMAT_R16_SNORM: case PIPE_FORMAT_R8_SNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: break; default: diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup.c index 9109114487..cc0651ab46 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup.c @@ -1752,4 +1752,69 @@ lp_setup_flush_and_restart(struct lp_setup_context *setup) return TRUE; } - +void +lp_setup_add_scissor_planes(const struct u_rect *scissor, + struct lp_rast_plane *plane_s, + boolean s_planes[4], bool multisample) +{ + /* + * When rasterizing scissored tris, use the intersection of the + * triangle bounding box and the scissor rect to generate the + * scissor planes. + * + * This permits us to cut off the triangle "tails" that are present + * in the intermediate recursive levels caused when two of the + * triangles edges don't diverge quickly enough to trivially reject + * exterior blocks from the triangle. + * + * It's not really clear if it's worth worrying about these tails, + * but since we generate the planes for each scissored tri, it's + * free to trim them in this case. + * + * Note that otherwise, the scissor planes only vary in 'C' value, + * and even then only on state-changes. Could alternatively store + * these planes elsewhere. 
+ * (Or only store the c value together with a bit indicating which + * scissor edge this is, so rasterization would treat them differently + * (easier to evaluate) to ordinary planes.) + */ + int adj = multisample ? 127 : 0; + if (s_planes[0]) { + int x0 = scissor->x0 - 1; + plane_s->dcdx = ~0U << 8; + plane_s->dcdy = 0; + plane_s->c = x0 << 8; + plane_s->c += adj; + plane_s->c = -plane_s->c; /* flip sign */ + plane_s->eo = 1 << 8; + plane_s++; + } + if (s_planes[1]) { + int x1 = scissor->x1; + plane_s->dcdx = 1 << 8; + plane_s->dcdy = 0; + plane_s->c = x1 << 8; + plane_s->c += 127 + adj; + plane_s->eo = 0 << 8; + plane_s++; + } + if (s_planes[2]) { + int y0 = scissor->y0 - 1; + plane_s->dcdx = 0; + plane_s->dcdy = 1 << 8; + plane_s->c = y0 << 8; + plane_s->c += adj; + plane_s->c = -plane_s->c; /* flip sign */ + plane_s->eo = 1 << 8; + plane_s++; + } + if (s_planes[3]) { + int y1 = scissor->y1; + plane_s->dcdx = 0; + plane_s->dcdy = ~0U << 8; + plane_s->c = y1 << 8; + plane_s->c += 127 + adj; + plane_s->eo = 0; + plane_s++; + } +} diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_context.h b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_context.h index a7273a2ad9..656a64f405 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -207,6 +207,10 @@ scissor_planes_needed(boolean scis_planes[4], const struct u_rect *bbox, scis_planes[3] = (bbox->y1 > scissor->y1); } +void +lp_setup_add_scissor_planes(const struct u_rect *scissor, + struct lp_rast_plane *plane_s, + boolean s_planes[4], bool multisample); void lp_setup_choose_triangle( struct lp_setup_context *setup ); void lp_setup_choose_line( struct lp_setup_context *setup ); diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_line.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_line.c index 2d1b954162..1f812e8eac 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -426,6 +426,10 @@ try_setup_line( struct lp_setup_context *setup, will_draw_start = sign(-x1diff) != sign(dx); will_draw_end = (sign(x2diff) == sign(-dx)) || x2diff==0; + /* interpolate using the preferred wide-lines formula */ + info.dx *= 1 + dydx * dydx; + info.dy = 0; + if (dx < 0) { /* if v2 is to the right of v1, swap pointers */ const float (*temp)[4] = v1; @@ -523,6 +527,10 @@ try_setup_line( struct lp_setup_context *setup, will_draw_start = sign(y1diff) == sign(dy); will_draw_end = (sign(-y2diff) == sign(dy)) || y2diff==0; + /* interpolate using the preferred wide-lines formula */ + info.dx = 0; + info.dy *= 1 + dxdy * dxdy; + if (dy > 0) { /* if v2 is on top of v1, swap pointers */ const float (*temp)[4] = v1; @@ -703,60 +711,8 @@ try_setup_line( struct lp_setup_context *setup, if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy; } - - /* - * When rasterizing scissored tris, use the intersection of the - * triangle bounding box and the scissor rect to generate the - * scissor planes. - * - * This permits us to cut off the triangle "tails" that are present - * in the intermediate recursive levels caused when two of the - * triangles edges don't diverge quickly enough to trivially reject - * exterior blocks from the triangle. - * - * It's not really clear if it's worth worrying about these tails, - * but since we generate the planes for each scissored tri, it's - * free to trim them in this case. 
- * - * Note that otherwise, the scissor planes only vary in 'C' value, - * and even then only on state-changes. Could alternatively store - * these planes elsewhere. - * (Or only store the c value together with a bit indicating which - * scissor edge this is, so rasterization would treat them differently - * (easier to evaluate) to ordinary planes.) - */ if (nr_planes > 4) { - struct lp_rast_plane *plane_s = &plane[4]; - - if (s_planes[0]) { - plane_s->dcdx = ~0U << 8; - plane_s->dcdy = 0; - plane_s->c = (1-scissor->x0) << 8; - plane_s->eo = 1 << 8; - plane_s++; - } - if (s_planes[1]) { - plane_s->dcdx = 1 << 8; - plane_s->dcdy = 0; - plane_s->c = (scissor->x1+1) << 8; - plane_s->eo = 0 << 8; - plane_s++; - } - if (s_planes[2]) { - plane_s->dcdx = 0; - plane_s->dcdy = 1 << 8; - plane_s->c = (1-scissor->y0) << 8; - plane_s->eo = 1 << 8; - plane_s++; - } - if (s_planes[3]) { - plane_s->dcdx = 0; - plane_s->dcdy = ~0U << 8; - plane_s->c = (scissor->y1+1) << 8; - plane_s->eo = 0; - plane_s++; - } - assert(plane_s == &plane[nr_planes]); + lp_setup_add_scissor_planes(scissor, &plane[4], s_planes, setup->multisample); } return lp_setup_bin_triangle(setup, line, &bbox, &bboxpos, nr_planes, viewport_index); diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_rect.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_rect.c index 7c1234eafe..4b1e0cc1a9 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_rect.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_rect.c @@ -173,8 +173,10 @@ lp_setup_is_blit(const struct lp_setup_context *setup, * texture filtering. */ - assert(variant->key.samplers[0].sampler_state.min_img_filter == PIPE_TEX_FILTER_NEAREST); - assert(variant->key.samplers[0].sampler_state.mag_img_filter == PIPE_TEX_FILTER_NEAREST); + ASSERTED struct lp_sampler_static_state *samp0 = lp_fs_variant_key_sampler_idx(&variant->key, 0); + assert(samp0); + assert(samp0->sampler_state.min_img_filter == PIPE_TEX_FILTER_NEAREST); + assert(samp0->sampler_state.mag_img_filter == PIPE_TEX_FILTER_NEAREST); /* * Check for 1:1 match of texels to dest pixels diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_tri.c index a31e77e521..347f0a61c1 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -734,60 +734,8 @@ do_triangle_ccw(struct lp_setup_context *setup, plane[2].eo); } - - /* - * When rasterizing scissored tris, use the intersection of the - * triangle bounding box and the scissor rect to generate the - * scissor planes. - * - * This permits us to cut off the triangle "tails" that are present - * in the intermediate recursive levels caused when two of the - * triangles edges don't diverge quickly enough to trivially reject - * exterior blocks from the triangle. - * - * It's not really clear if it's worth worrying about these tails, - * but since we generate the planes for each scissored tri, it's - * free to trim them in this case. - * - * Note that otherwise, the scissor planes only vary in 'C' value, - * and even then only on state-changes. Could alternatively store - * these planes elsewhere. - * (Or only store the c value together with a bit indicating which - * scissor edge this is, so rasterization would treat them differently - * (easier to evaluate) to ordinary planes.) 
- */ if (nr_planes > 3) { - struct lp_rast_plane *plane_s = &plane[3]; - - if (s_planes[0]) { - plane_s->dcdx = ~0U << 8; - plane_s->dcdy = 0; - plane_s->c = (1-scissor->x0) << 8; - plane_s->eo = 1 << 8; - plane_s++; - } - if (s_planes[1]) { - plane_s->dcdx = 1 << 8; - plane_s->dcdy = 0; - plane_s->c = (scissor->x1+1) << 8; - plane_s->eo = 0 << 8; - plane_s++; - } - if (s_planes[2]) { - plane_s->dcdx = 0; - plane_s->dcdy = 1 << 8; - plane_s->c = (1-scissor->y0) << 8; - plane_s->eo = 1 << 8; - plane_s++; - } - if (s_planes[3]) { - plane_s->dcdx = 0; - plane_s->dcdy = ~0U << 8; - plane_s->c = (scissor->y1+1) << 8; - plane_s->eo = 0; - plane_s++; - } - assert(plane_s == &plane[nr_planes]); + lp_setup_add_scissor_planes(scissor, &plane[3], s_planes, setup->multisample); } return lp_setup_bin_triangle(setup, tri, &bbox, &bboxpos, nr_planes, viewport_index); diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.c index e11b67b01f..c655a5477a 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.c @@ -70,7 +70,7 @@ generate_compute(struct llvmpipe_context *lp, struct gallivm_state *gallivm = variant->gallivm; const struct lp_compute_shader_variant_key *key = &variant->key; char func_name[64], func_name_coro[64]; - LLVMTypeRef arg_types[18]; + LLVMTypeRef arg_types[19]; LLVMTypeRef func_type, coro_func_type; LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef context_ptr; @@ -121,8 +121,9 @@ generate_compute(struct llvmpipe_context *lp, arg_types[15] = int32_type; /* coro block_y_size */ arg_types[16] = int32_type; /* coro block_z_size */ arg_types[17] = int32_type; /* coro idx */ + arg_types[18] = LLVMPointerType(LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), 0); func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), - arg_types, ARRAY_SIZE(arg_types) - 6, 0); + arg_types, ARRAY_SIZE(arg_types) - 7, 0); coro_func_type = LLVMFunctionType(LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), arg_types, ARRAY_SIZE(arg_types), 0); @@ -138,7 +139,8 @@ generate_compute(struct llvmpipe_context *lp, for(i = 0; i < ARRAY_SIZE(arg_types); ++i) { if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) { lp_add_function_attr(coro, i + 1, LP_FUNC_ATTR_NOALIAS); - lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); + if (i < ARRAY_SIZE(arg_types) - 7) + lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); } } @@ -177,7 +179,7 @@ generate_compute(struct llvmpipe_context *lp, builder = gallivm->builder; assert(builder); LLVMPositionBuilderAtEnd(builder, block); - sampler = lp_llvm_sampler_soa_create(key->samplers, key->nr_samplers); + sampler = lp_llvm_sampler_soa_create(lp_cs_variant_key_samplers(key), key->nr_samplers); image = lp_llvm_image_soa_create(lp_cs_variant_key_images(key), key->nr_images); struct lp_build_loop_state loop_state[4]; @@ -191,7 +193,11 @@ generate_compute(struct llvmpipe_context *lp, LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, y_size_arg, ""); coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, z_size_arg, ""); + /* build a ptr in memory to store all the frames in later. 
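+ *
+ * Roughly, the coroutine side uses this slot like so (scalar sketch;
+ * frame_size stands in for whatever per-invocation size
+ * lp_build_coro_alloc_mem_array actually reserves):
+ *
+ *    if (*coro_mem == NULL)
+ *       *coro_mem = malloc(frame_size * num_hdls);  // first coroutine in
+ *    frame = (char *)*coro_mem + coro_idx * frame_size;
+ *
+ * The single block is then released once, at the end of stage (a),
+ * through gallivm->coro_free_hook.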
*/ LLVMTypeRef hdl_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + LLVMValueRef coro_mem = LLVMBuildAlloca(gallivm->builder, hdl_ptr_type, "coro_mem"); + LLVMBuildStore(builder, LLVMConstNull(hdl_ptr_type), coro_mem); + LLVMValueRef coro_hdls = LLVMBuildArrayAlloca(gallivm->builder, hdl_ptr_type, coro_num_hdls, "coro_hdls"); unsigned end_coroutine = INT_MAX; @@ -211,7 +217,7 @@ generate_compute(struct llvmpipe_context *lp, lp_build_loop_begin(&loop_state[0], gallivm, lp_build_const_int32(gallivm, 0)); /* x loop */ { - LLVMValueRef args[18]; + LLVMValueRef args[19]; args[0] = context_ptr; args[1] = loop_state[0].counter; args[2] = loop_state[1].counter; @@ -240,6 +246,8 @@ generate_compute(struct llvmpipe_context *lp, loop_state[0].counter, ""); args[17] = coro_hdl_idx; + + args[18] = coro_mem; LLVMValueRef coro_entry = LLVMBuildGEP(gallivm->builder, coro_hdls, &coro_hdl_idx, 1, ""); LLVMValueRef coro_hdl = LLVMBuildLoad(gallivm->builder, coro_entry, "coro_hdl"); @@ -249,7 +257,7 @@ generate_compute(struct llvmpipe_context *lp, lp_build_const_int32(gallivm, 0), ""); /* first time here - call the coroutine function entry point */ lp_build_if(&ifstate, gallivm, cmp); - LLVMValueRef coro_ret = LLVMBuildCall(gallivm->builder, coro, args, 18, ""); + LLVMValueRef coro_ret = LLVMBuildCall(gallivm->builder, coro, args, 19, ""); LLVMBuildStore(gallivm->builder, coro_ret, coro_entry); lp_build_else(&ifstate); /* subsequent calls for this invocation - check if done. */ @@ -278,6 +286,10 @@ generate_compute(struct llvmpipe_context *lp, lp_build_loop_end_cond(&loop_state[3], lp_build_const_int32(gallivm, end_coroutine), NULL, LLVMIntEQ); + + LLVMValueRef coro_mem_ptr = LLVMBuildLoad(builder, coro_mem, ""); + LLVMBuildCall(gallivm->builder, gallivm->coro_free_hook, &coro_mem_ptr, 1, ""); + LLVMBuildRetVoid(builder); /* This is stage (b) - generate the compute shader code inside the coroutine. 
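 *
 * Stage (a) above drives these coroutines roughly like the following
 * scalar pseudo-C (hdl[] is the coro_hdls array, coro() the entry
 * point built here, and done() stands for the completion check):
 *
 *    do {
 *       for (i = 0; i < num_hdls; i++) {
 *          if (!hdl[i])
 *             hdl[i] = coro(args, i);   // first entry for invocation i
 *          else if (!done(hdl[i]))
 *             resume(hdl[i]);
 *       }
 *    } while (!all_done);   // exits via the end_coroutine marker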
*/ @@ -300,6 +312,7 @@ generate_compute(struct llvmpipe_context *lp, block_y_size_arg = LLVMGetParam(coro, 15); block_z_size_arg = LLVMGetParam(coro, 16); LLVMValueRef coro_idx = LLVMGetParam(coro, 17); + coro_mem = LLVMGetParam(coro, 18); block = LLVMAppendBasicBlockInContext(gallivm->context, coro, "entry"); LLVMPositionBuilderAtEnd(builder, block); { @@ -319,10 +332,16 @@ generate_compute(struct llvmpipe_context *lp, shared_ptr = lp_jit_cs_thread_data_shared(gallivm, thread_data_ptr); + LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, ""); + coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, block_z_size_arg, ""); + /* these are coroutine entrypoint necessities */ LLVMValueRef coro_id = lp_build_coro_id(gallivm); - LLVMValueRef coro_hdl = lp_build_coro_begin_alloc_mem(gallivm, coro_id); + LLVMValueRef coro_entry = lp_build_coro_alloc_mem_array(gallivm, coro_mem, coro_idx, coro_num_hdls); + LLVMValueRef alloced_ptr = LLVMBuildLoad(gallivm->builder, coro_mem, ""); + alloced_ptr = LLVMBuildGEP(gallivm->builder, alloced_ptr, &coro_entry, 1, ""); + LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloced_ptr); LLVMValueRef has_partials = LLVMBuildICmp(gallivm->builder, LLVMIntNE, partials, lp_build_const_int32(gallivm, 0), ""); LLVMValueRef tid_vals[3]; LLVMValueRef tids_x[LP_MAX_VECTOR_LENGTH], tids_y[LP_MAX_VECTOR_LENGTH], tids_z[LP_MAX_VECTOR_LENGTH]; @@ -417,8 +436,6 @@ generate_compute(struct llvmpipe_context *lp, lp_build_coro_suspend_switch(gallivm, &coro_info, NULL, true); LLVMPositionBuilderAtEnd(builder, clean_block); - lp_build_coro_free_mem(gallivm, coro_id, coro_hdl); - LLVMBuildBr(builder, sus_block); LLVMPositionBuilderAtEnd(builder, sus_block); @@ -561,7 +578,7 @@ make_variant_key(struct llvmpipe_context *lp, int i; struct lp_compute_shader_variant_key *key; key = (struct lp_compute_shader_variant_key *)store; - memset(key, 0, offsetof(struct lp_compute_shader_variant_key, samplers[1])); + memset(key, 0, sizeof(*key)); /* This value will be the same for all the variants of a given shader: */ @@ -569,7 +586,9 @@ make_variant_key(struct llvmpipe_context *lp, struct lp_sampler_static_state *cs_sampler; - cs_sampler = key->samplers; + cs_sampler = lp_cs_variant_key_samplers(key); + + memset(cs_sampler, 0, MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *cs_sampler); for(i = 0; i < key->nr_samplers; ++i) { if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { lp_sampler_static_sampler_state(&cs_sampler[i].sampler_state, @@ -625,7 +644,8 @@ dump_cs_variant_key(const struct lp_compute_shader_variant_key *key) debug_printf("cs variant %p:\n", (void *) key); for (i = 0; i < key->nr_samplers; ++i) { - const struct lp_static_sampler_state *sampler = &key->samplers[i].sampler_state; + const struct lp_sampler_static_state *samplers = lp_cs_variant_key_samplers(key); + const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state; debug_printf("sampler[%u] = \n", i); debug_printf(" .wrap = %s %s %s\n", util_str_tex_wrap(sampler->wrap_s, TRUE), @@ -647,7 +667,8 @@ dump_cs_variant_key(const struct lp_compute_shader_variant_key *key) debug_printf(" .aniso = %u\n", sampler->aniso); } for (i = 0; i < key->nr_sampler_views; ++i) { - const struct lp_static_texture_state *texture = &key->samplers[i].texture_state; + const struct lp_sampler_static_state *samplers = lp_cs_variant_key_samplers(key); + const struct lp_static_texture_state *texture = &samplers[i].texture_state; debug_printf("texture[%u] = 
\n", i); debug_printf(" .format = %s\n", util_format_name(texture->format)); @@ -1361,9 +1382,9 @@ static void llvmpipe_launch_grid(struct pipe_context *pipe, struct lp_cs_tpool_task *task; mtx_lock(&screen->cs_mutex); task = lp_cs_tpool_queue_task(screen->cs_tpool, cs_exec_fn, &job_info, num_tasks); + mtx_unlock(&screen->cs_mutex); lp_cs_tpool_wait_for_task(screen->cs_tpool, &task); - mtx_unlock(&screen->cs_mutex); } llvmpipe->pipeline_statistics.cs_invocations += num_tasks * info->block[0] * info->block[1] * info->block[2]; } @@ -1443,6 +1464,9 @@ lp_csctx_destroy(struct lp_cs_context *csctx) for (i = 0; i < ARRAY_SIZE(csctx->ssbos); i++) { pipe_resource_reference(&csctx->ssbos[i].current.buffer, NULL); } + for (i = 0; i < ARRAY_SIZE(csctx->images); i++) { + pipe_resource_reference(&csctx->images[i].current.resource, NULL); + } FREE(csctx); } diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.h b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.h index 61267aadf0..8b578424b4 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.h +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_cs.h @@ -42,29 +42,32 @@ struct lp_compute_shader_variant_key unsigned nr_samplers:8; unsigned nr_sampler_views:8; unsigned nr_images:8; - /* followed by variable number of images */ - struct lp_sampler_static_state samplers[1]; }; #define LP_CS_MAX_VARIANT_KEY_SIZE \ (sizeof(struct lp_compute_shader_variant_key) + \ - PIPE_MAX_SHADER_SAMPLER_VIEWS * sizeof(struct lp_sampler_static_state) +\ + PIPE_MAX_SHADER_SAMPLER_VIEWS * sizeof(struct lp_sampler_static_state) + \ PIPE_MAX_SHADER_IMAGES * sizeof(struct lp_image_static_state)) static inline size_t lp_cs_variant_key_size(unsigned nr_samplers, unsigned nr_images) { - unsigned samplers = nr_samplers > 1 ? 
(nr_samplers - 1) : 0; return (sizeof(struct lp_compute_shader_variant_key) + - samplers * sizeof(struct lp_sampler_static_state) + + nr_samplers * sizeof(struct lp_sampler_static_state) + nr_images * sizeof(struct lp_image_static_state)); } +static inline struct lp_sampler_static_state * +lp_cs_variant_key_samplers(const struct lp_compute_shader_variant_key *key) +{ + return (struct lp_sampler_static_state *)&(key[1]); +} + static inline struct lp_image_static_state * lp_cs_variant_key_images(const struct lp_compute_shader_variant_key *key) { return (struct lp_image_static_state *) - &key->samplers[key->nr_samplers]; + &(lp_cs_variant_key_samplers(key)[key->nr_samplers]); } struct lp_cs_variant_list_item diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.c index 12c42b0cd7..80dd95f01e 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -67,6 +67,7 @@ #include "util/u_string.h" #include "util/simple_list.h" #include "util/u_dual_blend.h" +#include "util/u_upload_mgr.h" #include "util/os_time.h" #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" @@ -451,10 +452,13 @@ static LLVMValueRef fs_interp(const struct lp_build_fs_iface *iface, } static void fs_fb_fetch(const struct lp_build_fs_iface *iface, - struct lp_build_context *bld, - unsigned cbuf, - LLVMValueRef result[4]) + struct lp_build_context *bld, + int location, + LLVMValueRef result[4]) { + assert(location >= FRAG_RESULT_DATA0 && location <= FRAG_RESULT_DATA7); + const int cbuf = location - FRAG_RESULT_DATA0; + struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface; struct gallivm_state *gallivm = bld->gallivm; LLVMBuilderRef builder = gallivm->builder; @@ -589,6 +593,7 @@ generate_fs_loop(struct gallivm_state *gallivm, LLVMValueRef stencil_refs[2]; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples); + LLVMValueRef z_out = NULL, s_out = NULL; struct lp_build_for_loop_state loop_state, sample_loop_state = {0}; struct lp_build_mask_context mask; /* @@ -700,6 +705,17 @@ generate_fs_loop(struct gallivm_state *gallivm, color_store_size, "color1"); } } + if (shader->info.base.writes_z) { + z_out = lp_build_array_alloca(gallivm, + lp_build_vec_type(gallivm, type), + color_store_size, "depth"); + } + + if (shader->info.base.writes_stencil) { + s_out = lp_build_array_alloca(gallivm, + lp_build_vec_type(gallivm, type), + color_store_size, "depth"); + } lp_build_for_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0), @@ -1049,6 +1065,33 @@ generate_fs_loop(struct gallivm_state *gallivm, LLVMBuildStore(builder, output_smask, out_sample_mask_storage); } + if (shader->info.base.writes_z) { + int pos0 = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_POSITION, + 0); + LLVMValueRef out = LLVMBuildLoad(builder, outputs[pos0][2], ""); + LLVMValueRef idx = loop_state.counter; + if (key->min_samples > 1) + idx = LLVMBuildAdd(builder, idx, + LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), ""); + LLVMValueRef ptr = LLVMBuildGEP(builder, z_out, &idx, 1, ""); + LLVMBuildStore(builder, out, ptr); + } + + if (shader->info.base.writes_stencil) { + int sten_out = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_STENCIL, + 0); + LLVMValueRef out = LLVMBuildLoad(builder, outputs[sten_out][1], "output.s"); + 
LLVMValueRef idx = loop_state.counter; + if (key->min_samples > 1) + idx = LLVMBuildAdd(builder, idx, + LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), ""); + LLVMValueRef ptr = LLVMBuildGEP(builder, s_out, &idx, 1, ""); + LLVMBuildStore(builder, out, ptr); + } + + /* Color write - per fragment sample */ for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib) { @@ -1119,14 +1162,13 @@ generate_fs_loop(struct gallivm_state *gallivm, /* Late Z test */ if (depth_mode & LATE_DEPTH_TEST) { - int pos0 = find_output_by_semantic(&shader->info.base, - TGSI_SEMANTIC_POSITION, - 0); - int s_out = find_output_by_semantic(&shader->info.base, - TGSI_SEMANTIC_STENCIL, - 0); - if (pos0 != -1 && outputs[pos0][2]) { - z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z"); + if (shader->info.base.writes_z) { + LLVMValueRef idx = loop_state.counter; + if (key->min_samples > 1) + idx = LLVMBuildAdd(builder, idx, + LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), ""); + LLVMValueRef ptr = LLVMBuildGEP(builder, z_out, &idx, 1, ""); + z = LLVMBuildLoad(builder, ptr, "output.z"); } else { if (key->multisample) { lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL); @@ -1148,10 +1190,15 @@ generate_fs_loop(struct gallivm_state *gallivm, lp_build_const_vec(gallivm, type, 1.0)); } - if (s_out != -1 && outputs[s_out][1]) { + if (shader->info.base.writes_stencil) { + LLVMValueRef idx = loop_state.counter; + if (key->min_samples > 1) + idx = LLVMBuildAdd(builder, idx, + LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), ""); + LLVMValueRef ptr = LLVMBuildGEP(builder, s_out, &idx, 1, ""); + stencil_refs[0] = LLVMBuildLoad(builder, ptr, "output.s"); /* there's only one value, and spec says to discard additional bits */ LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255); - stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s"); stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, ""); stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, ""); stencil_refs[1] = stencil_refs[0]; @@ -1665,6 +1712,15 @@ scale_bits(struct gallivm_state *gallivm, int delta_bits = src_bits - dst_bits; if (delta_bits <= dst_bits) { + + if (dst_bits == 4) { + struct lp_type flt_type = lp_type_float_vec(32, src_type.length * 32); + + result = lp_build_unsigned_norm_to_float(gallivm, src_bits, flt_type, src); + result = lp_build_clamped_float_to_unsigned_norm(gallivm, flt_type, dst_bits, result); + return result; + } + /* * Approximate the rescaling with a single shift. 
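 *
 * Worked case for the dst_bits == 4 branch above: the exact 8 -> 4 bit
 * conversion is round(src * 15 / 255.0), so src = 0x0f gives
 * round(0.88) = 1, while the single-shift approximation 0x0f >> 4
 * yields 0, an error of a whole representable step; hence the float
 * round-trip for 4-bit destinations.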
* @@ -3114,7 +3170,7 @@ generate_fragment(struct llvmpipe_context *lp, } /* code generated texture sampling */ - sampler = lp_llvm_sampler_soa_create(key->samplers, key->nr_samplers); + sampler = lp_llvm_sampler_soa_create(lp_fs_variant_key_samplers(key), key->nr_samplers); image = lp_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images); num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ @@ -3388,7 +3444,8 @@ dump_fs_variant_key(struct lp_fragment_shader_variant_key *key) debug_printf("blend.alpha_to_coverage is enabled\n"); } for (i = 0; i < key->nr_samplers; ++i) { - const struct lp_static_sampler_state *sampler = &key->samplers[i].sampler_state; + const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key); + const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state; debug_printf("sampler[%u] = \n", i); debug_printf(" .wrap = %s %s %s\n", util_str_tex_wrap(sampler->wrap_s, TRUE), @@ -3411,7 +3468,8 @@ dump_fs_variant_key(struct lp_fragment_shader_variant_key *key) debug_printf(" .aniso = %u\n", sampler->aniso); } for (i = 0; i < key->nr_sampler_views; ++i) { - const struct lp_static_texture_state *texture = &key->samplers[i].texture_state; + const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key); + const struct lp_static_texture_state *texture = &samplers[i].texture_state; debug_printf("texture[%u] = \n", i); debug_printf(" .format = %s\n", util_format_name(texture->format)); @@ -3601,15 +3659,16 @@ generate_variant(struct llvmpipe_context *lp, shader->kind == LP_FS_KIND_BLIT_RGB1)) { unsigned target, min_img_filter, mag_img_filter, min_mip_filter; enum pipe_format texture_format; - - texture_format = key->samplers[0].texture_state.format; - target = key->samplers[0].texture_state.target; - min_img_filter = key->samplers[0].sampler_state.min_img_filter; - mag_img_filter = key->samplers[0].sampler_state.mag_img_filter; - if (key->samplers[0].texture_state.level_zero_only) { + struct lp_sampler_static_state *samp0 = lp_fs_variant_key_sampler_idx(key, 0); + assert(samp0); + texture_format = samp0->texture_state.format; + target = samp0->texture_state.target; + min_img_filter = samp0->sampler_state.min_img_filter; + mag_img_filter = samp0->sampler_state.mag_img_filter; + if (samp0->texture_state.level_zero_only) { min_mip_filter = PIPE_TEX_MIPFILTER_NONE; } else { - min_mip_filter = key->samplers[0].sampler_state.min_mip_filter; + min_mip_filter = samp0->sampler_state.min_mip_filter; } if (target == PIPE_TEXTURE_2D && @@ -3777,7 +3836,6 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, for (i = 0; i < shader->info.base.num_inputs; i++) { shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i]; - shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i]; shader->inputs[i].location = shader->info.base.input_interpolate_loc[i]; switch (shader->info.base.input_interpolate[i]) { @@ -3942,7 +4000,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe, const struct pipe_constant_buffer *cb) { struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); - struct pipe_resource *constants = cb ? 
cb->buffer : NULL; + struct pipe_constant_buffer *constants = &llvmpipe->constants[shader][index]; assert(shader < PIPE_SHADER_TYPES); assert(index < ARRAY_SIZE(llvmpipe->constants[shader])); @@ -3951,10 +4009,19 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe, util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb, take_ownership); - if (constants) { - if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) { + /* user_buffer is only valid until the next set_constant_buffer (at most, + * possibly until shader deletion), so we need to upload it now to make sure + * it doesn't get updated/freed out from under us. + */ + if (constants->user_buffer) { + u_upload_data(llvmpipe->pipe.const_uploader, 0, constants->buffer_size, 16, + constants->user_buffer, &constants->buffer_offset, + &constants->buffer); + } + if (constants->buffer) { + if (!(constants->buffer->bind & PIPE_BIND_CONSTANT_BUFFER)) { debug_printf("Illegal set constant without bind flag\n"); - constants->bind |= PIPE_BIND_CONSTANT_BUFFER; + constants->buffer->bind |= PIPE_BIND_CONSTANT_BUFFER; } } @@ -3964,20 +4031,10 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe, shader == PIPE_SHADER_TESS_EVAL) { /* Pass the constants to the 'draw' module */ const unsigned size = cb ? cb->buffer_size : 0; - const ubyte *data; - if (constants) { - data = (ubyte *) llvmpipe_resource_data(constants); - } - else if (cb && cb->user_buffer) { - data = (ubyte *) cb->user_buffer; - } - else { - data = NULL; - } - - if (data) - data += cb->buffer_offset; + const ubyte *data = NULL; + if (constants->buffer) + data = (ubyte *) llvmpipe_resource_data(constants->buffer) + constants->buffer_offset; draw_set_mapped_constant_buffer(llvmpipe->draw, shader, index, data, size); @@ -3986,10 +4043,6 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe, llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS; else llvmpipe->dirty |= LP_NEW_FS_CONSTANTS; - - if (cb && cb->user_buffer) { - pipe_resource_reference(&constants, NULL); - } } static void @@ -4100,7 +4153,7 @@ make_variant_key(struct llvmpipe_context *lp, key = (struct lp_fragment_shader_variant_key *)store; - memset(key, 0, offsetof(struct lp_fragment_shader_variant_key, samplers[1])); + memset(key, 0, sizeof(*key)); if (lp->framebuffer.zsbuf) { enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format; @@ -4127,10 +4180,8 @@ make_variant_key(struct llvmpipe_context *lp, /* * Propagate the depth clamp setting from the rasterizer state. - * depth_clip == 0 implies depth clamping is enabled. - * */ - key->depth_clamp = (lp->rasterizer->depth_clip_near == 0) ? 
1 : 0; + key->depth_clamp = lp->rasterizer->depth_clamp; /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */ if (!lp->framebuffer.nr_cbufs || @@ -4247,7 +4298,7 @@ make_variant_key(struct llvmpipe_context *lp, struct lp_sampler_static_state *fs_sampler; - fs_sampler = key->samplers; + fs_sampler = lp_fs_variant_key_samplers(key); memset(fs_sampler, 0, MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler); @@ -4298,8 +4349,10 @@ make_variant_key(struct llvmpipe_context *lp, } if (shader->kind == LP_FS_KIND_AERO_MINIFICATION) { - key->samplers[0].sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST; - key->samplers[0].sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST; + struct lp_sampler_static_state *samp0 = lp_fs_variant_key_sampler_idx(key, 0); + assert(samp0); + samp0->sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST; + samp0->sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST; } return key; diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.h b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.h index 9f43665d83..f88ef70dbd 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -111,9 +111,7 @@ struct lp_fragment_shader_variant_key uint8_t zsbuf_nr_samples; uint8_t coverage_samples; uint8_t min_samples; - - struct lp_sampler_static_state samplers[1]; - /* followed by variable number of images */ + /* followed by variable number of samplers + images */ }; #define LP_FS_MAX_VARIANT_KEY_SIZE \ @@ -124,17 +122,30 @@ struct lp_fragment_shader_variant_key static inline size_t lp_fs_variant_key_size(unsigned nr_samplers, unsigned nr_images) { - unsigned samplers = nr_samplers > 1 ? (nr_samplers - 1) : 0; return (sizeof(struct lp_fragment_shader_variant_key) + - samplers * sizeof(struct lp_sampler_static_state) + + nr_samplers * sizeof(struct lp_sampler_static_state) + nr_images * sizeof(struct lp_image_static_state)); } +static inline struct lp_sampler_static_state * +lp_fs_variant_key_samplers(const struct lp_fragment_shader_variant_key *key) +{ + return (struct lp_sampler_static_state *)&(key[1]); +} + +static inline struct lp_sampler_static_state * +lp_fs_variant_key_sampler_idx(const struct lp_fragment_shader_variant_key *key, int idx) +{ + if (idx >= key->nr_samplers) + return NULL; + return &lp_fs_variant_key_samplers(key)[idx]; +} + static inline struct lp_image_static_state * lp_fs_variant_key_images(struct lp_fragment_shader_variant_key *key) { return (struct lp_image_static_state *) - &key->samplers[key->nr_samplers]; + &(lp_fs_variant_key_samplers(key)[key->nr_samplers]); } /** doubly-linked list item */ diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs_linear.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs_linear.c index a8d10ae300..fb710701b1 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs_linear.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_fs_linear.c @@ -667,16 +667,20 @@ is_one_inv_src_alpha_blend(const struct lp_fragment_shader_variant *variant) void llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant) { - enum pipe_format tex_format = variant->key.samplers[0].texture_state.format; + struct lp_sampler_static_state *samp0 = lp_fs_variant_key_sampler_idx(&variant->key, 0); if (LP_PERF & PERF_NO_SHADE) { variant->jit_linear = linear_red; return; } + if (!samp0) + return; + + enum pipe_format tex_format = 
samp0->texture_state.format; if (variant->shader->kind == LP_FS_KIND_BLIT_RGBA && tex_format == PIPE_FORMAT_B8G8R8A8_UNORM && - is_nearest_clamp_sampler(&variant->key.samplers[0])) { + is_nearest_clamp_sampler(samp0)) { if (variant->opaque) { variant->jit_linear_blit = blit_rgba_blit; variant->jit_linear = blit_rgba; @@ -692,7 +696,7 @@ llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant) variant->opaque && (tex_format == PIPE_FORMAT_B8G8R8A8_UNORM || tex_format == PIPE_FORMAT_B8G8R8X8_UNORM) && - is_nearest_clamp_sampler(&variant->key.samplers[0])) { + is_nearest_clamp_sampler(samp0)) { variant->jit_linear_blit = blit_rgb1_blit; variant->jit_linear = blit_rgb1; return; diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_sampler.c index 5332bb049c..aa47cab3ab 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_sampler.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_sampler.c @@ -120,6 +120,7 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); @@ -150,8 +151,15 @@ llvmpipe_set_sampler_views(struct pipe_context *pipe, if (view) llvmpipe_flush_resource(pipe, view->texture, 0, true, false, false, "sampler_view"); - pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i], - view); + + if (take_ownership) { + pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i], + NULL); + llvmpipe->sampler_views[shader][start + i] = view; + } else { + pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i], + view); + } } for (; i < num + unbind_num_trailing_slots; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_setup.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_setup.c index e214d6e48c..9f385a084e 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_setup.c +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -201,7 +201,7 @@ lp_twoside(struct gallivm_state *gallivm, } -static void +static LLVMValueRef lp_do_offset_tri(struct gallivm_state *gallivm, struct lp_setup_args *args, const struct lp_setup_variant_key *key, @@ -215,9 +215,7 @@ lp_do_offset_tri(struct gallivm_state *gallivm, struct lp_build_context int_scalar_bld; struct lp_build_context *bld = &args->bld; LLVMValueRef zoffset, mult; - LLVMValueRef z0_new, z1_new, z2_new; LLVMValueRef dzdxdzdy, dzdx, dzdy, dzxyz20, dyzzx01, dyzzx01_dzxyz20, dzx01_dyz20; - LLVMValueRef z0z1, z0z1z2; LLVMValueRef max, max_value, res12; LLVMValueRef shuffles[4]; LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context); @@ -268,8 +266,8 @@ lp_do_offset_tri(struct gallivm_state *gallivm, if (key->floating_point_depth) { /* - * bias = pgon_offset_units * 2^(exponent(max(z0, z1, z2)) - mantissa_bits) + - * MAX2(dzdx, dzdy) * pgon_offset_scale + * bias = pgon_offset_units * 2^(exponent(max(abs(z0), abs(z1), abs(z2))) - + * mantissa_bits) + MAX2(dzdx, dzdy) * pgon_offset_scale * * NOTE: Assumes IEEE float32. 
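 *
 * Scalar C sketch of the bias formula above (units/scale stand for
 * pgon_offset_units/pgon_offset_scale, maxz for the max of the three
 * absolute z values, and 23 is the binary32 mantissa width):
 *
 *    union { float f; uint32_t u; } v = { .f = maxz };
 *    int exp = (int)((v.u >> 23) & 0xff) - 127;   // unbiased exponent
 *    bias = units * exp2f(exp - 23) + MAX2(dzdx, dzdy) * scale;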
*/ @@ -282,11 +280,14 @@ lp_do_offset_tri(struct gallivm_state *gallivm, exp_mask = lp_build_const_int32(gallivm, 0xff << 23); maxz0z1_value = lp_build_max(&flt_scalar_bld, - LLVMBuildExtractElement(b, attribv[0], twoi, ""), - LLVMBuildExtractElement(b, attribv[1], twoi, "")); + lp_build_abs(&flt_scalar_bld, + LLVMBuildExtractElement(b, attribv[0], twoi, "")), + lp_build_abs(&flt_scalar_bld, + LLVMBuildExtractElement(b, attribv[1], twoi, ""))); maxz_value = lp_build_max(&flt_scalar_bld, - LLVMBuildExtractElement(b, attribv[2], twoi, ""), + lp_build_abs(&flt_scalar_bld, + LLVMBuildExtractElement(b, attribv[2], twoi, "")), maxz0z1_value); exp = LLVMBuildBitCast(b, maxz_value, int_scalar_bld.vec_type, ""); @@ -323,29 +324,7 @@ lp_do_offset_tri(struct gallivm_state *gallivm, zoffset); } - /* yuck */ - shuffles[0] = twoi; - shuffles[1] = lp_build_const_int32(gallivm, 6); - shuffles[2] = LLVMGetUndef(shuf_type); - shuffles[3] = LLVMGetUndef(shuf_type); - z0z1 = LLVMBuildShuffleVector(b, attribv[0], attribv[1], LLVMConstVector(shuffles, 4), ""); - shuffles[0] = zeroi; - shuffles[1] = onei; - shuffles[2] = lp_build_const_int32(gallivm, 6); - shuffles[3] = LLVMGetUndef(shuf_type); - z0z1z2 = LLVMBuildShuffleVector(b, z0z1, attribv[2], LLVMConstVector(shuffles, 4), ""); - zoffset = lp_build_broadcast_scalar(bld, zoffset); - - z0z1z2 = LLVMBuildFAdd(b, z0z1z2, zoffset, ""); - - /* insert into args->a0.z, a1.z, a2.z: - */ - z0_new = LLVMBuildExtractElement(b, z0z1z2, zeroi, ""); - z1_new = LLVMBuildExtractElement(b, z0z1z2, onei, ""); - z2_new = LLVMBuildExtractElement(b, z0z1z2, twoi, ""); - attribv[0] = LLVMBuildInsertElement(b, attribv[0], z0_new, twoi, ""); - attribv[1] = LLVMBuildInsertElement(b, attribv[1], z1_new, twoi, ""); - attribv[2] = LLVMBuildInsertElement(b, attribv[2], z2_new, twoi, ""); + return zoffset; } static void @@ -389,12 +368,12 @@ load_attribute(struct gallivm_state *gallivm, * which obviously wouldn't work)). */ static void -emit_coef4( struct gallivm_state *gallivm, +calc_coef4( struct gallivm_state *gallivm, struct lp_setup_args *args, - unsigned slot, LLVMValueRef a0, LLVMValueRef a1, - LLVMValueRef a2) + LLVMValueRef a2, + LLVMValueRef out[3]) { LLVMBuilderRef b = gallivm->builder; LLVMValueRef attr_0; @@ -426,7 +405,23 @@ emit_coef4( struct gallivm_state *gallivm, LLVMValueRef attr_v0 = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0"); attr_0 = LLVMBuildFSub(b, a0, attr_v0, "attr_0"); - store_coef(gallivm, args, slot, attr_0, dadx, dady); + out[0] = attr_0; + out[1] = dadx; + out[2] = dady; +} + +static void +emit_coef4( struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef a1, + LLVMValueRef a2) +{ + LLVMValueRef coeffs[3]; + calc_coef4(gallivm, args, a0, a1, a2, coeffs); + store_coef(gallivm, args, slot, + coeffs[0], coeffs[1], coeffs[2]); } @@ -476,82 +471,6 @@ apply_perspective_corr( struct gallivm_state *gallivm, } -/** - * Apply cylindrical wrapping to vertex attributes if enabled. - * Input coordinates must be in [0, 1] range, otherwise results are undefined. 
- * - * @param cyl_wrap TGSI_CYLINDRICAL_WRAP_x flags - */ -static void -emit_apply_cyl_wrap(struct gallivm_state *gallivm, - struct lp_setup_args *args, - uint cyl_wrap, - LLVMValueRef attribv[3]) - -{ - LLVMBuilderRef builder = gallivm->builder; - struct lp_type type = args->bld.type; - LLVMTypeRef float_vec_type = args->bld.vec_type; - LLVMValueRef pos_half; - LLVMValueRef neg_half; - LLVMValueRef cyl_mask; - LLVMValueRef offset; - LLVMValueRef delta; - LLVMValueRef one; - - if (!cyl_wrap) - return; - - /* Constants */ - pos_half = lp_build_const_vec(gallivm, type, +0.5f); - neg_half = lp_build_const_vec(gallivm, type, -0.5f); - cyl_mask = lp_build_const_mask_aos(gallivm, type, cyl_wrap, 4); - - one = lp_build_const_vec(gallivm, type, 1.0f); - one = LLVMBuildBitCast(builder, one, lp_build_int_vec_type(gallivm, type), ""); - one = LLVMBuildAnd(builder, one, cyl_mask, ""); - - /* Edge v0 -> v1 */ - delta = LLVMBuildFSub(builder, attribv[1], attribv[0], ""); - - offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, ""); - - offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, ""); - - /* Edge v1 -> v2 */ - delta = LLVMBuildFSub(builder, attribv[2], attribv[1], ""); - - offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, ""); - - offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, ""); - - /* Edge v2 -> v0 */ - delta = LLVMBuildFSub(builder, attribv[0], attribv[2], ""); - - offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, ""); - - offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); - offset = LLVMBuildAnd(builder, offset, one, ""); - offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); - attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, ""); -} - - /** * Compute the inputs-> dadx, dady, a0 values. 
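 *
 * For an attribute with values (a0, a1, a2) at the three vertices this
 * solves the plane equation a(x, y) = A0 + dadx * x + dady * y. One
 * common formulation, with da01 = a0 - a1, da20 = a2 - a0, dx/dy the
 * matching components of the edge vectors dxy01/dxy20, and ooa the
 * reciprocal of the signed (doubled) triangle area:
 *
 *    dadx = (da01 * dy20 - da20 * dy01) * ooa
 *    dady = (da20 * dx01 - da01 * dx20) * ooa
 *    A0   = a0 - (dadx * x0 + dady * y0)
 *
 * The last line is the attr_0 = a0 - attr_v0 step in calc_coef4 above.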
*/ @@ -580,13 +499,11 @@ emit_tri_coef( struct gallivm_state *gallivm, case LP_INTERP_LINEAR: load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); - emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs); emit_linear_coef(gallivm, args, slot+1, attribs); break; case LP_INTERP_PERSPECTIVE: load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); - emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs); apply_perspective_corr(gallivm, args, slot+1, attribs); emit_linear_coef(gallivm, args, slot+1, attribs); break; @@ -637,6 +554,7 @@ init_args(struct gallivm_state *gallivm, LLVMValueRef e, f, ef, ooa; LLVMValueRef shuffles[4], shuf10; LLVMValueRef attr_pos[3]; + LLVMValueRef polygon_offset; struct lp_type typef4 = lp_type_float_vec(32, 128); struct lp_build_context bld; @@ -677,7 +595,9 @@ /* tri offset calc shares a lot of arithmetic, do it here */ if (key->pgon_offset_scale != 0.0f || key->pgon_offset_units != 0.0f) { - lp_do_offset_tri(gallivm, args, key, ooa, dxy01, dxy20, attr_pos); + polygon_offset = lp_do_offset_tri(gallivm, args, key, ooa, dxy01, dxy20, attr_pos); + } else { + polygon_offset = lp_build_const_float(gallivm, 0.0f); + } dxy20 = LLVMBuildFMul(b, dxy20, ooa, ""); @@ -692,7 +612,22 @@ args->x0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, zeroi); args->y0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, onei); - emit_linear_coef(gallivm, args, 0, attr_pos); + LLVMValueRef coeffs[3]; + calc_coef4(gallivm, args, + attr_pos[0], attr_pos[1], attr_pos[2], + coeffs); + + /* This is a bit sneaky: + * Because we observe that the X component of A0 is otherwise unused, + * we can overwrite it with the computed polygon-offset value, to make + * sure it's available in the fragment shader without having to change + * the interface (which is error-prone).
+ */
+ coeffs[0] = LLVMBuildInsertElement(b, coeffs[0], polygon_offset,
+                                    lp_build_const_int32(gallivm, 0), "");
+
+ store_coef(gallivm, args, 0,
+            coeffs[0], coeffs[1], coeffs[2]);
 }

 /**
diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_tess.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_tess.c
index b9e919e76f..28cc1258b7 100644
--- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_tess.c
+++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_state_tess.c
@@ -181,6 +181,14 @@ llvmpipe_set_tess_state(struct pipe_context *pipe,
    draw_set_tess_state(llvmpipe->draw, default_outer_level,
                        default_inner_level);
 }

+static void
+llvmpipe_set_patch_vertices(struct pipe_context *pipe, uint8_t patch_vertices)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+
+   llvmpipe->patch_vertices = patch_vertices;
+}
+
 void
 llvmpipe_init_tess_funcs(struct llvmpipe_context *llvmpipe)
 {
@@ -193,4 +201,5 @@ llvmpipe_init_tess_funcs(struct llvmpipe_context *llvmpipe)
    llvmpipe->pipe.delete_tes_state = llvmpipe_delete_tes_state;
    llvmpipe->pipe.set_tess_state = llvmpipe_set_tess_state;
+   llvmpipe->pipe.set_patch_vertices = llvmpipe_set_patch_vertices;
 }
diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.c b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.c
index eb635b6fc9..5bfc8dbc97 100644
--- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -52,6 +52,7 @@
 #include "lp_rast.h"
 #include "frontend/sw_winsys.h"
+#include "git_sha1.h"
 #ifndef _WIN32
 #include "drm-uapi/drm_fourcc.h"
@@ -334,6 +335,104 @@ llvmpipe_resource_create_unbacked(struct pipe_screen *_screen,
    return pt;
 }

+static struct pipe_memory_object *
+llvmpipe_memobj_create_from_handle(struct pipe_screen *pscreen,
+                                   struct winsys_handle *handle,
+                                   bool dedicated)
+{
+#ifdef PIPE_MEMORY_FD
+   struct llvmpipe_memory_object *memobj = CALLOC_STRUCT(llvmpipe_memory_object);
+
+   if (handle->type == WINSYS_HANDLE_TYPE_FD &&
+       pscreen->import_memory_fd(pscreen, handle->handle, &memobj->data, &memobj->size)) {
+      return &memobj->b;
+   }
+   free(memobj);
+#endif
+   return NULL;
+}
+
+static void
+llvmpipe_memobj_destroy(struct pipe_screen *pscreen,
+                        struct pipe_memory_object *memobj)
+{
+   if (!memobj)
+      return;
+   struct llvmpipe_memory_object *lpmo = llvmpipe_memory_object(memobj);
+#ifdef PIPE_MEMORY_FD
+   pscreen->free_memory_fd(pscreen, lpmo->data);
+#endif
+   free(lpmo);
+}
+
+static struct pipe_resource *
+llvmpipe_resource_from_memobj(struct pipe_screen *pscreen,
+                              const struct pipe_resource *templat,
+                              struct pipe_memory_object *memobj,
+                              uint64_t offset)
+{
+   if (!memobj)
+      return NULL;
+   struct llvmpipe_screen *screen = llvmpipe_screen(pscreen);
+   struct llvmpipe_memory_object *lpmo = llvmpipe_memory_object(memobj);
+   struct llvmpipe_resource *lpr = CALLOC_STRUCT(llvmpipe_resource);
+   lpr->base = *templat;
+
+   lpr->screen = screen;
+   pipe_reference_init(&lpr->base.reference, 1);
+   lpr->base.screen = &screen->base;
+
+   if (llvmpipe_resource_is_texture(&lpr->base)) {
+      /* texture map */
+      if (!llvmpipe_texture_layout(screen, lpr, false))
+         goto fail;
+      if (lpmo->size < lpr->size_required)
+         goto fail;
+      lpr->tex_data = lpmo->data;
+   }
+   else {
+      /* other data (vertex buffer, const buffer, etc) */
+      const uint bytes = templat->width0;
+      assert(util_format_get_blocksize(templat->format) == 1);
+      assert(templat->height0 == 1);
+      assert(templat->depth0 == 1);
+      assert(templat->last_level == 0);
+      /*
+       * Reserve some extra storage since if we'd render to a buffer we
+       * always read/write LP_RASTER_BLOCK_SIZE pixels, but the element
+       * offset doesn't need to be aligned to LP_RASTER_BLOCK_SIZE.
+       */
+      /*
+       * buffers don't really have stride but it's probably safer
+       * (for code doing the same calculations for buffers and textures)
+       * to put something reasonable in there.
+       */
+      lpr->row_stride[0] = bytes;
+
+      lpr->size_required = bytes;
+      if (!(templat->flags & PIPE_RESOURCE_FLAG_DONT_OVER_ALLOCATE))
+         lpr->size_required += (LP_RASTER_BLOCK_SIZE - 1) * 4 * sizeof(float);
+
+      if (lpmo->size < lpr->size_required)
+         goto fail;
+      lpr->data = lpmo->data;
+   }
+   lpr->id = id_counter++;
+   lpr->imported_memory = true;
+
+#ifdef DEBUG
+   mtx_lock(&resource_list_mutex);
+   insert_at_tail(&resource_list, lpr);
+   mtx_unlock(&resource_list_mutex);
+#endif
+
+   return &lpr->base;
+
+fail:
+   free(lpr);
+   return NULL;
+}
+
 static void
 llvmpipe_resource_destroy(struct pipe_screen *pscreen,
                           struct pipe_resource *pt)
@@ -341,7 +440,7 @@ llvmpipe_resource_destroy(struct pipe_screen *pscreen,
    struct llvmpipe_screen *screen = llvmpipe_screen(pscreen);
    struct llvmpipe_resource *lpr = llvmpipe_resource(pt);

-   if (!lpr->backable) {
+   if (!lpr->backable && !lpr->user_ptr) {
       if (lpr->dt) {
          /* display target */
          struct sw_winsys *winsys = screen->winsys;
@@ -350,13 +449,14 @@ llvmpipe_resource_destroy(struct pipe_screen *pscreen,
       else if (llvmpipe_resource_is_texture(pt)) {
          /* free linear image data */
          if (lpr->tex_data) {
-            align_free(lpr->tex_data);
+            if (!lpr->imported_memory)
+               align_free(lpr->tex_data);
             lpr->tex_data = NULL;
          }
       }
-      else if (!lpr->userBuffer) {
-         if (lpr->data)
-            align_free(lpr->data);
+      else if (lpr->data) {
+         if (!lpr->imported_memory)
+            align_free(lpr->data);
       }
    }
 #ifdef DEBUG
@@ -549,9 +649,18 @@ llvmpipe_resource_from_user_memory(struct pipe_screen *_screen,
    pipe_reference_init(&lpr->base.reference, 1);
    lpr->base.screen = _screen;

-   lpr->data = user_memory;
-   lpr->userBuffer = TRUE;
+   if (llvmpipe_resource_is_texture(&lpr->base)) {
+      if (!llvmpipe_texture_layout(screen, lpr, false))
+         goto fail;
+
+      lpr->tex_data = user_memory;
+   } else
+      lpr->data = user_memory;
+   lpr->user_ptr = true;
    return &lpr->base;
+fail:
+   FREE(lpr);
+   return NULL;
 }

 void *
@@ -773,7 +882,7 @@ llvmpipe_user_buffer_create(struct pipe_screen *screen,
    buffer->base.height0 = 1;
    buffer->base.depth0 = 1;
    buffer->base.array_size = 1;
-   buffer->userBuffer = TRUE;
+   buffer->user_ptr = true;
    buffer->data = ptr;

    return &buffer->base;
@@ -850,6 +959,28 @@ static void llvmpipe_free_memory(struct pipe_screen *screen,
    os_free_aligned(pmem);
 }

+#ifdef PIPE_MEMORY_FD
+
+static const char *driver_id = "llvmpipe" MESA_GIT_SHA1;
+
+static struct pipe_memory_allocation *llvmpipe_allocate_memory_fd(struct pipe_screen *screen, uint64_t size, int *fd)
+{
+   return os_malloc_aligned_fd(size, 256, fd, "llvmpipe memory fd", driver_id);
+}
+
+static bool llvmpipe_import_memory_fd(struct pipe_screen *screen, int fd, struct pipe_memory_allocation **ptr, uint64_t *size)
+{
+   return os_import_memory_fd(fd, (void**)ptr, size, driver_id);
+}
+
+static void llvmpipe_free_memory_fd(struct pipe_screen *screen,
+                                    struct pipe_memory_allocation *pmem)
+{
+   os_free_fd(pmem);
+}
+
+#endif
+
 static bool
 llvmpipe_resource_bind_backing(struct pipe_screen *screen,
                                struct pipe_resource *pt,
                                struct pipe_memory_allocation *pmem,
@@ -995,16 +1126,25 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
 /*   screen->resource_create_front = llvmpipe_resource_create_front; */
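The PIPE_MEMORY_FD trio above (allocate_memory_fd/import_memory_fd/free_memory_fd) is what lets llvmpipe memory be exported to and imported from other processes. The os_* helpers are implemented elsewhere in the tree and not shown in this patch, so the following is only a sketch of our own — an assumption about how an FD-backed allocator can work on Linux, not the actual implementation:

    /* Hypothetical stand-ins for the os_*_fd helpers used above. */
    #define _GNU_SOURCE
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <sys/stat.h>

    static void *
    sketch_alloc_fd(uint64_t size, int *out_fd)
    {
       int fd = memfd_create("llvmpipe-memory-sketch", 0); /* Linux >= 3.17 */
       if (fd < 0)
          return NULL;
       if (ftruncate(fd, size) < 0) {
          close(fd);
          return NULL;
       }
       /* MAP_SHARED, so an importer of the FD sees the same pages. */
       void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
       if (ptr == MAP_FAILED) {
          close(fd);
          return NULL;
       }
       *out_fd = fd; /* dup() and pass over a socket to export */
       return ptr;
    }

    static void *
    sketch_import_fd(int fd, uint64_t *out_size)
    {
       struct stat st;
       if (fstat(fd, &st) < 0)
          return NULL;
       void *ptr = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, 0);
       if (ptr == MAP_FAILED)
          return NULL;
       *out_size = (uint64_t)st.st_size;
       return ptr;
    }

Whatever the real helpers do internally, the resource side stays simple: llvmpipe_resource_from_memobj above only has to verify that the imported region covers size_required before pointing tex_data (or data) at it.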
screen->resource_destroy = llvmpipe_resource_destroy; screen->resource_from_handle = llvmpipe_resource_from_handle; + screen->resource_from_memobj = llvmpipe_resource_from_memobj; screen->resource_get_handle = llvmpipe_resource_get_handle; screen->can_create_resource = llvmpipe_can_create_resource; screen->resource_create_unbacked = llvmpipe_resource_create_unbacked; + screen->memobj_create_from_handle = llvmpipe_memobj_create_from_handle; + screen->memobj_destroy = llvmpipe_memobj_destroy; + screen->resource_get_info = llvmpipe_get_resource_info; screen->resource_get_param = llvmpipe_resource_get_param; screen->resource_from_user_memory = llvmpipe_resource_from_user_memory; screen->allocate_memory = llvmpipe_allocate_memory; screen->free_memory = llvmpipe_free_memory; +#ifdef PIPE_MEMORY_FD + screen->allocate_memory_fd = llvmpipe_allocate_memory_fd; + screen->import_memory_fd = llvmpipe_import_memory_fd; + screen->free_memory_fd = llvmpipe_free_memory_fd; +#endif screen->map_memory = llvmpipe_map_memory; screen->unmap_memory = llvmpipe_unmap_memory; diff --git a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.h b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.h index d1ba22b268..c683cdd1e0 100644 --- a/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.h +++ b/mesa 3D driver/src/gallium/drivers/llvmpipe/lp_texture.h @@ -89,7 +89,7 @@ struct llvmpipe_resource */ void *data; - boolean userBuffer; /** Is this a user-space buffer? */ + bool user_ptr; /** Is this a user-space buffer? */ unsigned timestamp; unsigned id; /**< temporary, for debugging */ @@ -99,6 +99,7 @@ struct llvmpipe_resource uint64_t size_required; uint64_t backing_offset; bool backable; + bool imported_memory; #ifdef DEBUG /** for linked list */ struct llvmpipe_resource *prev, *next; @@ -111,6 +112,13 @@ struct llvmpipe_transfer struct pipe_transfer base; }; +struct llvmpipe_memory_object +{ + struct pipe_memory_object b; + struct pipe_memory_allocation *data; + uint64_t size; +}; + /** cast wrappers */ static inline struct llvmpipe_resource * @@ -133,6 +141,12 @@ llvmpipe_transfer(struct pipe_transfer *pt) return (struct llvmpipe_transfer *) pt; } +static inline struct llvmpipe_memory_object * +llvmpipe_memory_object(struct pipe_memory_object *pt) +{ + return (struct llvmpipe_memory_object *) pt; +} + void llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen); void llvmpipe_init_context_resource_funcs(struct pipe_context *pipe); diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index feafbd147b..20ed5cd525 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -340,7 +340,7 @@ BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit) int base2 = (baseSize2[mask] >> 8) & 0xf; int size2 = (baseSize2[mask] >> 12) & 0xf; Instruction *insn = mkOp(OP_NOP, TYPE_NONE, NULL); - if (1) { // size1 can't be 0 + if (true) { // size1 can't be 0 LValue *reg = new_LValue(func, f); reg->reg.size = size1 << unit; reg->reg.data.id = base + base1; diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 83aae296fb..8853b7410c 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -244,8 
+244,8 @@ extern void nv50_ir_get_target_library(uint32_t chipset, #ifdef __cplusplus namespace nv50_ir { - class FixupEntry; - class FixupData; + struct FixupEntry; + struct FixupData; void gk110_interpApply(const nv50_ir::FixupEntry *entry, uint32_t *code, diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index 56bc9fc3ce..545477e0cd 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -567,9 +567,9 @@ CodeEmitterGM107::emitCAL() const FlowInstruction *insn = this->insn->asFlow(); if (insn->absolute) { - emitInsn(0xe2200000, 0); // JCAL + emitInsn(0xe2200000, false); // JCAL } else { - emitInsn(0xe2600000, 0); // CAL + emitInsn(0xe2600000, false); // CAL } if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) { @@ -595,7 +595,7 @@ CodeEmitterGM107::emitPCNT() { const FlowInstruction *insn = this->insn->asFlow(); - emitInsn(0xe2b00000, 0); + emitInsn(0xe2b00000, false); if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) { emitField(0x14, 24, insn->target.bb->binPos - (codeSize + 8)); @@ -617,7 +617,7 @@ CodeEmitterGM107::emitPBK() { const FlowInstruction *insn = this->insn->asFlow(); - emitInsn(0xe2a00000, 0); + emitInsn(0xe2a00000, false); if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) { emitField(0x14, 24, insn->target.bb->binPos - (codeSize + 8)); @@ -639,7 +639,7 @@ CodeEmitterGM107::emitPRET() { const FlowInstruction *insn = this->insn->asFlow(); - emitInsn(0xe2700000, 0); + emitInsn(0xe2700000, false); if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) { emitField(0x14, 24, insn->target.bb->binPos - (codeSize + 8)); @@ -661,7 +661,7 @@ CodeEmitterGM107::emitSSY() { const FlowInstruction *insn = this->insn->asFlow(); - emitInsn(0xe2900000, 0); + emitInsn(0xe2900000, false); if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) { emitField(0x14, 24, insn->target.bb->binPos - (codeSize + 8)); @@ -681,13 +681,13 @@ CodeEmitterGM107::emitSYNC() void CodeEmitterGM107::emitSAM() { - emitInsn(0xe3700000, 0); + emitInsn(0xe3700000, false); } void CodeEmitterGM107::emitRAM() { - emitInsn(0xe3800000, 0); + emitInsn(0xe3800000, false); } /******************************************************************************* @@ -3880,7 +3880,7 @@ void SchedDataCalculatorGM107::setReuseFlag(Instruction *insn) { Instruction *next = insn->next; - BitSet defs(255, 1); + BitSet defs(255, true); if (!targ->isReuseSupported(insn)) return; @@ -4040,7 +4040,7 @@ SchedDataCalculatorGM107::setDelay(Instruction *insn, int delay, bool SchedDataCalculatorGM107::needRdDepBar(const Instruction *insn) const { - BitSet srcs(255, 1), defs(255, 1); + BitSet srcs(255, true), defs(255, true); int a, b; if (!targ->isBarrierRequired(insn)) @@ -4202,7 +4202,7 @@ SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb) std::list live_uses; std::list live_defs; Instruction *insn, *next; - BitSet bars(6, 1); + BitSet bars(6, true); int bar_id; for (insn = bb->getEntry(); insn != NULL; insn = next) { @@ -4278,7 +4278,7 @@ SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb) } // Remove unnecessary barrier waits. 
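For readers outside nouveau: needRdDepBar/insertBarriers in this pass manage the small pool of hardware dependency barriers through which variable-latency instructions synchronize; the BitSet objects (bars(6, ...), alive_bars(6, ...)) are its busy/free bookkeeping, and the 1-to-true changes here only fix the literal's type. As an illustration of ours (not code from the pass), the allocation scheme reduces to a six-slot bitmask:

    #include <stdint.h>

    #define NUM_BARRIERS 6 /* matches BitSet bars(6, true) above */

    /* bit i set => barrier i currently guards an in-flight instruction */
    static int
    barrier_alloc(uint8_t *pool)
    {
       for (int i = 0; i < NUM_BARRIERS; i++) {
          if (!(*pool & (1u << i))) {
             *pool |= (1u << i);
             return i; /* recorded in the producing instruction's sched data */
          }
       }
       return -1; /* pool exhausted: the scheduler must fall back to waiting */
    }

    static void
    barrier_release(uint8_t *pool, int id)
    {
       *pool &= ~(1u << id); /* a consumer has waited on the barrier */
    }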
- BitSet alive_bars(6, 1); + BitSet alive_bars(6, true); for (insn = bb->getEntry(); insn != NULL; insn = next) { int wr, rd, wt; diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index d6c8bf74cd..1a0c63b706 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -270,7 +270,7 @@ CodeEmitterNV50::emitFlagsWr(const Instruction *i) for (int d = 0; i->defExists(d); ++d) if (i->def(d).getFile() == FILE_FLAGS) flagsDef = d; - if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point + if (flagsDef >= 0 && false) // TODO: enforce use of flagsDef at some point WARN("Instruction::flagsDef was not set properly\n"); } if (flagsDef == 0 && i->defExists(1)) diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 825d7f0ba6..c55aae3080 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -2962,7 +2962,7 @@ CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (writeIssueDelays || info.minEncSize == 8 || 1) + if (writeIssueDelays || info.minEncSize == 8 || true) return 8; if (i->ftz || i->saturate || i->join) @@ -2973,7 +2973,7 @@ CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const return 8; if (i->op == OP_PINTERP) { - if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work + if (i->getSampleMode() || true) // XXX: grr, short op doesn't work return 8; } else if (i->op == OP_MOV && i->lanes != 0xf) { diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp index 37ad8a1a53..ff835fdb77 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp @@ -1706,7 +1706,7 @@ Converter::visit(nir_intrinsic_instr *insn) } TexInstruction *texi = mkTex(OP_TXF, TEX_TARGET_2D_MS_ARRAY, 0, 0, defs, srcs); - texi->tex.levelZero = 1; + texi->tex.levelZero = true; texi->tex.mask = mask; texi->tex.useOffsets = 0; texi->tex.r = 0xffff; @@ -3286,7 +3286,8 @@ nvir_nir_shader_compiler_options(int chipset) op.lower_device_index_to_zero = false; // TODO op.lower_wpos_pntc = false; // TODO op.lower_hadd = true; // TODO - op.lower_add_sat = true; // TODO + op.lower_uadd_sat = true; // TODO + op.lower_iadd_sat = true; // TODO op.vectorize_io = false; op.lower_to_scalar = false; op.unify_interfaces = false; diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 3c0c9e79f1..20b9ca8119 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -2458,7 +2458,7 @@ Converter::handleFBFETCH(Value *dst[4]) unsigned int c, d; texi->tex.target = TEX_TARGET_2D_MS_ARRAY; - texi->tex.levelZero = 1; + texi->tex.levelZero = true; texi->tex.useOffsets = 0; for (c = 0, d = 0; c < 4; ++c) { diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/mesa 3D 
driver/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index f942428905..42b2d0cd97 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1737,7 +1737,7 @@ ModifierFolding::visit(BasicBlock *bb) for (i = bb->getEntry(); i; i = next) { next = i->next; - if (0 && i->op == OP_SUB) { + if (false && i->op == OP_SUB) { // turn "sub" into "add neg" (do we really want this ?) i->op = OP_ADD; i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG); @@ -3911,6 +3911,7 @@ LocalCSE::visit(BasicBlock *bb) class DeadCodeElim : public Pass { public: + DeadCodeElim() : deadCount(0) {} bool buryAll(Program *); private: diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 58544e8bd3..b18a7a86fa 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/mesa 3D driver/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -2653,7 +2653,7 @@ RegAlloc::InsertConstraintsPass::insertConstraintMoves() Instruction *cst = *it; Instruction *mov; - if (cst->op == OP_SPLIT && 0) { + if (cst->op == OP_SPLIT && false) { // spilling splits is annoying, just make sure they're separate for (int d = 0; cst->defExists(d); ++d) { if (!cst->getDef(d)->refCount()) diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_context.h b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_context.h index 95fc68fb8a..7ea5c5b87a 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_context.h +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_context.h @@ -188,11 +188,13 @@ nv40_verttex_sampler_states_bind(struct pipe_context *pipe, void nv40_verttex_set_sampler_views(struct pipe_context *pipe, unsigned nr, + bool take_ownership, struct pipe_sampler_view **views); void nv30_fragtex_set_sampler_views(struct pipe_context *pipe, - unsigned nr, struct pipe_sampler_view **views); + unsigned nr, bool take_ownership, + struct pipe_sampler_view **views); void nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info, diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_draw.c index a41778827b..c1392cf776 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_draw.c +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_draw.c @@ -445,7 +445,7 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, draw_set_indexes(draw, NULL, 0, 0); } - draw_vbo(draw, info, drawid_offset, NULL, draw_one, 1); + draw_vbo(draw, info, drawid_offset, NULL, draw_one, 1, 0); draw_flush(draw); if (info->index_size && transferi) diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_fragtex.c b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_fragtex.c index bfd3cf2b48..1a06eac474 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_fragtex.c +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv30_fragtex.c @@ -173,6 +173,7 @@ nv30_fragtex_sampler_states_bind(struct pipe_context *pipe, void nv30_fragtex_set_sampler_views(struct pipe_context *pipe, unsigned nr, + bool take_ownership, struct pipe_sampler_view **views) { struct nv30_context *nv30 = nv30_context(pipe); @@ -180,7 +181,12 @@ nv30_fragtex_set_sampler_views(struct pipe_context *pipe, unsigned nr, for (i = 0; i < nr; i++) { nouveau_bufctx_reset(nv30->bufctx, BUFCTX_FRAGTEX(i)); 
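The take_ownership parameter threaded through every set_sampler_views implementation (nv30 here, then nv40/nv50/nvc0 and panfrost below) is a refcounting fast path: when the frontend hands over its own reference to the view, the driver may adopt it instead of atomically acquiring a fresh one. The same two branches recur in each driver; as a sketch (the wrapper is ours, pipe_sampler_view_reference is the real gallium helper):

    #include <stdbool.h>
    #include "util/u_inlines.h" /* pipe_sampler_view_reference() */

    static inline void
    bind_sampler_view(struct pipe_sampler_view **slot,
                      struct pipe_sampler_view *view,
                      bool take_ownership)
    {
       if (take_ownership) {
          /* Drop the old binding, then adopt the caller's reference to
           * 'view' without touching its refcount. */
          pipe_sampler_view_reference(slot, NULL);
          *slot = view;
       } else {
          /* Classic path: one atomic increment on 'view' plus the
           * decrement on whatever was bound before. */
          pipe_sampler_view_reference(slot, view);
       }
    }

Note how nvc0's version below also handles the early-continue case: if the incoming view is already bound and ownership was transferred, the now-surplus reference still has to be dropped, hence its pipe_sampler_view_reference(&view, NULL) before the continue.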
- pipe_sampler_view_reference(&nv30->fragprog.textures[i], views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&nv30->fragprog.textures[i], NULL); + nv30->fragprog.textures[i] = views[i]; + } else { + pipe_sampler_view_reference(&nv30->fragprog.textures[i], views[i]); + } nv30->fragprog.dirty_samplers |= (1 << i); } @@ -199,15 +205,16 @@ static void nv30_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { assert(start == 0); switch (shader) { case PIPE_SHADER_FRAGMENT: - nv30_fragtex_set_sampler_views(pipe, nr, views); + nv30_fragtex_set_sampler_views(pipe, nr, take_ownership, views); break; case PIPE_SHADER_VERTEX: - nv40_verttex_set_sampler_views(pipe, nr, views); + nv40_verttex_set_sampler_views(pipe, nr, take_ownership, views); break; default: ; diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv40_verttex.c b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv40_verttex.c index 48b8c5a5ff..578695a22a 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv40_verttex.c +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nv30/nv40_verttex.c @@ -72,6 +72,7 @@ nv40_verttex_sampler_states_bind(struct pipe_context *pipe, void nv40_verttex_set_sampler_views(struct pipe_context *pipe, unsigned nr, + bool take_ownership, struct pipe_sampler_view **views) { struct nv30_context *nv30 = nv30_context(pipe); @@ -79,7 +80,12 @@ nv40_verttex_set_sampler_views(struct pipe_context *pipe, unsigned nr, for (i = 0; i < nr; i++) { nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VERTTEX(i)); - pipe_sampler_view_reference(&nv30->vertprog.textures[i], views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&nv30->vertprog.textures[i], NULL); + nv30->vertprog.textures[i] = views[i]; + } else { + pipe_sampler_view_reference(&nv30->vertprog.textures[i], views[i]); + } nv30->vertprog.dirty_samplers |= (1 << i); } diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nv50/nv50_state.c b/mesa 3D driver/src/gallium/drivers/nouveau/nv50/nv50_state.c index 0eafe86d38..3ad9e41fdb 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -661,7 +661,7 @@ nv50_sampler_view_destroy(struct pipe_context *pipe, static inline void nv50_stage_set_sampler_views(struct nv50_context *nv50, int s, - unsigned nr, + unsigned nr, bool take_ownership, struct pipe_sampler_view **views) { unsigned i; @@ -684,7 +684,12 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s, nv50->textures_coherent[s] &= ~(1 << i); } - pipe_sampler_view_reference(&nv50->textures[s][i], view); + if (take_ownership) { + pipe_sampler_view_reference(&nv50->textures[s][i], NULL); + nv50->textures[s][i] = view; + } else { + pipe_sampler_view_reference(&nv50->textures[s][i], view); + } } assert(nv50->num_textures[s] <= PIPE_MAX_SAMPLERS); @@ -704,13 +709,14 @@ static void nv50_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct nv50_context *nv50 = nv50_context(pipe); unsigned s = nv50_context_shader_stage(shader); assert(start == 0); - nv50_stage_set_sampler_views(nv50, s, nr, views); + nv50_stage_set_sampler_views(nv50, s, nr, take_ownership, views); if (unlikely(s == NV50_SHADER_STAGE_COMPUTE)) { nouveau_bufctx_reset(nv50->bufctx_cp, 
NV50_BIND_CP_TEXTURES); diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index c6871dc544..0ad06db798 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -254,6 +254,7 @@ struct nvc0_context { float default_tess_outer[4]; float default_tess_inner[2]; + uint8_t patch_vertices; bool vbo_push_hint; diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 9676016c5b..13c4fc6a27 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -516,7 +516,7 @@ nvc0_sampler_view_destroy(struct pipe_context *pipe, static inline void nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, - unsigned nr, + unsigned nr, bool take_ownership, struct pipe_sampler_view **views) { unsigned i; @@ -525,8 +525,11 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, struct pipe_sampler_view *view = views ? views[i] : NULL; struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]); - if (view == nvc0->textures[s][i]) + if (view == nvc0->textures[s][i]) { + if (take_ownership) + pipe_sampler_view_reference(&view, NULL); continue; + } nvc0->textures_dirty[s] |= 1 << i; if (view && view->texture) { @@ -548,7 +551,12 @@ nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s, nvc0_screen_tic_unlock(nvc0->screen, old); } - pipe_sampler_view_reference(&nvc0->textures[s][i], view); + if (take_ownership) { + pipe_sampler_view_reference(&nvc0->textures[s][i], NULL); + nvc0->textures[s][i] = view; + } else { + pipe_sampler_view_reference(&nvc0->textures[s][i], view); + } } for (i = nr; i < nvc0->num_textures[s]; ++i) { @@ -570,12 +578,13 @@ static void nvc0_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { const unsigned s = nvc0_shader_stage(shader); assert(start == 0); - nvc0_stage_set_sampler_views(nvc0_context(pipe), s, nr, views); + nvc0_stage_set_sampler_views(nvc0_context(pipe), s, nr, take_ownership, views); if (s == 5) nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_TEXTURES; @@ -994,6 +1003,14 @@ nvc0_set_tess_state(struct pipe_context *pipe, nvc0->dirty_3d |= NVC0_NEW_3D_TESSFACTOR; } +static void +nvc0_set_patch_vertices(struct pipe_context *pipe, uint8_t patch_vertices) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + + nvc0->patch_vertices = patch_vertices; +} + static void nvc0_set_vertex_buffers(struct pipe_context *pipe, unsigned start_slot, unsigned count, @@ -1490,6 +1507,7 @@ nvc0_init_state_functions(struct nvc0_context *nvc0) pipe->set_viewport_states = nvc0_set_viewport_states; pipe->set_window_rectangles = nvc0_set_window_rectangles; pipe->set_tess_state = nvc0_set_tess_state; + pipe->set_patch_vertices = nvc0_set_patch_vertices; pipe->create_vertex_elements_state = nvc0_vertex_state_create; pipe->delete_vertex_elements_state = nvc0_vertex_state_delete; diff --git a/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 29bc9b28fa..ca6bfda6c1 100644 --- a/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/mesa 3D driver/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -977,8 +977,8 @@ 
nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, } if (info->mode == PIPE_PRIM_PATCHES && - nvc0->state.patch_vertices != info->vertices_per_patch) { - nvc0->state.patch_vertices = info->vertices_per_patch; + nvc0->state.patch_vertices != nvc0->patch_vertices) { + nvc0->state.patch_vertices = nvc0->patch_vertices; PUSH_SPACE(push, 1); IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices); } diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/meson.build b/mesa 3D driver/src/gallium/drivers/panfrost/meson.build index 221bf23696..171a20c8b2 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/meson.build +++ b/mesa 3D driver/src/gallium/drivers/panfrost/meson.build @@ -56,11 +56,11 @@ libpanfrost_versions = [] foreach ver : panfrost_versions libpanfrost_versions += static_library( 'panfrost-v' + ver, - ['pan_cmdstream.c', midgard_pack], + ['pan_cmdstream.c', pan_packers], include_directories : panfrost_includes, c_args : ['-DPAN_ARCH=' + ver], gnu_symbol_visibility : 'hidden', - dependencies : [idep_midgard_pack, idep_nir, dep_libdrm], + dependencies : [idep_pan_packers, idep_nir, dep_libdrm], ) endforeach @@ -72,7 +72,7 @@ libpanfrost = static_library( dep_libdrm, idep_mesautil, idep_nir, - idep_midgard_pack + idep_pan_packers ], include_directories : panfrost_includes, c_args : [c_msvc_compat_args, compile_args_panfrost], diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_assemble.c b/mesa 3D driver/src/gallium/drivers/panfrost/pan_assemble.c index 1638c9aef0..7dce4dde98 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_assemble.c +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_assemble.c @@ -77,7 +77,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen, struct util_dynarray binary; util_dynarray_init(&binary, NULL); - pan_shader_compile(dev, s, &inputs, &binary, &state->info); + screen->vtbl.compile_shader(s, &inputs, &binary, &state->info); if (binary.size) { state->bin = panfrost_pool_take_ref(shader_pool, @@ -89,7 +89,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen, /* Don't upload RSD for fragment shaders since they need draw-time * merging for e.g. depth/stencil/alpha */ bool upload = stage != MESA_SHADER_FRAGMENT; - screen->vtbl.prepare_rsd(dev, state, desc_pool, upload); + screen->vtbl.prepare_rsd(state, desc_pool, upload); panfrost_analyze_sysvals(state); diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_cmdstream.c b/mesa 3D driver/src/gallium/drivers/panfrost/pan_cmdstream.c index 5857556775..ae5e116b7a 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_cmdstream.c +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_cmdstream.c @@ -31,14 +31,14 @@ #include "util/u_memory.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" -#include "indices/u_primconvert.h" #include "gallium/auxiliary/util/u_blend.h" #include "panfrost-quirks.h" -#include "gen_macros.h" +#include "genxml/gen_macros.h" #include "pan_pool.h" #include "pan_bo.h" +#include "pan_blend.h" #include "pan_context.h" #include "pan_job.h" #include "pan_shader.h" @@ -48,7 +48,43 @@ #include "pan_indirect_dispatch.h" #include "pan_blitter.h" -#include "midgard_pack.h" +struct panfrost_rasterizer { + struct pipe_rasterizer_state base; + + /* Partially packed RSD words */ + struct mali_multisample_misc_packed multisample; + struct mali_stencil_mask_misc_packed stencil_misc; +}; + +struct panfrost_zsa_state { + struct pipe_depth_stencil_alpha_state base; + + /* Is any depth, stencil, or alpha testing enabled? 
*/ + bool enabled; + + /* Mask of PIPE_CLEAR_{DEPTH,STENCIL} written */ + unsigned draws; + + /* Prepacked words from the RSD */ + struct mali_multisample_misc_packed rsd_depth; + struct mali_stencil_mask_misc_packed rsd_stencil; + struct mali_stencil_packed stencil_front, stencil_back; +}; + +struct panfrost_sampler_state { + struct pipe_sampler_state base; + struct mali_sampler_packed hw; +}; + +/* Misnomer: Sampler view corresponds to textures, not samplers */ + +struct panfrost_sampler_view { + struct pipe_sampler_view base; + struct panfrost_pool_ref state; + struct mali_texture_packed bifrost_descriptor; + mali_ptr texture_bo; + uint64_t modifier; +}; /* Statically assert that PIPE_* enums match the hardware enums. * (As long as they match, we don't need to translate them.) @@ -87,21 +123,26 @@ translate_tex_wrap(enum pipe_tex_wrap w, bool using_nearest) /* Bifrost doesn't support the GL_CLAMP wrap mode, so instead use * CLAMP_TO_EDGE and CLAMP_TO_BORDER. On Midgard, CLAMP is broken for * nearest filtering, so use CLAMP_TO_EDGE in that case. */ - bool supports_clamp = (PAN_ARCH <= 5); switch (w) { case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT; case PIPE_TEX_WRAP_CLAMP: return using_nearest ? MALI_WRAP_MODE_CLAMP_TO_EDGE : - (supports_clamp ? MALI_WRAP_MODE_CLAMP : - MALI_WRAP_MODE_CLAMP_TO_BORDER); +#if PAN_ARCH <= 5 + MALI_WRAP_MODE_CLAMP; +#else + MALI_WRAP_MODE_CLAMP_TO_BORDER; +#endif case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER; case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT; case PIPE_TEX_WRAP_MIRROR_CLAMP: return using_nearest ? MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE : - (supports_clamp ? MALI_WRAP_MODE_MIRRORED_CLAMP : - MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER); +#if PAN_ARCH <= 5 + MALI_WRAP_MODE_MIRRORED_CLAMP; +#else + MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER; +#endif case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER; default: unreachable("Invalid wrap"); @@ -144,11 +185,7 @@ panfrost_create_sampler_state( bool using_nearest = cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST; -#if PAN_ARCH <= 5 - pan_pack(&so->hw, MIDGARD_SAMPLER, cfg) { -#else - pan_pack(&so->hw, BIFROST_SAMPLER, cfg) { -#endif + pan_pack(&so->hw, SAMPLER, cfg) { cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST; cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST; @@ -240,14 +277,14 @@ panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_sha /* Always have at least one render target for depth-only passes */ for (unsigned i = 0; i < MAX2(rt_count, 1); ++i) { - struct mali_blend_packed *packed = rts + (i * MALI_BLEND_LENGTH); + struct mali_blend_packed *packed = rts + (i * pan_size(BLEND)); /* Disable blending for unbacked render targets */ if (rt_count == 0 || !batch->key.cbufs[i] || so->info[i].no_colour) { - pan_pack(rts + i * MALI_BLEND_LENGTH, BLEND, cfg) { + pan_pack(rts + i * pan_size(BLEND), BLEND, cfg) { cfg.enable = false; #if PAN_ARCH >= 6 - cfg.bifrost.internal.mode = MALI_BIFROST_BLEND_MODE_OFF; + cfg.internal.mode = MALI_BLEND_MODE_OFF; #endif } @@ -266,20 +303,20 @@ panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_sha cfg.round_to_fb_precision = !dithered; cfg.alpha_to_one = ctx->blend->base.alpha_to_one; #if PAN_ARCH >= 6 - cfg.bifrost.constant = 
pack_blend_constant(format, cons); + cfg.constant = pack_blend_constant(format, cons); #else - cfg.midgard.blend_shader = (blend_shaders[i] != 0); + cfg.blend_shader = (blend_shaders[i] != 0); if (blend_shaders[i]) - cfg.midgard.shader_pc = blend_shaders[i]; + cfg.shader_pc = blend_shaders[i]; else - cfg.midgard.constant = cons; + cfg.constant = cons; #endif } if (!blend_shaders[i]) { /* Word 1: Blend Equation */ - STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4); + STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); packed->opaque[PAN_ARCH >= 6 ? 1 : 2] = so->equation[i]; } @@ -301,17 +338,17 @@ panfrost_emit_blend(struct panfrost_batch *batch, void *rts, mali_ptr *blend_sha unsigned ret_offset = fs->info.bifrost.blend[i].return_offset; assert(!(ret_offset & 0x7)); - pan_pack(&packed->opaque[2], BIFROST_INTERNAL_BLEND, cfg) { - cfg.mode = MALI_BIFROST_BLEND_MODE_SHADER; + pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { + cfg.mode = MALI_BLEND_MODE_SHADER; cfg.shader.pc = (u32) blend_shaders[i]; cfg.shader.return_value = ret_offset ? fs->bin.gpu + ret_offset : 0; } } else { - pan_pack(&packed->opaque[2], BIFROST_INTERNAL_BLEND, cfg) { + pan_pack(&packed->opaque[2], INTERNAL_BLEND, cfg) { cfg.mode = info.opaque ? - MALI_BIFROST_BLEND_MODE_OPAQUE : - MALI_BIFROST_BLEND_MODE_FIXED_FUNCTION; + MALI_BLEND_MODE_OPAQUE : + MALI_BLEND_MODE_FIXED_FUNCTION; /* If we want the conversion to work properly, * num_comps must be set to 4 @@ -346,36 +383,21 @@ pan_merge_empty_fs(struct mali_renderer_state_packed *rsd) pan_pack(&empty_rsd, RENDERER_STATE, cfg) { #if PAN_ARCH >= 6 - cfg.properties.bifrost.shader_modifies_coverage = true; - cfg.properties.bifrost.allow_forward_pixel_to_kill = true; - cfg.properties.bifrost.allow_forward_pixel_to_be_killed = true; - cfg.properties.bifrost.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; + cfg.properties.shader_modifies_coverage = true; + cfg.properties.allow_forward_pixel_to_kill = true; + cfg.properties.allow_forward_pixel_to_be_killed = true; + cfg.properties.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY; #else cfg.shader.shader = 0x1; - cfg.properties.midgard.work_register_count = 1; + cfg.properties.work_register_count = 1; cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION; - cfg.properties.midgard.force_early_z = true; + cfg.properties.force_early_z = true; #endif } pan_merge((*rsd), empty_rsd, RENDERER_STATE); } -#if PAN_ARCH == 5 -/* Get the last blend shader, for an erratum workaround */ - -static mali_ptr -panfrost_last_nonnull(mali_ptr *ptrs, unsigned count) -{ - for (signed i = ((signed) count - 1); i >= 0; --i) { - if (ptrs[i]) - return ptrs[i]; - } - - return 0; -} -#endif - static void panfrost_prepare_fs_state(struct panfrost_context *ctx, mali_ptr *blend_shaders, @@ -388,7 +410,13 @@ panfrost_prepare_fs_state(struct panfrost_context *ctx, bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage; bool msaa = rast->multisample; - UNUSED unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs; + unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs; + + bool has_blend_shader = false; + + for (unsigned c = 0; c < rt_count; ++c) + has_blend_shader |= (blend_shaders[c] != 0); + pan_pack(rsd, RENDERER_STATE, cfg) { if (panfrost_fs_required(fs, so, &ctx->pipe_framebuffer, zsa)) { #if PAN_ARCH >= 6 @@ -398,69 +426,64 @@ panfrost_prepare_fs_state(struct panfrost_context *ctx, uint64_t rt_written = (fs->info.outputs_written >> FRAG_RESULT_DATA0); bool blend_reads_dest = (so->load_dest_mask & rt_mask); - 
cfg.properties.bifrost.allow_forward_pixel_to_kill = + cfg.properties.allow_forward_pixel_to_kill = fs->info.fs.can_fpk && !(rt_mask & ~rt_written) && !alpha_to_coverage && !blend_reads_dest; #else - cfg.properties.midgard.force_early_z = + cfg.properties.force_early_z = fs->info.fs.can_early_z && !alpha_to_coverage && ((enum mali_func) zsa->base.alpha_func == MALI_FUNC_ALWAYS); - bool has_blend_shader = false; - - for (unsigned c = 0; c < rt_count; ++c) - has_blend_shader |= (blend_shaders[c] != 0); - /* TODO: Reduce this limit? */ if (has_blend_shader) - cfg.properties.midgard.work_register_count = MAX2(fs->info.work_reg_count, 8); + cfg.properties.work_register_count = MAX2(fs->info.work_reg_count, 8); else - cfg.properties.midgard.work_register_count = fs->info.work_reg_count; + cfg.properties.work_register_count = fs->info.work_reg_count; /* Hardware quirks around early-zs forcing without a * depth buffer. Note this breaks occlusion queries. */ bool has_oq = ctx->occlusion_query && ctx->active_queries; bool force_ez_with_discard = !zsa->enabled && !has_oq; - cfg.properties.midgard.shader_reads_tilebuffer = + cfg.properties.shader_reads_tilebuffer = force_ez_with_discard && fs->info.fs.can_discard; - cfg.properties.midgard.shader_contains_discard = + cfg.properties.shader_contains_discard = !force_ez_with_discard && fs->info.fs.can_discard; #endif } #if PAN_ARCH == 4 if (rt_count > 0) { - cfg.multisample_misc.sfbd_load_destination = so->info[0].load_dest; - cfg.multisample_misc.sfbd_blend_shader = (blend_shaders[0] != 0); - cfg.stencil_mask_misc.sfbd_write_enable = !so->info[0].no_colour; - cfg.stencil_mask_misc.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format); - cfg.stencil_mask_misc.sfbd_dither_disable = !so->base.dither; - cfg.stencil_mask_misc.sfbd_alpha_to_one = so->base.alpha_to_one; + cfg.multisample_misc.load_destination = so->info[0].load_dest; + cfg.multisample_misc.blend_shader = (blend_shaders[0] != 0); + cfg.stencil_mask_misc.write_enable = !so->info[0].no_colour; + cfg.stencil_mask_misc.srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format); + cfg.stencil_mask_misc.dither_disable = !so->base.dither; + cfg.stencil_mask_misc.alpha_to_one = so->base.alpha_to_one; if (blend_shaders[0]) { - cfg.sfbd_blend_shader = blend_shaders[0]; + cfg.blend_shader = blend_shaders[0]; } else { - cfg.sfbd_blend_constant = pan_blend_get_constant( + cfg.blend_constant = pan_blend_get_constant( so->info[0].constant_mask, ctx->blend_color.color); } } else { /* If there is no colour buffer, leaving fields default is * fine, except for blending which is nonnullable */ - cfg.sfbd_blend_equation.color_mask = 0xf; - cfg.sfbd_blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; - cfg.sfbd_blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; - cfg.sfbd_blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; - cfg.sfbd_blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; - cfg.sfbd_blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; - cfg.sfbd_blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.blend_equation.color_mask = 0xf; + cfg.blend_equation.rgb.a = MALI_BLEND_OPERAND_A_SRC; + cfg.blend_equation.rgb.b = MALI_BLEND_OPERAND_B_SRC; + cfg.blend_equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO; + cfg.blend_equation.alpha.a = MALI_BLEND_OPERAND_A_SRC; + cfg.blend_equation.alpha.b = MALI_BLEND_OPERAND_B_SRC; + cfg.blend_equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO; } #elif PAN_ARCH == 5 /* Workaround */ - cfg.sfbd_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count); + 
cfg.legacy_blend_shader = panfrost_last_nonnull(blend_shaders, rt_count); #endif cfg.multisample_misc.sample_mask = msaa ? ctx->sample_mask : 0xFFFF; @@ -468,6 +491,18 @@ panfrost_prepare_fs_state(struct panfrost_context *ctx, cfg.multisample_misc.evaluate_per_sample = msaa && (ctx->min_samples > 1); +#if PAN_ARCH >= 6 + /* MSAA blend shaders need to pass their sample ID to + * LD_TILE/ST_TILE, so we must preload it. Additionally, we + * need per-sample shading for the blend shader, accomplished + * by forcing per-sample shading for the whole program. */ + + if (msaa && has_blend_shader) { + cfg.multisample_misc.evaluate_per_sample = true; + cfg.preload.fragment.sample_mask_id = true; + } +#endif + cfg.stencil_mask_misc.alpha_to_coverage = alpha_to_coverage; cfg.depth_units = rast->offset_units * 2.0f; cfg.depth_factor = rast->offset_scale; @@ -502,16 +537,20 @@ panfrost_emit_frag_shader(struct panfrost_context *ctx, #if PAN_ARCH == 4 if (ctx->pipe_framebuffer.nr_cbufs > 0 && !blend_shaders[0]) { /* Word 14: SFBD Blend Equation */ - STATIC_ASSERT(MALI_BLEND_EQUATION_LENGTH == 4); + STATIC_ASSERT(pan_size(BLEND_EQUATION) == 4); rsd.opaque[14] = ctx->blend->equation[0]; } #endif /* Merge with CSO state and upload */ - if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) - pan_merge(rsd, fs->partial_rsd, RENDERER_STATE); - else + if (panfrost_fs_required(fs, ctx->blend, &ctx->pipe_framebuffer, zsa)) { + struct mali_renderer_state_packed *partial_rsd = + (struct mali_renderer_state_packed *)&fs->partial_rsd; + STATIC_ASSERT(sizeof(fs->partial_rsd) == sizeof(*partial_rsd)); + pan_merge(rsd, *partial_rsd, RENDERER_STATE); + } else { pan_merge_empty_fs(&rsd); + } /* Word 8, 9 Misc state */ rsd.opaque[8] |= zsa->rsd_depth.opaque[0] @@ -558,7 +597,7 @@ panfrost_emit_frag_shader_meta(struct panfrost_batch *batch) PAN_DESC_ARRAY(rt_count, BLEND)); #endif - mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS]; + mali_ptr blend_shaders[PIPE_MAX_COLOR_BUFS] = { 0 }; unsigned shader_offset = 0; struct panfrost_bo *shader_bo = NULL; @@ -572,7 +611,7 @@ panfrost_emit_frag_shader_meta(struct panfrost_batch *batch) panfrost_emit_frag_shader(ctx, (struct mali_renderer_state_packed *) xfer.cpu, blend_shaders); #if PAN_ARCH >= 5 - panfrost_emit_blend(batch, xfer.cpu + MALI_RENDERER_STATE_LENGTH, blend_shaders); + panfrost_emit_blend(batch, xfer.cpu + pan_size(RENDERER_STATE), blend_shaders); #else batch->draws |= PIPE_CLEAR_COLOR0; batch->resolve |= PIPE_CLEAR_COLOR0; @@ -885,9 +924,9 @@ panfrost_upload_rt_conversion_sysval(struct panfrost_batch *batch, if (rt < batch->key.nr_cbufs && batch->key.cbufs[rt]) { enum pipe_format format = batch->key.cbufs[rt]->format; uniform->u[0] = - pan_blend_get_bifrost_desc(dev, format, rt, size, false) >> 32; + GENX(pan_blend_get_internal_desc)(dev, format, rt, size, false) >> 32; } else { - pan_pack(&uniform->u[0], BIFROST_INTERNAL_CONVERSION, cfg) + pan_pack(&uniform->u[0], INTERNAL_CONVERSION, cfg) cfg.memory_format = dev->formats[PIPE_FORMAT_NONE].hw; } } @@ -1211,12 +1250,14 @@ panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so, enum pipe_format format = so->base.format; assert(prsrc->image.data.bo); - /* Format to access the stencil portion of a Z32_S8 texture */ + /* Format to access the stencil/depth portion of a Z32_S8 texture */ if (format == PIPE_FORMAT_X32_S8X24_UINT) { assert(prsrc->separate_stencil); texture = &prsrc->separate_stencil->base; prsrc = (struct panfrost_resource *)texture; format = texture->format; + } else if (format == 
PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) { + format = PIPE_FORMAT_Z32_FLOAT; } const struct util_format_description *desc = util_format_description(format); @@ -1279,8 +1320,8 @@ panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so, }; unsigned size = - (PAN_ARCH <= 5 ? MALI_MIDGARD_TEXTURE_LENGTH : 0) + - panfrost_estimate_texture_payload_size(device, &iview); + (PAN_ARCH <= 5 ? pan_size(TEXTURE) : 0) + + GENX(panfrost_estimate_texture_payload_size)(&iview); struct panfrost_ptr payload = pan_pool_alloc_aligned(&ctx->descs.base, size, 64); so->state = panfrost_pool_take_ref(&ctx->descs, payload.gpu); @@ -1288,11 +1329,11 @@ panfrost_create_sampler_view_bo(struct panfrost_sampler_view *so, void *tex = (PAN_ARCH >= 6) ? &so->bifrost_descriptor : payload.cpu; if (PAN_ARCH <= 5) { - payload.cpu += MALI_MIDGARD_TEXTURE_LENGTH; - payload.gpu += MALI_MIDGARD_TEXTURE_LENGTH; + payload.cpu += pan_size(TEXTURE); + payload.gpu += pan_size(TEXTURE); } - panfrost_new_texture(device, &iview, tex, &payload); + GENX(panfrost_new_texture)(device, &iview, tex, &payload); } static void @@ -1320,9 +1361,9 @@ panfrost_emit_texture_descriptors(struct panfrost_batch *batch, struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, ctx->sampler_view_count[stage], - BIFROST_TEXTURE); - struct mali_bifrost_texture_packed *out = - (struct mali_bifrost_texture_packed *) T.cpu; + TEXTURE); + struct mali_texture_packed *out = + (struct mali_texture_packed *) T.cpu; for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) { struct panfrost_sampler_view *view = ctx->sampler_views[stage][i]; @@ -1364,14 +1405,11 @@ panfrost_emit_sampler_descriptors(struct panfrost_batch *batch, if (!ctx->sampler_count[stage]) return 0; - assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH); - assert(MALI_BIFROST_SAMPLER_ALIGN == MALI_MIDGARD_SAMPLER_ALIGN); - struct panfrost_ptr T = pan_pool_alloc_desc_array(&batch->pool.base, ctx->sampler_count[stage], - MIDGARD_SAMPLER); - struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu; + SAMPLER); + struct mali_sampler_packed *out = (struct mali_sampler_packed *) T.cpu; for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i) out[i] = ctx->samplers[stage][i]->hw; @@ -1425,7 +1463,6 @@ emit_image_bufs(struct panfrost_batch *batch, enum pipe_shader_type shader, for (unsigned i = 0; i < last_bit; ++i) { struct pipe_image_view *image = &ctx->images[shader][i]; - /* TODO: understand how v3d/freedreno does it */ if (!(ctx->image_mask[shader] & (1 << i)) || !(image->shader_access & PIPE_IMAGE_ACCESS_READ_WRITE)) { /* Unused image bindings */ @@ -1528,7 +1565,7 @@ panfrost_emit_image_attribs(struct panfrost_batch *batch, /* We need an empty attrib buf to stop the prefetching on Bifrost */ #if PAN_ARCH >= 6 - pan_pack(bufs.cpu + ((buf_count - 1) * MALI_ATTRIBUTE_BUFFER_LENGTH), + pan_pack(bufs.cpu + ((buf_count - 1) * pan_size(ATTRIBUTE_BUFFER)), ATTRIBUTE_BUFFER, cfg); #endif @@ -2068,9 +2105,9 @@ panfrost_emit_varying_descs( unsigned consumer_count = consumer->info.varyings.input_count; /* Offsets within the general varying buffer, indexed by location */ - signed offsets[PIPE_MAX_ATTRIBS]; - assert(producer_count < ARRAY_SIZE(offsets)); - assert(consumer_count < ARRAY_SIZE(offsets)); + signed offsets[PAN_MAX_VARYINGS]; + assert(producer_count <= ARRAY_SIZE(offsets)); + assert(consumer_count <= ARRAY_SIZE(offsets)); /* Allocate enough descriptors for both shader stages */ struct panfrost_ptr T = @@ -2087,7 +2124,7 @@ 
panfrost_emit_varying_descs(
    struct mali_attribute_packed *descs = T.cpu;
    out->producer = producer_count ? T.gpu : 0;
    out->consumer = consumer_count ? T.gpu +
-                   (MALI_ATTRIBUTE_LENGTH * producer_count) : 0;
+                   (pan_size(ATTRIBUTE) * producer_count) : 0;

    /* Lay out the varyings. Must use producer to lay out, in order to
     * respect transform feedback precisions. */
@@ -2137,6 +2174,7 @@ panfrost_emit_varying_descs(
    }
 }

+#if PAN_ARCH <= 5
 static void
 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
                        unsigned present,
@@ -2152,6 +2190,7 @@ pan_emit_special_input(struct mali_attribute_buffer_packed *out,
       }
    }
 }
+#endif

 static void
 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
@@ -2247,12 +2286,14 @@ panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
                  2, vertex_count);
    }

+#if PAN_ARCH <= 5
    pan_emit_special_input(varyings, present,
                           PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
    pan_emit_special_input(varyings, present, PAN_VARY_FACE,
                           MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
    pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD,
                           MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
+#endif

    *buffers = T.gpu;
    *vs_attribs = linkage->producer;
@@ -2309,7 +2350,7 @@ emit_tls(struct panfrost_batch *batch)
    };

    assert(batch->tls.cpu);
-   pan_emit_tls(dev, &tls, batch->tls.cpu);
+   GENX(pan_emit_tls)(&tls, batch->tls.cpu);
 }

 static void
@@ -2331,8 +2372,8 @@ emit_fbd(struct panfrost_batch *batch, const struct pan_fb_info *fb)
    };

    batch->framebuffer.gpu |=
-      pan_emit_fbd(dev, fb, &tls, &batch->tiler_ctx,
-                   batch->framebuffer.cpu);
+      GENX(pan_emit_fbd)(dev, fb, &tls, &batch->tiler_ctx,
+                         batch->framebuffer.cpu);
 }

 /* Mark a surface as written */
@@ -2353,8 +2394,6 @@ panfrost_initialize_surface(struct panfrost_batch *batch,
 static mali_ptr
 emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
 {
-   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
-
    /* Mark the affected buffers as initialized, since we're writing to them.
     * Also, add the surfaces we're writing to into the batch */
@@ -2388,8 +2427,8 @@ emit_fragment_job(struct panfrost_batch *batch, const struct pan_fb_info *pfb)
    struct panfrost_ptr transfer =
       pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

-   pan_emit_fragment_job(dev, pfb, batch->framebuffer.gpu,
-                         transfer.cpu);
+   GENX(pan_emit_fragment_job)(pfb, batch->framebuffer.gpu,
+                               transfer.cpu);

    return transfer.gpu;
 }
@@ -2408,8 +2447,10 @@ pan_draw_mode(enum pipe_prim_type mode)
    DEFINE_CASE(TRIANGLE_STRIP);
    DEFINE_CASE(TRIANGLE_FAN);
    DEFINE_CASE(QUADS);
-   DEFINE_CASE(QUAD_STRIP);
    DEFINE_CASE(POLYGON);
+#if PAN_ARCH <= 6
+   DEFINE_CASE(QUAD_STRIP);
+#endif

    default:
       unreachable("Invalid draw mode");
@@ -2485,7 +2526,7 @@ panfrost_draw_emit_vertex(struct panfrost_batch *batch,
 {
    void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
-   memcpy(section, invocation_template, MALI_INVOCATION_LENGTH);
+   memcpy(section, invocation_template, pan_size(INVOCATION));

    pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
       cfg.job_task_split = 5;
@@ -2493,7 +2534,6 @@ panfrost_draw_emit_vertex(struct panfrost_batch *batch,
    pan_section_pack(job, COMPUTE_JOB, DRAW, cfg) {
       cfg.draw_descriptor_is_64b = true;
-      cfg.texture_descriptor_is_64b = (PAN_ARCH <= 5);
       cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
       cfg.attributes = attribs;
       cfg.attribute_buffers = attrib_bufs;
@@ -2502,8 +2542,6 @@ panfrost_draw_emit_vertex(struct panfrost_batch *batch,
       cfg.thread_storage = batch->tls.gpu;
       pan_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
    }
-
-   pan_section_pack(job, COMPUTE_JOB, DRAW_PADDING, cfg);
 }

 static void
@@ -2612,28 +2650,22 @@ panfrost_batch_get_bifrost_tiler(struct panfrost_batch *batch, unsigned vertex_c
       return batch->tiler_ctx.bifrost;

    struct panfrost_ptr t =
-      pan_pool_alloc_desc(&batch->pool.base, BIFROST_TILER_HEAP);
+      pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

-   pan_emit_bifrost_tiler_heap(dev, t.cpu);
+   GENX(pan_emit_tiler_heap)(dev, t.cpu);

    mali_ptr heap = t.gpu;

-   t = pan_pool_alloc_desc(&batch->pool.base, BIFROST_TILER);
-   pan_emit_bifrost_tiler(dev, batch->key.width, batch->key.height,
-                          util_framebuffer_get_num_samples(&batch->key),
-                          heap, t.cpu);
+   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
+   GENX(pan_emit_tiler_ctx)(dev, batch->key.width, batch->key.height,
+                            util_framebuffer_get_num_samples(&batch->key),
+                            heap, t.cpu);

    batch->tiler_ctx.bifrost = t.gpu;
    return batch->tiler_ctx.bifrost;
 }
 #endif

-#if PAN_ARCH >= 6
-#define TILER_JOB BIFROST_TILER_JOB
-#else
-#define TILER_JOB MIDGARD_TILER_JOB
-#endif
-
 static void
 panfrost_draw_emit_tiler(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
@@ -2646,7 +2678,7 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch,
    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

    void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
-   memcpy(section, invocation_template, MALI_INVOCATION_LENGTH);
+   memcpy(section, invocation_template, pan_size(INVOCATION));

    section = pan_section_ptr(job, TILER_JOB, PRIMITIVE);
    pan_pack(section, PRIMITIVE, cfg) {
@@ -2692,14 +2724,12 @@ panfrost_draw_emit_tiler(struct panfrost_batch *batch,
    }

    pan_section_pack(job, TILER_JOB, PADDING, cfg);
-   pan_section_pack(job, TILER_JOB, DRAW_PADDING, cfg);
 #endif

    section = pan_section_ptr(job, TILER_JOB, DRAW);
    pan_pack(section, DRAW, cfg) {
       cfg.four_components_per_vertex = true;
       cfg.draw_descriptor_is_64b = true;
-      cfg.texture_descriptor_is_64b = (PAN_ARCH <= 5);
       cfg.front_face_ccw = rast->front_ccw;
       cfg.cull_front_face =
rast->cull_face & PIPE_FACE_FRONT; cfg.cull_back_face = rast->cull_face & PIPE_FACE_BACK; @@ -2753,18 +2783,6 @@ panfrost_direct_draw(struct panfrost_batch *batch, struct panfrost_context *ctx = batch->ctx; - /* Fallback for unsupported modes */ - if (!(ctx->draw_modes & BITFIELD_BIT(info->mode))) { - if (draw->count < 4) { - /* Degenerate case? */ - return; - } - - util_primconvert_save_rasterizer_state(ctx->primconvert, &ctx->rasterizer->base); - util_primconvert_draw_vbo(ctx->primconvert, info, drawid_offset, NULL, draw, 1); - return; - } - /* Take into account a negative bias */ ctx->indirect_draw = false; ctx->vertex_count = draw->count + (info->index_size ? abs(draw->index_bias) : 0); @@ -2866,7 +2884,6 @@ panfrost_indirect_draw(struct panfrost_batch *batch, /* TODO: Increment transform feedback offsets */ assert(ctx->streamout.num_targets == 0); - assert(ctx->draw_modes & (1 << info->mode)); ctx->active_prim = info->mode; ctx->drawid = drawid_offset; ctx->indirect_draw = true; @@ -2976,10 +2993,10 @@ panfrost_indirect_draw(struct panfrost_batch *batch, } batch->indirect_draw_job_id = - panfrost_emit_indirect_draw(&batch->pool.base, - &batch->scoreboard, - &draw_info, - &batch->indirect_draw_ctx); + GENX(panfrost_emit_indirect_draw)(&batch->pool.base, + &batch->scoreboard, + &draw_info, + &batch->indirect_draw_ctx); panfrost_emit_vertex_tiler_jobs(batch, &vertex, &tiler); } @@ -2998,8 +3015,8 @@ panfrost_draw_vbo(struct pipe_context *pipe, if (!panfrost_render_condition_check(ctx)) return; - /* Emulate indirect draws when debugging */ - if (dev->debug & PAN_DBG_NOINDIRECT && indirect && indirect->buffer) { + /* Emulate indirect draws unless we're using the experimental path */ + if (!(dev->debug & PAN_DBG_INDIRECT) && indirect && indirect->buffer) { assert(num_draws == 1); util_draw_indirect(pipe, info, indirect); return; @@ -3142,7 +3159,6 @@ panfrost_launch_grid(struct pipe_context *pipe, pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) { cfg.draw_descriptor_is_64b = true; - cfg.texture_descriptor_is_64b = (PAN_ARCH <= 5); cfg.state = panfrost_emit_compute_shader_meta(batch, PIPE_SHADER_COMPUTE); cfg.attributes = panfrost_emit_image_attribs(batch, &cfg.attribute_buffers, PIPE_SHADER_COMPUTE); cfg.thread_storage = panfrost_emit_shared_memory(batch, info); @@ -3154,8 +3170,6 @@ panfrost_launch_grid(struct pipe_context *pipe, PIPE_SHADER_COMPUTE); } - pan_section_pack(t.cpu, COMPUTE_JOB, DRAW_PADDING, cfg); - unsigned indirect_dep = 0; if (info->indirect) { struct pan_indirect_dispatch_info indirect = { @@ -3169,9 +3183,9 @@ panfrost_launch_grid(struct pipe_context *pipe, }, }; - indirect_dep = pan_indirect_dispatch_emit(&batch->pool.base, - &batch->scoreboard, - &indirect); + indirect_dep = GENX(pan_indirect_dispatch_emit)(&batch->pool.base, + &batch->scoreboard, + &indirect); } panfrost_add_job(&batch->pool.base, &batch->scoreboard, @@ -3360,8 +3374,11 @@ panfrost_create_sampler_view( struct pipe_resource *texture, const struct pipe_sampler_view *template) { + struct panfrost_context *ctx = pan_context(pctx); struct panfrost_sampler_view *so = rzalloc(pctx, struct panfrost_sampler_view); + pan_legalize_afbc_format(ctx, pan_resource(texture), template->format); + pipe_reference(NULL, &texture->reference); so->base = *template; @@ -3472,11 +3489,11 @@ panfrost_create_blend_state(struct pipe_context *pipe, } static void -prepare_rsd(struct panfrost_device *dev, - struct panfrost_shader_state *state, +prepare_rsd(struct panfrost_shader_state *state, struct panfrost_pool *pool, bool 
upload)
 {
-   struct mali_renderer_state_packed *out = &state->partial_rsd;
+   struct mali_renderer_state_packed *out =
+      (struct mali_renderer_state_packed *)&state->partial_rsd;

    if (upload) {
       struct panfrost_ptr ptr =
@@ -3487,8 +3504,7 @@ prepare_rsd(struct panfrost_device *dev,
    }

    pan_pack(out, RENDERER_STATE, cfg) {
-      pan_shader_prepare_rsd(dev, &state->info, state->bin.gpu,
-                             &cfg);
+      pan_shader_prepare_rsd(&state->info, state->bin.gpu, &cfg);
    }
 }

@@ -3508,14 +3524,16 @@ static void
 screen_destroy(struct pipe_screen *pscreen)
 {
    struct panfrost_device *dev = pan_device(pscreen);
-   pan_blitter_cleanup(dev);
+   GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
+   GENX(pan_indirect_dispatch_cleanup)(dev);
+   GENX(pan_blitter_cleanup)(dev);
 }

 static void
 preload(struct panfrost_batch *batch, struct pan_fb_info *fb)
 {
-   pan_preload_fb(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,
-                  PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0);
+   GENX(pan_preload_fb)(&batch->pool.base, &batch->scoreboard, fb, batch->tls.gpu,
+                        PAN_ARCH >= 6 ? batch->tiler_ctx.bifrost : 0, NULL);
 }

 static void
@@ -3524,10 +3542,10 @@ init_batch(struct panfrost_batch *batch)
    /* Reserve the framebuffer and local storage descriptors */
    batch->framebuffer =
 #if PAN_ARCH == 4
-      pan_pool_alloc_desc(&batch->pool.base, SINGLE_TARGET_FRAMEBUFFER);
+      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
 #else
       pan_pool_alloc_desc_aggregate(&batch->pool.base,
-                                    PAN_DESC(MULTI_TARGET_FRAMEBUFFER),
+                                    PAN_DESC(FRAMEBUFFER),
                                     PAN_DESC(ZS_CRC_EXTENSION),
                                     PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
@@ -3542,6 +3560,18 @@ init_batch(struct panfrost_batch *batch)
 #endif
 }

+static void
+panfrost_sampler_view_destroy(
+   struct pipe_context *pctx,
+   struct pipe_sampler_view *pview)
+{
+   struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview;
+
+   pipe_resource_reference(&pview->texture, NULL);
+   panfrost_bo_unreference(view->state.bo);
+   ralloc_free(view);
+}
+
 static void
 context_init(struct pipe_context *pipe)
 {
@@ -3552,12 +3582,73 @@ context_init(struct pipe_context *pipe)
    pipe->create_rasterizer_state = panfrost_create_rasterizer_state;
    pipe->create_depth_stencil_alpha_state = panfrost_create_depth_stencil_state;
    pipe->create_sampler_view = panfrost_create_sampler_view;
+   pipe->sampler_view_destroy = panfrost_sampler_view_destroy;
    pipe->create_sampler_state = panfrost_create_sampler_state;
    pipe->create_blend_state = panfrost_create_blend_state;

    pipe->get_sample_position = panfrost_get_sample_position;
 }

+#if PAN_ARCH <= 5
+
+/* Returns the polygon list's GPU address if available, or otherwise allocates
+ * the polygon list. It's perfectly fast to allocate/free BOs directly, since
+ * we'll hit the BO cache and this is one-per-batch anyway. */
+
+static mali_ptr
+batch_get_polygon_list(struct panfrost_batch *batch)
+{
+   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
+
+   if (!batch->tiler_ctx.midgard.polygon_list) {
+      bool has_draws = batch->scoreboard.first_tiler != NULL;
+      unsigned size =
+         panfrost_tiler_get_polygon_list_size(dev,
+                                              batch->key.width,
+                                              batch->key.height,
+                                              has_draws);
+      size = util_next_power_of_two(size);
+
+      /* Create the BO as invisible if we can. In the non-hierarchical tiler case,
+       * we need to write the polygon list manually because there's no WRITE_VALUE
+       * job in the chain (maybe we should add one...).
*/ + bool init_polygon_list = !has_draws && (dev->quirks & MIDGARD_NO_HIER_TILING); + batch->tiler_ctx.midgard.polygon_list = + panfrost_batch_create_bo(batch, size, + init_polygon_list ? 0 : PAN_BO_INVISIBLE, + PIPE_SHADER_VERTEX, + "Polygon list"); + panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list, + PIPE_SHADER_FRAGMENT); + + if (init_polygon_list) { + assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu); + uint32_t *polygon_list_body = + batch->tiler_ctx.midgard.polygon_list->ptr.cpu + + MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE; + + /* Magic for Mali T720 */ + polygon_list_body[0] = 0xa0000000; + } + + batch->tiler_ctx.midgard.disable = !has_draws; + } + + return batch->tiler_ctx.midgard.polygon_list->ptr.gpu; +} +#endif + +static void +init_polygon_list(struct panfrost_batch *batch) +{ +#if PAN_ARCH <= 5 + mali_ptr polygon_list = batch_get_polygon_list(batch); + panfrost_scoreboard_initialize_tiler(&batch->pool.base, + &batch->scoreboard, + polygon_list); +#endif +} + void GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) { @@ -3571,7 +3662,13 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen) screen->vtbl.preload = preload; screen->vtbl.context_init = context_init; screen->vtbl.init_batch = init_batch; + screen->vtbl.get_blend_shader = GENX(pan_blend_get_shader_locked); + screen->vtbl.init_polygon_list = init_polygon_list; + screen->vtbl.get_compiler_options = GENX(pan_shader_get_compiler_options); + screen->vtbl.compile_shader = GENX(pan_shader_compile); - pan_blitter_init(dev, &screen->blitter.bin_pool.base, - &screen->blitter.desc_pool.base); + GENX(pan_blitter_init)(dev, &screen->blitter.bin_pool.base, + &screen->blitter.desc_pool.base); + GENX(pan_indirect_dispatch_init)(dev); + GENX(panfrost_init_indirect_draw_shaders)(dev, &screen->indirect_draw.bin_pool.base); } diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_compute.c b/mesa 3D driver/src/gallium/drivers/panfrost/pan_compute.c index 99c8c28b69..8b1a4265fd 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_compute.c +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_compute.c @@ -44,7 +44,7 @@ panfrost_create_compute_state( const struct pipe_compute_state *cso) { struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_device *dev = pan_device(pctx->screen); + struct panfrost_screen *screen = pan_screen(pctx->screen); struct panfrost_shader_variants *so = CALLOC_STRUCT(panfrost_shader_variants); so->cbase = *cso; @@ -63,7 +63,7 @@ panfrost_create_compute_state( blob_reader_init(&reader, hdr->blob, hdr->num_bytes); const struct nir_shader_compiler_options *options = - pan_shader_get_compiler_options(dev); + screen->vtbl.get_compiler_options(); so->cbase.prog = nir_deserialize(NULL, options, &reader); so->cbase.ir_type = PIPE_SHADER_IR_NIR; diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.c b/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.c index 36c895aad5..69bbc43f82 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.c +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.c @@ -44,7 +44,6 @@ #include "util/format/u_format.h" #include "util/u_prim.h" #include "util/u_prim_restart.h" -#include "indices/u_primconvert.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_from_mesa.h" #include "util/u_math.h" @@ -201,8 +200,11 @@ panfrost_get_blend(struct panfrost_batch *batch, unsigned rti, struct panfrost_b pthread_mutex_lock(&dev->blend_shaders.lock); struct 
pan_blend_shader_variant *shader = - pan_blend_get_shader_locked(dev, &pan_blend, - col0_type, col1_type, rti); + pan_screen(ctx->base.screen)->vtbl.get_blend_shader(dev, + &pan_blend, + col0_type, + col1_type, + rti); /* Size check and upload */ unsigned offset = *shader_offset; @@ -298,6 +300,8 @@ panfrost_create_shader_state( struct panfrost_device *dev = pan_device(pctx->screen); so->base = *cso; + simple_mtx_init(&so->lock, mtx_plain); + /* Token deep copy to prevent memory corruption */ if (cso->type == PIPE_SHADER_IR_TGSI) @@ -340,6 +344,8 @@ panfrost_delete_shader_state( panfrost_bo_unreference(shader_state->linkage.bo); } + simple_mtx_destroy(&cso->lock); + free(cso->variants); free(so); } @@ -367,8 +373,6 @@ panfrost_variant_matches( struct panfrost_shader_state *variant, enum pipe_shader_type type) { - struct panfrost_device *dev = pan_device(ctx->base.screen); - if (variant->info.stage == MESA_SHADER_FRAGMENT && variant->info.fs.outputs_read) { struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer; @@ -380,10 +384,7 @@ panfrost_variant_matches( if ((fb->nr_cbufs > i) && fb->cbufs[i]) fmt = fb->cbufs[i]->format; - const struct util_format_description *desc = - util_format_description(fmt); - - if (pan_format_class_load(desc, dev->quirks) == PAN_FORMAT_NATIVE) + if (panfrost_blendable_formats_v6[fmt].internal) fmt = PIPE_FORMAT_NONE; if (variant->rt_formats[i] != fmt) @@ -445,7 +446,6 @@ panfrost_bind_shader_state( enum pipe_shader_type type) { struct panfrost_context *ctx = pan_context(pctx); - struct panfrost_device *dev = pan_device(ctx->base.screen); ctx->shader[type] = hwcso; ctx->dirty |= PAN_DIRTY_TLS_SIZE; @@ -458,6 +458,8 @@ panfrost_bind_shader_state( signed variant = -1; struct panfrost_shader_variants *variants = (struct panfrost_shader_variants *) hwcso; + simple_mtx_lock(&variants->lock); + for (unsigned i = 0; i < variants->variant_count; ++i) { if (panfrost_variant_matches(ctx, &variants->variants[i], type)) { variant = i; @@ -501,10 +503,7 @@ panfrost_bind_shader_state( if ((fb->nr_cbufs > i) && fb->cbufs[i]) fmt = fb->cbufs[i]->format; - const struct util_format_description *desc = - util_format_description(fmt); - - if (pan_format_class_load(desc, dev->quirks) == PAN_FORMAT_NATIVE) + if (panfrost_blendable_formats_v6[fmt].internal) fmt = PIPE_FORMAT_NONE; v->rt_formats[i] = fmt; @@ -538,6 +537,11 @@ panfrost_bind_shader_state( update_so_info(&shader_state->stream_output, shader_state->info.outputs_written); } + + /* TODO: it would be more efficient to release the lock before + * compiling instead of after, but that can race if thread A compiles a + * variant while thread B searches for that same variant */ + simple_mtx_unlock(&variants->lock); } static void * @@ -618,6 +622,7 @@ panfrost_set_sampler_views( enum pipe_shader_type shader, unsigned start_slot, unsigned num_views, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct panfrost_context *ctx = pan_context(pctx); @@ -634,8 +639,14 @@ panfrost_set_sampler_views( for (i = 0; i < num_views; ++i) { if (views[i]) new_nr = i + 1; - pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][i], - views[i]); + if (take_ownership) { + pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][i], + NULL); + ctx->sampler_views[shader][i] = (struct panfrost_sampler_view *)views[i]; + } else { + pipe_sampler_view_reference((struct pipe_sampler_view **)&ctx->sampler_views[shader][i], + views[i]); + } } for (; i < 
ctx->sampler_view_count[shader]; i++) { @@ -645,18 +656,6 @@ panfrost_set_sampler_views( ctx->sampler_view_count[shader] = new_nr; } -static void -panfrost_sampler_view_destroy( - struct pipe_context *pctx, - struct pipe_sampler_view *pview) -{ - struct panfrost_sampler_view *view = (struct panfrost_sampler_view *) pview; - - pipe_resource_reference(&pview->texture, NULL); - panfrost_bo_unreference(view->state.bo); - ralloc_free(view); -} - static void panfrost_set_shader_buffers( struct pipe_context *pctx, @@ -689,11 +688,10 @@ panfrost_set_framebuffer_state(struct pipe_context *pctx, } /* We may need to generate a new variant if the fragment shader is - * keyed to the framebuffer format (due to EXT_framebuffer_fetch) */ + * keyed to the framebuffer format or render target count */ struct panfrost_shader_variants *fs = ctx->shader[PIPE_SHADER_FRAGMENT]; - if (fs && fs->variant_count && - fs->variants[fs->active_variant].info.fs.outputs_read) + if (fs && fs->variant_count) ctx->base.bind_fs_state(&ctx->base, fs); } @@ -795,6 +793,8 @@ panfrost_destroy(struct pipe_context *pipe) { struct panfrost_context *panfrost = pan_context(pipe); + _mesa_hash_table_destroy(panfrost->writers, NULL); + if (panfrost->blitter) util_blitter_destroy(panfrost->blitter); @@ -804,8 +804,6 @@ panfrost_destroy(struct pipe_context *pipe) panfrost_pool_cleanup(&panfrost->descs); panfrost_pool_cleanup(&panfrost->shaders); - util_primconvert_destroy(panfrost->primconvert); - ralloc_free(pipe); } @@ -1057,7 +1055,6 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) gallium->set_stencil_ref = panfrost_set_stencil_ref; gallium->set_sampler_views = panfrost_set_sampler_views; - gallium->sampler_view_destroy = panfrost_sampler_view_destroy; gallium->bind_rasterizer_state = panfrost_bind_rasterizer_state; gallium->delete_rasterizer_state = panfrost_generic_cso_delete; @@ -1118,20 +1115,11 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags) panfrost_pool_init(&ctx->shaders, ctx, dev, PAN_BO_EXECUTE, 4096, "Shaders", true, false); - /* All of our GPUs support ES mode. Midgard supports additionally - * QUADS/QUAD_STRIPS/POLYGON. Bifrost supports just QUADS. */ - - ctx->draw_modes = (1 << (PIPE_PRIM_QUADS + 1)) - 1; - - if (!pan_is_bifrost(dev)) { - ctx->draw_modes |= (1 << PIPE_PRIM_QUAD_STRIP); - ctx->draw_modes |= (1 << PIPE_PRIM_POLYGON); - } - - ctx->primconvert = util_primconvert_create(gallium, ctx->draw_modes); - ctx->blitter = util_blitter_create(gallium); + ctx->writers = _mesa_hash_table_create(gallium, _mesa_hash_pointer, + _mesa_key_pointer_equal); + assert(ctx->blitter); /* Prepare for render! 
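The ctx->writers table created above (and torn down in panfrost_destroy) replaces the old per-resource writer pointer: the context owns one resource-to-writing-batch map that flushes can query. A minimal sketch of the idea, assuming a toy fixed-capacity linear-probing table in place of Mesa's _mesa_hash_table (no growth, deletion, or collision-resistant hashing; illustration only):

#include <stddef.h>
#include <stdint.h>

#define WRITER_SLOTS 64 /* power of two, arbitrary for the sketch */

struct writer_entry {
        const void *resource; /* key: resource pointer, NULL means empty */
        void *batch;          /* value: batch currently writing the key */
};

static struct writer_entry writer_table[WRITER_SLOTS];

static size_t
writer_slot(const void *key)
{
        /* Cheap pointer hash; the driver uses _mesa_hash_pointer */
        return ((uintptr_t)key >> 4) & (WRITER_SLOTS - 1);
}

static void
writer_set(const void *resource, void *batch)
{
        size_t i = writer_slot(resource);
        while (writer_table[i].resource &&
               writer_table[i].resource != resource)
                i = (i + 1) & (WRITER_SLOTS - 1); /* linear probe */
        writer_table[i].resource = resource;
        writer_table[i].batch = batch;
}

static void *
writer_get(const void *resource)
{
        size_t i = writer_slot(resource);
        while (writer_table[i].resource) {
                if (writer_table[i].resource == resource)
                        return writer_table[i].batch;
                i = (i + 1) & (WRITER_SLOTS - 1);
        }
        return NULL; /* no batch is writing this resource */
}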
*/ diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.h b/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.h index 8ed66966b9..82486f7f14 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.h +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_context.h @@ -26,7 +26,6 @@ #define __BUILDER_H__ #define _LARGEFILE64_SOURCE 1 -#define CACHE_LINE_SIZE 1024 /* TODO */ #include #include #include "pan_resource.h" @@ -44,6 +43,7 @@ #include "pipe/p_state.h" #include "util/u_blitter.h" #include "util/hash_table.h" +#include "util/simple_mtx.h" #include "midgard/midgard_compile.h" #include "compiler/shader_enums.h" @@ -140,17 +140,20 @@ struct panfrost_context { struct { uint64_t seqnum; struct panfrost_batch slots[PAN_MAX_BATCHES]; + + /** Set of active batches for faster traversal */ + BITSET_DECLARE(active, PAN_MAX_BATCHES); } batches; + /* Map from resources to panfrost_batches */ + struct hash_table *writers; + /* Bound job batch */ struct panfrost_batch *batch; /* Within a launch_grid call.. */ const struct pipe_grid_info *compute_grid; - /* Bit mask for supported PIPE_DRAW for this hardware */ - unsigned draw_modes; - struct pipe_framebuffer_state pipe_framebuffer; struct panfrost_streamout streamout; @@ -175,10 +178,7 @@ struct panfrost_context { * it is disabled, just equal to plain vertex count */ unsigned padded_count; - /* TODO: Multiple uniform buffers (index =/= 0), finer updates? */ - struct panfrost_constant_buffer constant_buffer[PIPE_SHADER_TYPES]; - struct panfrost_rasterizer *rasterizer; struct panfrost_shader_variants *shader[PIPE_SHADER_TYPES]; struct panfrost_vertex_state *vertex; @@ -198,7 +198,6 @@ struct panfrost_context { struct panfrost_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; unsigned sampler_view_count[PIPE_SHADER_TYPES]; - struct primconvert_context *primconvert; struct blitter_context *blitter; struct panfrost_blend_state *blend; @@ -223,13 +222,7 @@ struct panfrost_context { /* Corresponds to the CSO */ -struct panfrost_rasterizer { - struct pipe_rasterizer_state base; - - /* Partially packed RSD words */ - struct mali_multisample_misc_packed multisample; - struct mali_stencil_mask_misc_packed stencil_misc; -}; +struct panfrost_rasterizer; /* Linked varyings */ struct pan_linkage { @@ -248,6 +241,8 @@ struct pan_linkage { uint32_t stride; }; +#define RSD_WORDS 16 + /* Variants bundle together to form the backing CSO, bundling multiple * shaders with varying emulated features baked in */ @@ -260,7 +255,7 @@ struct panfrost_shader_state { struct panfrost_pool_ref bin, state; /* For fragment shaders, a prepared (but not uploaded RSD) */ - struct mali_renderer_state_packed partial_rsd; + uint32_t partial_rsd[RSD_WORDS]; struct pan_shader_info info; @@ -290,6 +285,9 @@ struct panfrost_shader_variants { struct pipe_compute_state cbase; }; + /** Lock for the variants array */ + simple_mtx_t lock; + struct panfrost_shader_state *variants; unsigned variant_space; @@ -317,35 +315,9 @@ struct panfrost_vertex_state { unsigned formats[PIPE_MAX_ATTRIBS]; }; -struct panfrost_zsa_state { - struct pipe_depth_stencil_alpha_state base; - - /* Is any depth, stencil, or alpha testing enabled? 
*/ - bool enabled; - - /* Mask of PIPE_CLEAR_{DEPTH,STENCIL} written */ - unsigned draws; - - /* Prepacked words from the RSD */ - struct mali_multisample_misc_packed rsd_depth; - struct mali_stencil_mask_misc_packed rsd_stencil; - struct mali_stencil_packed stencil_front, stencil_back; -}; - -struct panfrost_sampler_state { - struct pipe_sampler_state base; - struct mali_midgard_sampler_packed hw; -}; - -/* Misnomer: Sampler view corresponds to textures, not samplers */ - -struct panfrost_sampler_view { - struct pipe_sampler_view base; - struct panfrost_pool_ref state; - struct mali_bifrost_texture_packed bifrost_descriptor; - mali_ptr texture_bo; - uint64_t modifier; -}; +struct panfrost_zsa_state; +struct panfrost_sampler_state; +struct panfrost_sampler_view; static inline struct panfrost_context * pan_context(struct pipe_context *pcontext) diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.c b/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.c index 620b9d19b5..ebf42cbfc8 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.c +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.c @@ -40,12 +40,26 @@ #include "decode.h" #include "panfrost-quirks.h" +#define foreach_batch(ctx, idx) \ + BITSET_FOREACH_SET(idx, ctx->batches.active, PAN_MAX_BATCHES) + static unsigned panfrost_batch_idx(struct panfrost_batch *batch) { return batch - batch->ctx->batches.slots; } +/* Adds the BO backing surface to a batch if the surface is non-null */ + +static void +panfrost_batch_add_surface(struct panfrost_batch *batch, struct pipe_surface *surf) +{ + if (surf) { + struct panfrost_resource *rsrc = pan_resource(surf->texture); + panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT); + } +} + static void panfrost_batch_init(struct panfrost_context *ctx, const struct pipe_framebuffer_state *key, @@ -67,7 +81,8 @@ panfrost_batch_init(struct panfrost_context *ctx, batch->maxx = batch->maxy = 0; util_copy_framebuffer_state(&batch->key, key); - util_dynarray_init(&batch->resources, NULL); + batch->resources =_mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); /* Preallocate the main pool, since every batch has at least one job * structure so it will be used */ @@ -79,18 +94,17 @@ panfrost_batch_init(struct panfrost_context *ctx, panfrost_pool_init(&batch->invisible_pool, NULL, dev, PAN_BO_INVISIBLE, 65536, "Varyings", false, true); - panfrost_batch_add_fbo_bos(batch); + for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) + panfrost_batch_add_surface(batch, batch->key.cbufs[i]); + + panfrost_batch_add_surface(batch, batch->key.zsbuf); screen->vtbl.init_batch(batch); } static void -panfrost_batch_cleanup(struct panfrost_batch *batch) +panfrost_batch_cleanup(struct panfrost_context *ctx, struct panfrost_batch *batch) { - if (!batch) - return; - - struct panfrost_context *ctx = batch->ctx; struct panfrost_device *dev = pan_device(ctx->base.screen); assert(batch->seqnum); @@ -110,16 +124,20 @@ panfrost_batch_cleanup(struct panfrost_batch *batch) panfrost_bo_unreference(bo); } - util_dynarray_foreach(&batch->resources, struct panfrost_resource *, rsrc) { - BITSET_CLEAR((*rsrc)->track.users, batch_idx); + set_foreach_remove(batch->resources, entry) { + struct panfrost_resource *rsrc = (void *) entry->key; - if ((*rsrc)->track.writer == batch) - (*rsrc)->track.writer = NULL; + if (_mesa_hash_table_search(ctx->writers, rsrc)) { + _mesa_hash_table_remove_key(ctx->writers, rsrc); + rsrc->track.nr_writers--; + } - pipe_resource_reference((struct pipe_resource **) 
rsrc, NULL); + rsrc->track.nr_users--; + + pipe_resource_reference((struct pipe_resource **) &rsrc, NULL); } - util_dynarray_fini(&batch->resources); + _mesa_set_destroy(batch->resources, NULL); panfrost_pool_cleanup(&batch->pool); panfrost_pool_cleanup(&batch->invisible_pool); @@ -128,10 +146,12 @@ panfrost_batch_cleanup(struct panfrost_batch *batch) util_sparse_array_finish(&batch->bos); memset(batch, 0, sizeof(*batch)); + BITSET_CLEAR(ctx->batches.active, batch_idx); } static void -panfrost_batch_submit(struct panfrost_batch *batch, +panfrost_batch_submit(struct panfrost_context *ctx, + struct panfrost_batch *batch, uint32_t in_sync, uint32_t out_sync); static struct panfrost_batch * @@ -158,37 +178,13 @@ panfrost_get_batch(struct panfrost_context *ctx, /* The selected slot is used, we need to flush the batch */ if (batch->seqnum) - panfrost_batch_submit(batch, 0, 0); + panfrost_batch_submit(ctx, batch, 0, 0); panfrost_batch_init(ctx, key, batch); - return batch; -} + unsigned batch_idx = panfrost_batch_idx(batch); + BITSET_SET(ctx->batches.active, batch_idx); -struct panfrost_batch * -panfrost_get_fresh_batch(struct panfrost_context *ctx, - const struct pipe_framebuffer_state *key, - const char *reason) -{ - struct panfrost_batch *batch = panfrost_get_batch(ctx, key); - - panfrost_dirty_state_all(ctx); - - /* The batch has no draw/clear queued, let's return it directly. - * Note that it's perfectly fine to re-use a batch with an - * existing clear, we'll just update it with the new clear request. - */ - if (!batch->scoreboard.first_job) { - ctx->batch = batch; - return batch; - } - - /* Otherwise, we need to flush the existing one and instantiate a new - * one. - */ - perf_debug_ctx(ctx, "Flushing a batch due to: %s", reason); - panfrost_batch_submit(batch, 0, 0); - batch = panfrost_get_batch(ctx, key); return batch; } @@ -225,21 +221,15 @@ panfrost_get_fresh_batch_for_fbo(struct panfrost_context *ctx, const char *reaso batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); panfrost_dirty_state_all(ctx); - /* The batch has no draw/clear queued, let's return it directly. - * Note that it's perfectly fine to re-use a batch with an - * existing clear, we'll just update it with the new clear request. - */ - if (!batch->scoreboard.first_job) { - ctx->batch = batch; - return batch; + /* We only need to submit and get a fresh batch if there is a + * draw/clear queued. Otherwise we may reuse the batch. */ + + if (batch->scoreboard.first_job) { + perf_debug_ctx(ctx, "Flushing the current FBO due to: %s", reason); + panfrost_batch_submit(ctx, batch, 0, 0); + batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); } - /* Otherwise, we need to freeze the existing one and instantiate a new - * one. - */ - perf_debug_ctx(ctx, "Flushing the current FBO due to: %s", reason); - panfrost_batch_submit(batch, 0, 0); - batch = panfrost_get_batch(ctx, &ctx->pipe_framebuffer); ctx->batch = batch; return batch; } @@ -250,33 +240,40 @@ panfrost_batch_update_access(struct panfrost_batch *batch, { struct panfrost_context *ctx = batch->ctx; uint32_t batch_idx = panfrost_batch_idx(batch); - struct panfrost_batch *writer = rsrc->track.writer; + struct hash_entry *entry = _mesa_hash_table_search(ctx->writers, rsrc); + struct panfrost_batch *writer = entry ?
entry->data : NULL; + bool found = false; - if (unlikely(!BITSET_TEST(rsrc->track.users, batch_idx))) { - BITSET_SET(rsrc->track.users, batch_idx); + _mesa_set_search_or_add(batch->resources, rsrc, &found); + + if (!found) { + /* Cache number of batches accessing a resource */ + rsrc->track.nr_users++; /* Reference the resource on the batch */ - struct pipe_resource **dst = util_dynarray_grow(&batch->resources, - struct pipe_resource *, 1); - - *dst = NULL; - pipe_resource_reference(dst, &rsrc->base); + pipe_reference(NULL, &rsrc->base.reference); } /* Flush users if required */ if (writes || ((writer != NULL) && (writer != batch))) { unsigned i; - BITSET_FOREACH_SET(i, rsrc->track.users, PAN_MAX_BATCHES) { + foreach_batch(ctx, i) { + struct panfrost_batch *batch = &ctx->batches.slots[i]; + /* Skip the entry if this is our batch. */ if (i == batch_idx) continue; - panfrost_batch_submit(&ctx->batches.slots[i], 0, 0); + /* Submit if it's a user */ + if (_mesa_set_search(batch->resources, rsrc)) + panfrost_batch_submit(ctx, batch, 0, 0); } } - if (writes) - rsrc->track.writer = batch; + if (writes) { + _mesa_hash_table_insert(ctx->writers, rsrc, batch); + rsrc->track.nr_writers++; + } } static void @@ -356,26 +353,6 @@ panfrost_batch_write_rsrc(struct panfrost_batch *batch, panfrost_batch_update_access(batch, rsrc, true); } -/* Adds the BO backing surface to a batch if the surface is non-null */ - -static void -panfrost_batch_add_surface(struct panfrost_batch *batch, struct pipe_surface *surf) -{ - if (surf) { - struct panfrost_resource *rsrc = pan_resource(surf->texture); - panfrost_batch_write_rsrc(batch, rsrc, PIPE_SHADER_FRAGMENT); - } -} - -void -panfrost_batch_add_fbo_bos(struct panfrost_batch *batch) -{ - for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) - panfrost_batch_add_surface(batch, batch->key.cbufs[i]); - - panfrost_batch_add_surface(batch, batch->key.zsbuf); -} - struct panfrost_bo * panfrost_batch_create_bo(struct panfrost_batch *batch, size_t size, uint32_t create_flags, enum pipe_shader_type stage, @@ -396,52 +373,6 @@ panfrost_batch_create_bo(struct panfrost_batch *batch, size_t size, return bo; } -/* Returns the polygon list's GPU address if available, or otherwise allocates - * the polygon list. It's perfectly fast to use allocate/free BO directly, - * since we'll hit the BO cache and this is one-per-batch anyway. */ - -static mali_ptr -panfrost_batch_get_polygon_list(struct panfrost_batch *batch) -{ - struct panfrost_device *dev = pan_device(batch->ctx->base.screen); - - assert(!pan_is_bifrost(dev)); - - if (!batch->tiler_ctx.midgard.polygon_list) { - bool has_draws = batch->scoreboard.first_tiler != NULL; - unsigned size = - panfrost_tiler_get_polygon_list_size(dev, - batch->key.width, - batch->key.height, - has_draws); - size = util_next_power_of_two(size); - - /* Create the BO as invisible if we can. In the non-hierarchical tiler case, - * we need to write the polygon list manually because there's not WRITE_VALUE - * job in the chain (maybe we should add one...). */ - bool init_polygon_list = !has_draws && (dev->quirks & MIDGARD_NO_HIER_TILING); - batch->tiler_ctx.midgard.polygon_list = - panfrost_batch_create_bo(batch, size, - init_polygon_list ?
0 : PAN_BO_INVISIBLE, - PIPE_SHADER_VERTEX, - "Polygon list"); - panfrost_batch_add_bo(batch, batch->tiler_ctx.midgard.polygon_list, - PIPE_SHADER_FRAGMENT); - - if (init_polygon_list) { - assert(batch->tiler_ctx.midgard.polygon_list->ptr.cpu); - uint32_t *polygon_list_body = - batch->tiler_ctx.midgard.polygon_list->ptr.cpu + - MALI_MIDGARD_TILER_MINIMUM_HEADER_SIZE; - polygon_list_body[0] = 0xa0000000; /* TODO: Just that? */ - } - - batch->tiler_ctx.midgard.disable = !has_draws; - } - - return batch->tiler_ctx.midgard.polygon_list->ptr.gpu; -} - struct panfrost_bo * panfrost_batch_get_scratchpad(struct panfrost_batch *batch, unsigned size_per_thread, @@ -712,10 +643,11 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch, INT64_MAX, 0, NULL); if (dev->debug & PAN_DBG_TRACE) - pandecode_jc(submit.jc, pan_is_bifrost(dev), dev->gpu_id); + pandecode_jc(submit.jc, dev->gpu_id); - if (dev->debug & PAN_DBG_SYNC) - pandecode_abort_on_fault(submit.jc); + /* Jobs won't be complete if blackhole rendering, that's ok */ + if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC) + pandecode_abort_on_fault(submit.jc, dev->gpu_id); } return 0; @@ -796,12 +728,12 @@ panfrost_emit_tile_map(struct panfrost_batch *batch, struct pan_fb_info *fb) } static void -panfrost_batch_submit(struct panfrost_batch *batch, +panfrost_batch_submit(struct panfrost_context *ctx, + struct panfrost_batch *batch, uint32_t in_sync, uint32_t out_sync) { - struct pipe_screen *pscreen = batch->ctx->base.screen; + struct pipe_screen *pscreen = ctx->base.screen; struct panfrost_screen *screen = pan_screen(pscreen); - struct panfrost_device *dev = pan_device(pscreen); int ret; /* Nothing to do! */ @@ -814,14 +746,7 @@ panfrost_batch_submit(struct panfrost_batch *batch, panfrost_batch_to_fb_info(batch, &fb, rts, &zs, &s, false); screen->vtbl.preload(batch, &fb); - - if (!pan_is_bifrost(dev)) { - mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch); - - panfrost_scoreboard_initialize_tiler(&batch->pool.base, - &batch->scoreboard, - polygon_list); - } + screen->vtbl.init_polygon_list(batch); /* Now that all draws are in, we can finally prepare the * FBD for the batch (if there is one). 
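Note the shape of the change here: panfrost_batch_submit() now calls screen->vtbl.preload() and screen->vtbl.init_polygon_list() unconditionally, and the per-generation build decides what those hooks do, with the Bifrost polygon-list hook compiling to a no-op. A compressed sketch of that dispatch pattern, using illustrative names rather than Mesa's GENX machinery:

struct gpu_vtbl {
        void (*init_polygon_list)(void *batch);
};

/* In the real driver each generation's file compiles its own copy of the
 * hook behind a GENX()-mangled symbol; two plain functions stand in here. */
static void
midgard_init_polygon_list(void *batch)
{
        /* v4/v5: allocate the polygon list BO and prime the scoreboard */
}

static void
bifrost_init_polygon_list(void *batch)
{
        /* v6+: no Midgard-style polygon list exists; nothing to do */
}

static void
vtbl_init(struct gpu_vtbl *vtbl, unsigned arch)
{
        vtbl->init_polygon_list = (arch <= 5) ? midgard_init_polygon_list
                                              : bifrost_init_polygon_list;
}

This keeps generation checks out of the common submit path; the cost is one indirect call per hook, negligible next to a kernel submit ioctl.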
*/ @@ -850,13 +775,13 @@ panfrost_batch_submit(struct panfrost_batch *batch, if (!batch->key.cbufs[i]) continue; - panfrost_resource_set_damage_region(batch->ctx->base.screen, + panfrost_resource_set_damage_region(ctx->base.screen, batch->key.cbufs[i]->texture, 0, NULL); } out: - panfrost_batch_cleanup(batch); + panfrost_batch_cleanup(ctx, batch); } /* Submit all batches, applying the out_sync to the currently bound batch */ @@ -865,14 +790,14 @@ void panfrost_flush_all_batches(struct panfrost_context *ctx, const char *reason) { struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx); - panfrost_batch_submit(batch, ctx->syncobj, ctx->syncobj); + panfrost_batch_submit(ctx, batch, ctx->syncobj, ctx->syncobj); for (unsigned i = 0; i < PAN_MAX_BATCHES; i++) { if (ctx->batches.slots[i].seqnum) { if (reason) perf_debug_ctx(ctx, "Flushing everything due to: %s", reason); - panfrost_batch_submit(&ctx->batches.slots[i], + panfrost_batch_submit(ctx, &ctx->batches.slots[i], ctx->syncobj, ctx->syncobj); } } @@ -883,10 +808,11 @@ panfrost_flush_writer(struct panfrost_context *ctx, struct panfrost_resource *rsrc, const char *reason) { - if (rsrc->track.writer) { + struct hash_entry *entry = _mesa_hash_table_search(ctx->writers, rsrc); + + if (entry) { perf_debug_ctx(ctx, "Flushing writer due to: %s", reason); - panfrost_batch_submit(rsrc->track.writer, ctx->syncobj, ctx->syncobj); - rsrc->track.writer = NULL; + panfrost_batch_submit(ctx, entry->data, ctx->syncobj, ctx->syncobj); } } @@ -896,14 +822,15 @@ panfrost_flush_batches_accessing_rsrc(struct panfrost_context *ctx, const char *reason) { unsigned i; - BITSET_FOREACH_SET(i, rsrc->track.users, PAN_MAX_BATCHES) { - perf_debug_ctx(ctx, "Flushing user due to: %s", reason); - panfrost_batch_submit(&ctx->batches.slots[i], - ctx->syncobj, ctx->syncobj); - } + foreach_batch(ctx, i) { + struct panfrost_batch *batch = &ctx->batches.slots[i]; - assert(!BITSET_COUNT(rsrc->track.users)); - rsrc->track.writer = NULL; + if (!_mesa_set_search(batch->resources, rsrc)) + continue; + + perf_debug_ctx(ctx, "Flushing user due to: %s", reason); + panfrost_batch_submit(ctx, batch, ctx->syncobj, ctx->syncobj); + } } void @@ -974,14 +901,3 @@ panfrost_batch_union_scissor(struct panfrost_batch *batch, batch->maxx = MAX2(batch->maxx, maxx); batch->maxy = MAX2(batch->maxy, maxy); } - -void -panfrost_batch_intersection_scissor(struct panfrost_batch *batch, - unsigned minx, unsigned miny, - unsigned maxx, unsigned maxy) -{ - batch->minx = MAX2(batch->minx, minx); - batch->miny = MAX2(batch->miny, miny); - batch->maxx = MIN2(batch->maxx, maxx); - batch->maxy = MIN2(batch->maxy, maxy); -} diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.h b/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.h index 6b613aa654..32ba7be754 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.h +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_job.h @@ -130,17 +130,12 @@ struct panfrost_batch { mali_ptr uniform_buffers[PIPE_SHADER_TYPES]; mali_ptr push_uniforms[PIPE_SHADER_TYPES]; - /* Referenced resources for cleanup */ - struct util_dynarray resources; + /* Referenced resources */ + struct set *resources; }; /* Functions for managing the above */ -struct panfrost_batch * -panfrost_get_fresh_batch(struct panfrost_context *ctx, - const struct pipe_framebuffer_state *key, - const char *reason); - struct panfrost_batch * panfrost_get_batch_for_fbo(struct panfrost_context *ctx); @@ -162,9 +157,6 @@ panfrost_batch_write_rsrc(struct panfrost_batch *batch, struct 
panfrost_resource *rsrc, enum pipe_shader_type stage); -void -panfrost_batch_add_fbo_bos(struct panfrost_batch *batch); - struct panfrost_bo * panfrost_batch_create_bo(struct panfrost_batch *batch, size_t size, uint32_t create_flags, uint32_t access_flags, @@ -203,9 +195,4 @@ panfrost_batch_union_scissor(struct panfrost_batch *batch, unsigned minx, unsigned miny, unsigned maxx, unsigned maxy); -void -panfrost_batch_intersection_scissor(struct panfrost_batch *batch, - unsigned minx, unsigned miny, - unsigned maxx, unsigned maxy); - #endif diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.c b/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.c index 49bd3fb1e9..b6a6d81138 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.c +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.c @@ -67,7 +67,7 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, assert(whandle->type == WINSYS_HANDLE_TYPE_FD); - rsc = rzalloc(pscreen, struct panfrost_resource); + rsc = CALLOC_STRUCT(panfrost_resource); if (!rsc) return NULL; @@ -98,7 +98,7 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, crc_mode, &explicit_layout); if (!valid) { - ralloc_free(rsc); + FREE(rsc); return NULL; } @@ -107,7 +107,7 @@ panfrost_resource_from_handle(struct pipe_screen *pscreen, * memory space to mmap it etc. */ if (!rsc->image.data.bo) { - ralloc_free(rsc); + FREE(rsc); return NULL; } if (rsc->image.layout.crc_mode == PAN_IMAGE_CRC_OOB) @@ -135,8 +135,21 @@ panfrost_resource_get_handle(struct pipe_screen *pscreen, unsigned usage) { struct panfrost_device *dev = pan_device(pscreen); - struct panfrost_resource *rsrc = (struct panfrost_resource *) pt; - struct renderonly_scanout *scanout = rsrc->scanout; + struct panfrost_resource *rsrc; + struct renderonly_scanout *scanout; + struct pipe_resource *cur = pt; + + /* Even though panfrost doesn't support multi-planar formats, we + * can get here through GBM, which does. Walk the list of planes + * to find the right one. + */ + for (int i = 0; i < handle->plane; i++) { + cur = cur->next; + if (!cur) + return false; + } + rsrc = pan_resource(cur); + scanout = rsrc->scanout; handle->modifier = rsrc->image.layout.modifier; rsrc->modifier_constant = true; @@ -191,6 +204,8 @@ panfrost_resource_get_param(struct pipe_screen *pscreen, unsigned usage, uint64_t *value) { struct panfrost_resource *rsrc = (struct panfrost_resource *) prsc; + struct pipe_resource *cur; + unsigned count; switch (param) { case PIPE_RESOURCE_PARAM_STRIDE: @@ -202,6 +217,16 @@ panfrost_resource_get_param(struct pipe_screen *pscreen, case PIPE_RESOURCE_PARAM_MODIFIER: *value = rsrc->image.layout.modifier; return true; + case PIPE_RESOURCE_PARAM_NPLANES: + /* Panfrost doesn't directly support multi-planar formats, + * but we should still handle this case for gbm users + * that might want to use resources shared with panfrost + * on video processing hardware that does. 
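Both the handle export above and the NPLANES query just below rely on multi-planar resources being chained through pipe_resource::next. A self-contained sketch of that walk over a toy struct (not Gallium's actual pipe_resource):

struct toy_resource {
        struct toy_resource *next; /* next plane; NULL terminates the chain */
};

/* The plane count is simply the chain length */
static unsigned
count_planes(const struct toy_resource *res)
{
        unsigned count = 0;
        for (const struct toy_resource *cur = res; cur; cur = cur->next)
                count++;
        return count;
}

/* Returns NULL when the chain is shorter than requested, mirroring the
 * early bail-out in panfrost_resource_get_handle() above */
static const struct toy_resource *
get_plane(const struct toy_resource *res, unsigned plane)
{
        const struct toy_resource *cur = res;
        for (unsigned i = 0; cur && i < plane; i++)
                cur = cur->next;
        return cur;
}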
+ */ + for (count = 0, cur = prsc; cur; cur = cur->next) + count++; + *value = count; + return true; default: return false; } @@ -218,8 +243,11 @@ panfrost_create_surface(struct pipe_context *pipe, struct pipe_resource *pt, const struct pipe_surface *surf_tmpl) { + struct panfrost_context *ctx = pan_context(pipe); struct pipe_surface *ps = NULL; + pan_legalize_afbc_format(ctx, pan_resource(pt), surf_tmpl->format); + ps = CALLOC_STRUCT(pipe_surface); if (ps) { @@ -355,8 +383,8 @@ panfrost_should_afbc(struct panfrost_device *dev, if (pres->base.bind & ~valid_binding) return false; - /* AFBC introduced with Mali T760 */ - if (dev->quirks & MIDGARD_NO_AFBC) + /* AFBC support is optional */ + if (!dev->has_afbc) return false; /* AFBC<-->staging is expensive */ @@ -554,7 +582,7 @@ panfrost_resource_set_damage_region(struct pipe_screen *screen, pres->damage.tile_map.stride * DIV_ROUND_UP(res->height0, 32); pres->damage.tile_map.data = - ralloc_size(pres, pres->damage.tile_map.size); + malloc(pres->damage.tile_map.size); } memset(pres->damage.tile_map.data, 0, pres->damage.tile_map.size); @@ -638,7 +666,7 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED))) return panfrost_create_scanout_res(screen, template, modifier); - struct panfrost_resource *so = rzalloc(screen, struct panfrost_resource); + struct panfrost_resource *so = CALLOC_STRUCT(panfrost_resource); so->base = *template; so->base.screen = screen; @@ -676,7 +704,7 @@ panfrost_resource_create_with_modifier(struct pipe_screen *screen, panfrost_resource_set_damage_region(screen, &so->base, 0, NULL); if (template->bind & PIPE_BIND_INDEX_BUFFER) - so->index_cache = rzalloc(so, struct panfrost_minmax_cache); + so->index_cache = CALLOC_STRUCT(panfrost_minmax_cache); return (struct pipe_resource *)so; } @@ -727,8 +755,11 @@ panfrost_resource_destroy(struct pipe_screen *screen, if (rsrc->image.crc.bo) panfrost_bo_unreference(rsrc->image.crc.bo); + free(rsrc->index_cache); + free(rsrc->damage.tile_map.data); + util_range_destroy(&rsrc->valid_buffer_range); - ralloc_free(rsrc); + free(rsrc); } /* Most of the time we can do CPU-side transfers, but sometimes we need to use @@ -871,7 +902,7 @@ panfrost_ptr_map(struct pipe_context *pctx, bool valid = BITSET_TEST(rsrc->valid.data, level); - if ((usage & PIPE_MAP_READ) && (valid || rsrc->track.writer)) { + if ((usage & PIPE_MAP_READ) && (valid || rsrc->track.nr_writers > 0)) { pan_blit_to_staging(pctx, transfer); panfrost_flush_writer(ctx, staging, "AFBC read staging blit"); panfrost_bo_wait(staging->image.data.bo, INT64_MAX, false); @@ -895,7 +926,7 @@ panfrost_ptr_map(struct pipe_context *pctx, (usage & PIPE_MAP_WRITE) && !(resource->target == PIPE_BUFFER && !util_ranges_intersect(&rsrc->valid_buffer_range, box->x, box->x + box->width)) && - BITSET_COUNT(rsrc->track.users) != 0) { + rsrc->track.nr_users > 0) { /* When a resource to be modified is already being used by a * pending batch, it is often faster to copy the whole BO than @@ -914,7 +945,7 @@ panfrost_ptr_map(struct pipe_context *pctx, * not ready yet (still accessed by one of the already flushed * batches), we try to allocate a new one to avoid waiting. */ - if (BITSET_COUNT(rsrc->track.users) || + if (rsrc->track.nr_users > 0 || !panfrost_bo_wait(bo, 0, true)) { /* We want the BO to be MMAPed. 
*/ uint32_t flags = bo->flags & ~PAN_BO_DELAY_MMAP; @@ -1064,6 +1095,29 @@ pan_resource_modifier_convert(struct panfrost_context *ctx, pipe_resource_reference(&tmp_prsrc, NULL); } +/* Validate that an AFBC resource may be used as a particular format. If it may + * not, decompress it on the fly. Failure to do so can produce wrong results or + * invalid data faults when sampling or rendering to AFBC */ + +void +pan_legalize_afbc_format(struct panfrost_context *ctx, + struct panfrost_resource *rsrc, + enum pipe_format format) +{ + struct panfrost_device *dev = pan_device(ctx->base.screen); + + if (!drm_is_afbc(rsrc->image.layout.modifier)) + return; + + if (panfrost_afbc_format(dev, pan_blit_format(rsrc->base.format)) == + panfrost_afbc_format(dev, pan_blit_format(format))) + return; + + pan_resource_modifier_convert(ctx, rsrc, + DRM_FORMAT_MOD_ARM_16X16_BLOCK_U_INTERLEAVED, + "Reinterpreting AFBC surface as incompatible format"); +} + static bool panfrost_should_linear_convert(struct panfrost_device *dev, struct panfrost_resource *prsrc, @@ -1275,44 +1329,6 @@ panfrost_generate_mipmap( return blit_res; } -/* Computes the address to a texture at a particular slice */ - -mali_ptr -panfrost_get_texture_address(struct panfrost_resource *rsrc, - unsigned level, unsigned layer, - unsigned sample) -{ - bool is_3d = rsrc->base.target == PIPE_TEXTURE_3D; - unsigned array_idx = is_3d ? 0 : layer; - unsigned surface_idx = is_3d ? layer : sample; - return rsrc->image.data.bo->ptr.gpu + - panfrost_texture_offset(&rsrc->image.layout, level, - array_idx, surface_idx); -} - -void -panfrost_get_afbc_pointers(struct panfrost_resource *rsrc, - unsigned level, unsigned layer, - mali_ptr *header, mali_ptr *body) -{ - assert(drm_is_afbc(rsrc->image.layout.modifier)); - - struct pan_image_slice_layout *slice = &rsrc->image.layout.slices[level]; - - if (rsrc->base.target == PIPE_TEXTURE_3D) { - *header = rsrc->image.data.bo->ptr.gpu + slice->offset + - (layer * slice->afbc.surface_stride); - *body = rsrc->image.data.bo->ptr.gpu + slice->offset + - slice->afbc.header_size + - (slice->surface_stride * layer); - } else { - *header = rsrc->image.data.bo->ptr.gpu + - panfrost_texture_offset(&rsrc->image.layout, - level, layer, 0); - *body = *header + slice->afbc.header_size; - } -} - static void panfrost_resource_set_stencil(struct pipe_resource *prsrc, struct pipe_resource *stencil) diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.h b/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.h index 7ee2ef0e24..1d7bb81781 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.h +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_resource.h @@ -48,8 +48,14 @@ struct panfrost_resource { } damage; struct { - struct panfrost_batch *writer; - BITSET_DECLARE(users, PAN_MAX_BATCHES); + /** Number of batches accessing this resource. Used to check if + * a resource is in use. */ + _Atomic unsigned nr_users; + + /** Number of batches writing this resource. Note that only one + * batch per context may write a resource, so this is the + * number of contexts that have an active writer. 
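Several contexts can bump these counters at once, hence the _Atomic qualifier. A minimal sketch of the bookkeeping with C11 atomics (toy struct; in the driver the checks live in panfrost_ptr_map() and the updates in panfrost_batch_update_access()):

#include <stdatomic.h>
#include <stdbool.h>

struct toy_track {
        _Atomic unsigned nr_users;
        _Atomic unsigned nr_writers;
};

static void
toy_track_add_user(struct toy_track *t)
{
        atomic_fetch_add_explicit(&t->nr_users, 1, memory_order_relaxed);
}

static void
toy_track_remove_user(struct toy_track *t)
{
        atomic_fetch_sub_explicit(&t->nr_users, 1, memory_order_relaxed);
}

static bool
toy_track_is_idle(struct toy_track *t)
{
        /* Stands in for the old BITSET_COUNT(track.users) == 0 test */
        return atomic_load_explicit(&t->nr_users, memory_order_relaxed) == 0;
}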
*/ + _Atomic unsigned nr_writers; } track; struct renderonly_scanout *scanout; @@ -101,16 +107,6 @@ pan_transfer(struct pipe_transfer *p) return (struct panfrost_transfer *)p; } -mali_ptr -panfrost_get_texture_address(struct panfrost_resource *rsrc, - unsigned level, unsigned layer, - unsigned sample); - -void -panfrost_get_afbc_pointers(struct panfrost_resource *rsrc, - unsigned level, unsigned layer, - mali_ptr *header, mali_ptr *body); - void panfrost_resource_screen_init(struct pipe_screen *screen); void panfrost_resource_screen_destroy(struct pipe_screen *screen); @@ -160,4 +156,9 @@ pan_resource_modifier_convert(struct panfrost_context *ctx, struct panfrost_resource *rsrc, uint64_t modifier, const char *reason); +void +pan_legalize_afbc_format(struct panfrost_context *ctx, + struct panfrost_resource *rsrc, + enum pipe_format format); + #endif /* PAN_RESOURCE_H */ diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.c b/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.c index 5067a64dba..abc2143275 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.c +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.c @@ -49,8 +49,6 @@ #include "pan_resource.h" #include "pan_public.h" #include "pan_util.h" -#include "pan_indirect_dispatch.h" -#include "pan_indirect_draw.h" #include "decode.h" #include "pan_context.h" @@ -68,7 +66,7 @@ static const struct debug_named_value panfrost_debug_options[] = { {"noafbc", PAN_DBG_NO_AFBC, "Disable AFBC support"}, {"nocrc", PAN_DBG_NO_CRC, "Disable transaction elimination"}, {"msaa16", PAN_DBG_MSAA16, "Enable MSAA 8x and 16x support"}, - {"noindirect", PAN_DBG_NOINDIRECT, "Emulate indirect draws on the CPU"}, + {"indirect", PAN_DBG_INDIRECT, "Use experimental compute kernel for indirect draws"}, {"linear", PAN_DBG_LINEAR, "Force linear textures"}, {"nocache", PAN_DBG_NO_CACHE, "Disable BO cache"}, DEBUG_NAMED_VALUE_END @@ -219,19 +217,19 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - return 4096; + return 1 << (MAX_MIP_LEVELS - 1); + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - return 13; case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return 13; + return MAX_MIP_LEVELS; case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: - /* Hardware is natively upper left */ + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + /* Hardware is upper left. Pixel center at (0.5, 0.5) */ return 0; case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: case PIPE_CAP_TGSI_TEXCOORD: return 1; @@ -281,7 +279,9 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) return 4; case PIPE_CAP_MAX_VARYINGS: - return PIPE_MAX_ATTRIBS; + /* Return the GLSL maximum. The internal maximum + * PAN_MAX_VARYINGS accommodates internal varyings. */ + return MAX_VARYING; /* Removed in v6 (Bifrost) */ case PIPE_CAP_ALPHA_TEST: @@ -309,6 +309,19 @@ panfrost_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_DRAW_PARAMETERS: return pan_is_bifrost(dev); + case PIPE_CAP_SUPPORTED_PRIM_MODES: + case PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART: { + /* Mali supports GLES and QUADS. 
Midgard supports more */ + uint32_t modes = BITFIELD_MASK(PIPE_PRIM_QUADS + 1); + + if (dev->arch <= 5) { + modes |= BITFIELD_BIT(PIPE_PRIM_QUAD_STRIP); + modes |= BITFIELD_BIT(PIPE_PRIM_POLYGON); + } + + return modes; + } + default: return u_pipe_screen_get_param_defaults(screen, param); } @@ -561,12 +574,9 @@ panfrost_walk_dmabuf_modifiers(struct pipe_screen *screen, { /* Query AFBC status */ struct panfrost_device *dev = pan_device(screen); - bool afbc = panfrost_format_supports_afbc(dev, format); + bool afbc = dev->has_afbc && panfrost_format_supports_afbc(dev, format); bool ytr = panfrost_afbc_can_ytr(format); - /* Don't advertise AFBC before T760 */ - afbc &= !(dev->quirks & MIDGARD_NO_AFBC); - unsigned count = 0; for (unsigned i = 0; i < PAN_MODIFIER_COUNT; ++i) { @@ -696,14 +706,13 @@ panfrost_destroy_screen(struct pipe_screen *pscreen) struct panfrost_screen *screen = pan_screen(pscreen); panfrost_resource_screen_destroy(pscreen); - pan_indirect_dispatch_cleanup(dev); - panfrost_cleanup_indirect_draw_shaders(dev); panfrost_pool_cleanup(&screen->indirect_draw.bin_pool); panfrost_pool_cleanup(&screen->blitter.bin_pool); panfrost_pool_cleanup(&screen->blitter.desc_pool); pan_blend_shaders_cleanup(dev); - screen->vtbl.screen_destroy(pscreen); + if (screen->vtbl.screen_destroy) + screen->vtbl.screen_destroy(pscreen); if (dev->ro) dev->ro->destroy(dev->ro); @@ -811,7 +820,7 @@ panfrost_screen_get_compiler_options(struct pipe_screen *pscreen, enum pipe_shader_ir ir, enum pipe_shader_type shader) { - return pan_shader_get_compiler_options(pan_device(pscreen)); + return pan_screen(pscreen)->vtbl.get_compiler_options(); } struct pipe_screen * @@ -830,19 +839,7 @@ panfrost_create_screen(int fd, struct renderonly *ro) panfrost_open_device(screen, fd, dev); if (dev->debug & PAN_DBG_NO_AFBC) - dev->quirks |= MIDGARD_NO_AFBC; - - /* XXX: AFBC is currently broken on Bifrost in a few different ways - * - * - Preload is broken if the effective tile size is not 16x16 - * - Some systems lack AFBC but we need kernel changes to know that - */ - if (dev->arch == 7) - dev->quirks |= MIDGARD_NO_AFBC; - - /* XXX: Indirect draws on Midgard need debugging, emulate for now */ - if (dev->arch < 6) - dev->debug |= PAN_DBG_NOINDIRECT; + dev->has_afbc = false; dev->ro = ro; @@ -890,8 +887,6 @@ panfrost_create_screen(int fd, struct renderonly *ro) panfrost_pool_init(&screen->indirect_draw.bin_pool, NULL, dev, PAN_BO_EXECUTE, 65536, "Indirect draw shaders", false, true); - panfrost_init_indirect_draw_shaders(dev, &screen->indirect_draw.bin_pool.base); - pan_indirect_dispatch_init(dev); panfrost_pool_init(&screen->blitter.bin_pool, NULL, dev, PAN_BO_EXECUTE, 4096, "Blitter shaders", false, true); panfrost_pool_init(&screen->blitter.desc_pool, NULL, dev, 0, 65536, diff --git a/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.h b/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.h index 198f33b26d..6f2676fe9f 100644 --- a/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.h +++ b/mesa 3D driver/src/gallium/drivers/panfrost/pan_screen.h @@ -46,14 +46,14 @@ struct panfrost_context; struct panfrost_resource; struct panfrost_shader_state; struct pan_fb_info; +struct pan_blend_state; /* Virtual table of per-generation (GenXML) functions */ struct panfrost_vtable { /* Prepares the renderer state descriptor for a given compiled shader, * and if desired uploads it as well */ - void (*prepare_rsd)(struct panfrost_device *, - struct panfrost_shader_state *, + void (*prepare_rsd)(struct panfrost_shader_state *, 
struct panfrost_pool *, bool); /* Emits a thread local storage descriptor */ @@ -76,6 +76,23 @@ struct panfrost_vtable { /* Device-dependent initialization of a panfrost_batch */ void (*init_batch)(struct panfrost_batch *batch); + + /* Get blend shader */ + struct pan_blend_shader_variant * + (*get_blend_shader)(const struct panfrost_device *, + const struct pan_blend_state *, + nir_alu_type, nir_alu_type, + unsigned rt); + + /* Initialize the polygon list */ + void (*init_polygon_list)(struct panfrost_batch *); + + /* Shader compilation methods */ + const nir_shader_compiler_options *(*get_compiler_options)(void); + void (*compile_shader)(nir_shader *s, + struct panfrost_compile_inputs *inputs, + struct util_dynarray *binary, + struct pan_shader_info *info); }; struct panfrost_screen { diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.c index 89cb9b6d9b..da2a484d2a 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.c @@ -195,7 +195,7 @@ static void r300_swizzle_split( } } -struct rc_swizzle_caps r300_swizzle_caps = { +const struct rc_swizzle_caps r300_swizzle_caps = { .IsNative = r300_swizzle_is_native, .Split = r300_swizzle_split }; diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.h b/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.h index f2635be140..b125944181 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.h +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/r300_fragprog_swizzle.h @@ -30,7 +30,7 @@ #include "radeon_swizzle.h" -extern struct rc_swizzle_caps r300_swizzle_caps; +extern const struct rc_swizzle_caps r300_swizzle_caps; unsigned int r300FPTranslateRGBSwizzle(unsigned int src, unsigned int swizzle); unsigned int r300FPTranslateAlphaSwizzle(unsigned int src, unsigned int swizzle); diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog.c index bd4b16117e..88b8dfcd9f 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog.c @@ -369,7 +369,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; struct rc_instruction *rci; - unsigned loops[R500_PVS_MAX_LOOP_DEPTH]; + unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {}; unsigned loop_depth = 0; compiler->code->pos_end = 0; /* Not supported yet */ @@ -755,8 +755,8 @@ static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user) int i; for(i = 0; i < 32; ++i) { - if ((compiler->RequiredOutputs & (1 << i)) && - !(compiler->Base.Program.OutputsWritten & (1 << i))) { + if ((compiler->RequiredOutputs & (1U << i)) && + !(compiler->Base.Program.OutputsWritten & (1U << i))) { struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev); inst->U.I.Opcode = RC_OPCODE_MOV; @@ -768,7 +768,7 @@ static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user) inst->U.I.SrcReg[0].Index = 0; inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; - compiler->Base.Program.OutputsWritten |= 1 << i; + compiler->Base.Program.OutputsWritten |= 1U << i; } } } @@ -780,7 +780,7 @@ 
static void dataflow_outputs_mark_used(void * userdata, void * data, int i; for(i = 0; i < 32; ++i) { - if (c->RequiredOutputs & (1 << i)) + if (c->RequiredOutputs & (1U << i)) callback(data, i, RC_MASK_XYZW); } } @@ -864,7 +864,7 @@ static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, voi transform_negative_addressing(c, lastARL, inst, min_offset); } -struct rc_swizzle_caps r300_vertprog_swizzle_caps = { +const struct rc_swizzle_caps r300_vertprog_swizzle_caps = { .IsNative = &swizzle_is_native, .Split = 0 /* should never be called */ }; diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c index 3cffe4458e..dde74795b1 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c @@ -26,7 +26,7 @@ #include -static char* r300_vs_ve_ops[] = { +static const char* r300_vs_ve_ops[] = { /* R300 vector ops */ " VE_NO_OP", " VE_DOT_PRODUCT", @@ -63,7 +63,7 @@ static char* r300_vs_ve_ops[] = { " (reserved)", }; -static char* r300_vs_me_ops[] = { +static const char* r300_vs_me_ops[] = { /* R300 math ops */ " ME_NO_OP", " ME_EXP_BASE2_DX", @@ -101,14 +101,14 @@ static char* r300_vs_me_ops[] = { }; /* XXX refactor to avoid clashing symbols */ -static char* r300_vs_src_debug[] = { +static const char* r300_vs_src_debug[] = { "t", "i", "c", "a", }; -static char* r300_vs_dst_debug[] = { +static const char* r300_vs_dst_debug[] = { "t", "a0", "o", @@ -119,7 +119,7 @@ static char* r300_vs_dst_debug[] = { "u", }; -static char* r300_vs_swiz_debug[] = { +static const char* r300_vs_swiz_debug[] = { "X", "Y", "Z", diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.c index 8198ff2a36..6be615f33d 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.c @@ -266,7 +266,7 @@ static void r500_swizzle_split(struct rc_src_register src, unsigned int usemask, } } -struct rc_swizzle_caps r500_swizzle_caps = { +const struct rc_swizzle_caps r500_swizzle_caps = { .IsNative = r500_swizzle_is_native, .Split = r500_swizzle_split }; diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.h b/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.h index 6aa448cc6f..1c30dc0e85 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.h +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/r500_fragprog.h @@ -40,7 +40,7 @@ extern void r500BuildFragmentProgramHwCode(struct radeon_compiler *c, void *user extern void r500FragmentProgramDump(struct radeon_compiler *c, void *user); -extern struct rc_swizzle_caps r500_swizzle_caps; +extern const struct rc_swizzle_caps r500_swizzle_caps; extern int r500_transform_IF( struct radeon_compiler * c, diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.c index 081cd2d0d5..78902d9806 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.c @@ -123,12 +123,12 @@ void rc_calculate_inputs_outputs(struct radeon_compiler * c) for (i = 0; i < opcode->NumSrcRegs; ++i) { if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT) - c->Program.InputsRead |= 1 << 
inst->U.I.SrcReg[i].Index; + c->Program.InputsRead |= 1U << inst->U.I.SrcReg[i].Index; } if (opcode->HasDstReg) { if (inst->U.I.DstReg.File == RC_FILE_OUTPUT) - c->Program.OutputsWritten |= 1 << inst->U.I.DstReg.Index; + c->Program.OutputsWritten |= 1U << inst->U.I.DstReg.Index; } } } @@ -141,7 +141,7 @@ void rc_move_input(struct radeon_compiler * c, unsigned input, struct rc_src_reg { struct rc_instruction * inst; - c->Program.InputsRead &= ~(1 << input); + c->Program.InputsRead &= ~(1U << input); for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) { const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); @@ -157,7 +157,7 @@ void rc_move_input(struct radeon_compiler * c, unsigned input, struct rc_src_reg inst->U.I.SrcReg[i].Abs = new_input.Abs; } - c->Program.InputsRead |= 1 << new_input.Index; + c->Program.InputsRead |= 1U << new_input.Index; } } } @@ -173,7 +173,7 @@ void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_ou { struct rc_instruction * inst; - c->Program.OutputsWritten &= ~(1 << output); + c->Program.OutputsWritten &= ~(1U << output); for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) { const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); @@ -183,7 +183,7 @@ void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_ou inst->U.I.DstReg.Index = new_output; inst->U.I.DstReg.WriteMask &= writemask; - c->Program.OutputsWritten |= 1 << new_output; + c->Program.OutputsWritten |= 1U << new_output; } } } @@ -227,7 +227,7 @@ void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_ou inst->U.I.SrcReg[0].Index = tempreg; inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; - c->Program.OutputsWritten |= 1 << dup_output; + c->Program.OutputsWritten |= 1U << dup_output; } @@ -243,8 +243,8 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig struct rc_instruction * inst_mad; struct rc_instruction * inst; - c->Program.InputsRead &= ~(1 << wpos); - c->Program.InputsRead |= 1 << new_input; + c->Program.InputsRead &= ~(1U << wpos); + c->Program.InputsRead |= 1U << new_input; /* perspective divide */ inst_rcp = rc_insert_new_instruction(c, &c->Program.Instructions); diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.h b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.h index 9dc6855be3..7089bcbea2 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.h +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_compiler.h @@ -68,7 +68,7 @@ struct radeon_compiler { * of the compiler */ /*@{*/ - struct rc_swizzle_caps * SwizzleCaps; + const struct rc_swizzle_caps * SwizzleCaps; /*@}*/ struct emulate_loop_state loop_state; diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.c index 10033e0287..3e08a09499 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.c @@ -32,7 +32,7 @@ #include "util/compiler.h" -struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { +const struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = { { .Opcode = RC_OPCODE_NOP, .Name = "NOP" diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.h b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.h index 1c42505072..3f695ff480 
100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.h +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_opcodes.h @@ -268,7 +268,7 @@ struct rc_opcode_info { unsigned int IsStandardScalar:1; }; -extern struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE]; +extern const struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE]; static inline const struct rc_opcode_info * rc_get_opcode_info(rc_opcode opcode) { diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_alu.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_alu.c index c8aabc255d..9d96307f11 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_alu.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_alu.c @@ -858,7 +858,7 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c, * SLT tmp1, x, 0; * ADD result, tmp0, -tmp1; */ - struct rc_dst_register dst0 = try_to_reuse_dst(c, inst); + struct rc_dst_register dst0; unsigned tmp1; /* 0 < x */ diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_tex.c b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_tex.c index 17d6ee9aeb..a7b3ad7480 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_tex.c +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_program_tex.c @@ -140,7 +140,7 @@ int radeonTransformTEX( /* ARB_shadow & EXT_shadow_funcs */ if (inst->U.I.Opcode != RC_OPCODE_KIL && - ((c->Program.ShadowSamplers & (1 << inst->U.I.TexSrcUnit)) || + ((c->Program.ShadowSamplers & (1U << inst->U.I.TexSrcUnit)) || (compiler->state.unit[inst->U.I.TexSrcUnit].compare_mode_enabled))) { rc_compare_func comparefunc = compiler->state.unit[inst->U.I.TexSrcUnit].texture_compare_func; diff --git a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_swizzle.h b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_swizzle.h index 9a048e4eac..f7a9baae1b 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_swizzle.h +++ b/mesa 3D driver/src/gallium/drivers/r300/compiler/radeon_swizzle.h @@ -54,6 +54,6 @@ struct rc_swizzle_caps { void (*Split)(struct rc_src_register reg, unsigned int mask, struct rc_swizzle_split * split); }; -extern struct rc_swizzle_caps r300_vertprog_swizzle_caps; +extern const struct rc_swizzle_caps r300_vertprog_swizzle_caps; #endif /* RADEON_SWIZZLE_H */ diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_blit.c b/mesa 3D driver/src/gallium/drivers/r300/r300_blit.c index 738bc53ba9..2609ee3053 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_blit.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_blit.c @@ -384,7 +384,7 @@ static void r300_clear(struct pipe_context* pipe, r300_get_num_cs_end_dwords(r300); /* Reserve CS space. 
*/ - if (!r300->rws->cs_check_space(&r300->cs, dwords, false)) { + if (!r300->rws->cs_check_space(&r300->cs, dwords)) { r300_flush(&r300->context, PIPE_FLUSH_ASYNC, NULL); } @@ -676,7 +676,7 @@ static void r300_resource_copy_region(struct pipe_context *pipe, util_blitter_blit_generic(r300->blitter, dst_view, &dstbox, src_view, src_box, src_width0, src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, - FALSE); + FALSE, FALSE); r300_blitter_end(r300); pipe_surface_reference(&dst_view, NULL); diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_context.c b/mesa 3D driver/src/gallium/drivers/r300/r300_context.c index db5ad901f7..b5a7d10a07 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_context.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_context.c @@ -441,8 +441,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, * dummy texture there. */ if (!r300->screen->caps.is_r500) { struct pipe_resource *tex; - struct pipe_resource rtempl = {{0}}; - struct pipe_sampler_view vtempl = {{0}}; + struct pipe_resource rtempl = {0}; + struct pipe_sampler_view vtempl = {0}; rtempl.target = PIPE_TEXTURE_2D; rtempl.format = PIPE_FORMAT_I8_UNORM; diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_emit.c b/mesa 3D driver/src/gallium/drivers/r300/r300_emit.c index 374377b5bd..c91f9851a2 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_emit.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_emit.c @@ -1353,7 +1353,7 @@ boolean r300_emit_buffer_validate(struct r300_context *r300, if (r300->textures_state.dirty) { /* ...textures... */ for (i = 0; i < texstate->count; i++) { - if (!(texstate->tx_enable & (1 << i))) { + if (!(texstate->tx_enable & (1U << i))) { continue; } diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_reg.h b/mesa 3D driver/src/gallium/drivers/r300/r300_reg.h index 05d85cf474..56be6d690d 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_reg.h +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_reg.h @@ -305,7 +305,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. #define R300_VAP_PSC_SGN_NORM_CNTL 0x21dc # define SGN_NORM_ZERO 0 # define SGN_NORM_ZERO_CLAMP_MINUS_ONE 1 -# define SGN_NORM_NO_ZERO 2 +# define SGN_NORM_NO_ZERO 2U # define R300_SGN_NORM_NO_ZERO (SGN_NORM_NO_ZERO | \ (SGN_NORM_NO_ZERO << 2) | (SGN_NORM_NO_ZERO << 4) | \ (SGN_NORM_NO_ZERO << 6) | (SGN_NORM_NO_ZERO << 8) | \ @@ -822,8 +822,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R500_RS_COL_PTR(x) ((x) << 24) # define R500_RS_COL_FMT(x) ((x) << 27) /* gap */ -#define R500_RS_IP_OFFSET_DIS (0 << 31) -#define R500_RS_IP_OFFSET_EN (1 << 31) +#define R500_RS_IP_OFFSET_DIS (0U << 31) +#define R500_RS_IP_OFFSET_EN (1U << 31) /* gap */ @@ -2078,7 +2078,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_ALU_OUTC_MOD_DIV8 (6 << R300_ALU_OUTC_MOD_SHIFT) # define R300_ALU_OUTC_CLAMP (1 << 30) -# define R300_ALU_INSERT_NOP (1 << 31) +# define R300_ALU_INSERT_NOP (1U << 31) #define R300_US_ALU_ALPHA_INST_0 0x49C0 # define R300_ALU_ARGA_SRC0C_X 0 @@ -2311,7 +2311,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1 (5 << 3) # define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1 (6 << 3) # define R500_SRC_ALPHA_0_NO_READ (1 << 30) -# define R500_SRC_ALPHA_1_NO_READ (1 << 31) +# define R500_SRC_ALPHA_1_NO_READ (1U << 31) /* the following are shared between CBLEND and ABLEND */ # define R300_FCN_MASK (3 << 12) @@ -2666,8 +2666,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. 
# define R500_CONTIGUOUS_6XAA_SAMPLES_DISABLE (1 << 17) # define R500_PEQ_PACKING_DISABLE (0 << 18) # define R500_PEQ_PACKING_ENABLE (1 << 18) -# define R500_COVERED_PTR_MASKING_DISABLE (0 << 18) -# define R500_COVERED_PTR_MASKING_ENABLE (1 << 18) +# define R500_COVERED_PTR_MASKING_DISABLE (0 << 19) +# define R500_COVERED_PTR_MASKING_ENABLE (1 << 19) /* gap */ @@ -3310,7 +3310,7 @@ enum { # define R500_FC_KBOOL(x) (x) #define R500_US_FC_CTRL 0x4624 # define R500_FC_TEST_EN (1 << 30) -# define R500_FC_FULL_FC_EN (1 << 31) +# define R500_FC_FULL_FC_EN (1U << 31) #define R500_US_FC_INST_0 0x9800 # define R500_FC_OP_JUMP (0 << 0) # define R500_FC_OP_LOOP (1 << 0) @@ -3489,7 +3489,7 @@ enum { #define R300_PACKET3_INDX_BUFFER 0x00003300 # define R300_INDX_BUFFER_DST_SHIFT 0 # define R300_INDX_BUFFER_SKIP_SHIFT 16 -# define R300_INDX_BUFFER_ONE_REG_WR (1<<31) +# define R300_INDX_BUFFER_ONE_REG_WR (1U << 31) /* Same as R300_PACKET3_3D_DRAW_VBUF but without VAP_VTX_FMT */ #define R300_PACKET3_3D_DRAW_VBUF_2 0x00003400 diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_render.c b/mesa 3D driver/src/gallium/drivers/r300/r300_render.c index 1c09e9ef64..b39fca9bbd 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_render.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_render.c @@ -216,7 +216,7 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300, cs_dwords += r300_get_num_cs_end_dwords(r300); /* Reserve requested CS space. */ - if (!r300->rws->cs_check_space(&r300->cs, cs_dwords, false)) { + if (!r300->rws->cs_check_space(&r300->cs, cs_dwords)) { r300_flush(&r300->context, PIPE_FLUSH_ASYNC, NULL); flushed = TRUE; } @@ -886,7 +886,7 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe, r300_update_derived_state(r300); - draw_vbo(r300->draw, info, drawid_offset, NULL, &draw, 1); + draw_vbo(r300->draw, info, drawid_offset, NULL, &draw, 1, 0); draw_flush(r300->draw); } diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_state.c b/mesa 3D driver/src/gallium/drivers/r300/r300_state.c index 3501c0b861..07299bb0b7 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_state.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_state.c @@ -1519,6 +1519,7 @@ static void r300_set_sampler_views(struct pipe_context* pipe, enum pipe_shader_type shader, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view** views) { struct r300_context* r300 = r300_context(pipe); @@ -1529,13 +1530,16 @@ static void r300_set_sampler_views(struct pipe_context* pipe, unsigned tex_units = r300->screen->caps.num_tex_units; boolean dirty_tex = FALSE; - if (shader != PIPE_SHADER_FRAGMENT) - return; - assert(start == 0); /* non-zero not handled yet */ - if (count > tex_units) { - return; + if (shader != PIPE_SHADER_FRAGMENT || count > tex_units) { + if (take_ownership) { + for (unsigned i = 0; i < count; i++) { + struct pipe_sampler_view *view = views[i]; + pipe_sampler_view_reference(&view, NULL); + } + } + return; } /* Calculate the real number of views. 
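The take_ownership flag added to set_sampler_views changes the reference-counting contract: when it is true, the caller's reference is consumed by the call, even for views the driver rejects, which is why the early-return path above unreferences them. A hedged caller-side sketch (variable names are illustrative):

/* Sketch: bind one fragment sampler view and hand over the reference. */
struct pipe_sampler_view *view =
   pipe->create_sampler_view(pipe, texture, &view_templ);

pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT,
                        0 /* start */, 1 /* count */,
                        0 /* unbind_num_trailing_slots */,
                        true /* take_ownership */, &view);

/* No pipe_sampler_view_reference(&view, NULL) here: the context now
 * owns the reference, including on the rejection path. */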
*/ @@ -1545,9 +1549,15 @@ static void r300_set_sampler_views(struct pipe_context* pipe, } for (i = 0; i < count; i++) { - pipe_sampler_view_reference( - (struct pipe_sampler_view**)&state->sampler_views[i], - views[i]); + if (take_ownership) { + pipe_sampler_view_reference( + (struct pipe_sampler_view**)&state->sampler_views[i], NULL); + state->sampler_views[i] = (struct r300_sampler_view*)views[i]; + } else { + pipe_sampler_view_reference( + (struct pipe_sampler_view**)&state->sampler_views[i], + views[i]); + } if (!views[i]) { continue; @@ -1811,7 +1821,7 @@ static void r300_vertex_psc(struct r300_vertex_element_state *velems) if (i & 1) { vstream->vap_prog_stream_cntl[i >> 1] |= type << 16; - vstream->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 16; + vstream->vap_prog_stream_cntl_ext[i >> 1] |= (uint32_t)swizzle << 16; } else { vstream->vap_prog_stream_cntl[i >> 1] |= type; vstream->vap_prog_stream_cntl_ext[i >> 1] |= swizzle; diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_state_derived.c b/mesa 3D driver/src/gallium/drivers/r300/r300_state_derived.c index c31fce66e8..0e1ab4c940 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_state_derived.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_state_derived.c @@ -102,7 +102,7 @@ static void r300_draw_emit_all_attribs(struct r300_context* r300) gen_count = 0; for (i = 0; i < ATTR_GENERIC_COUNT && gen_count < 8; i++) { if (vs_outputs->generic[i] != ATTR_UNUSED && - !(r300->sprite_coord_enable & (1 << i))) { + !(r300->sprite_coord_enable & (1U << i))) { r300_draw_emit_attrib(r300, EMIT_4F, vs_outputs->generic[i]); gen_count++; } @@ -168,7 +168,7 @@ static void r300_swtcl_vertex_psc(struct r300_context *r300) /* Add the attribute to the PSC table. */ if (i & 1) { vstream->vap_prog_stream_cntl[i >> 1] |= type << 16; - vstream->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 16; + vstream->vap_prog_stream_cntl_ext[i >> 1] |= (uint32_t)swizzle << 16; } else { vstream->vap_prog_stream_cntl[i >> 1] |= type; vstream->vap_prog_stream_cntl_ext[i >> 1] |= swizzle; @@ -441,7 +441,7 @@ static void r300_update_rs_block(struct r300_context *r300) for (i = 0; i < ATTR_GENERIC_COUNT && col_count < 2; i++) { /* Cannot use color varyings for sprite coords. 
*/ if (fs_inputs->generic[i] != ATTR_UNUSED && - (r300->sprite_coord_enable & (1 << i))) { + (r300->sprite_coord_enable & (1U << i))) { break; } @@ -807,7 +807,7 @@ static void r300_merge_textures_and_samplers(struct r300_context* r300) for (i = 0; i < count; i++) { if (state->sampler_views[i] && state->sampler_states[i]) { - state->tx_enable |= 1 << i; + state->tx_enable |= 1U << i; view = state->sampler_views[i]; tex = r300_resource(view->base.texture); @@ -973,7 +973,7 @@ static void r300_merge_textures_and_samplers(struct r300_context* r300) (struct pipe_sampler_view**)&state->sampler_views[i], &r300->texkill_sampler->base); - state->tx_enable |= 1 << i; + state->tx_enable |= 1U << i; texstate = &state->regs[i]; diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_texture.c b/mesa 3D driver/src/gallium/drivers/r300/r300_texture.c index 843443bfb0..cff8774eed 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_texture.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_texture.c @@ -1167,7 +1167,7 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen, return NULL; } - buffer = rws->buffer_from_handle(rws, whandle, 0); + buffer = rws->buffer_from_handle(rws, whandle, 0, false); if (!buffer) return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/mesa 3D driver/src/gallium/drivers/r300/r300_tgsi_to_rc.c index 7217dd09a3..fbc827afcf 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_tgsi_to_rc.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_tgsi_to_rc.c @@ -225,17 +225,38 @@ static void transform_texture(struct rc_instruction * dst, struct tgsi_instructi case TGSI_TEXTURE_SHADOW1D: dst->U.I.TexSrcTarget = RC_TEXTURE_1D; dst->U.I.TexShadow = 1; - *shadowSamplers |= 1 << dst->U.I.TexSrcUnit; + *shadowSamplers |= 1U << dst->U.I.TexSrcUnit; break; case TGSI_TEXTURE_SHADOW2D: dst->U.I.TexSrcTarget = RC_TEXTURE_2D; dst->U.I.TexShadow = 1; - *shadowSamplers |= 1 << dst->U.I.TexSrcUnit; + *shadowSamplers |= 1U << dst->U.I.TexSrcUnit; break; case TGSI_TEXTURE_SHADOWRECT: dst->U.I.TexSrcTarget = RC_TEXTURE_RECT; dst->U.I.TexShadow = 1; - *shadowSamplers |= 1 << dst->U.I.TexSrcUnit; + *shadowSamplers |= 1U << dst->U.I.TexSrcUnit; + break; + case TGSI_TEXTURE_1D_ARRAY: + dst->U.I.TexSrcTarget = RC_TEXTURE_1D_ARRAY; + break; + case TGSI_TEXTURE_2D_ARRAY: + dst->U.I.TexSrcTarget = RC_TEXTURE_2D_ARRAY; + break; + case TGSI_TEXTURE_SHADOW1D_ARRAY: + dst->U.I.TexSrcTarget = RC_TEXTURE_1D_ARRAY; + dst->U.I.TexShadow = 1; + *shadowSamplers |= 1U << dst->U.I.TexSrcUnit; + break; + case TGSI_TEXTURE_SHADOW2D_ARRAY: + dst->U.I.TexSrcTarget = RC_TEXTURE_2D_ARRAY; + dst->U.I.TexShadow = 1; + *shadowSamplers |= 1U << dst->U.I.TexSrcUnit; + break; + case TGSI_TEXTURE_SHADOWCUBE: + dst->U.I.TexSrcTarget = RC_TEXTURE_CUBE; + dst->U.I.TexShadow = 1; + *shadowSamplers |= 1U << dst->U.I.TexSrcUnit; break; } dst->U.I.TexSwizzle = RC_SWIZZLE_XYZW; diff --git a/mesa 3D driver/src/gallium/drivers/r300/r300_vs.c b/mesa 3D driver/src/gallium/drivers/r300/r300_vs.c index 62152714d2..96d6c4386a 100644 --- a/mesa 3D driver/src/gallium/drivers/r300/r300_vs.c +++ b/mesa 3D driver/src/gallium/drivers/r300/r300_vs.c @@ -249,7 +249,7 @@ void r300_translate_vertex_shader(struct r300_context *r300, compiler.Base.remove_unused_constants = TRUE; } - compiler.RequiredOutputs = ~(~0 << (vs->info.num_outputs + 1)); + compiler.RequiredOutputs = ~(~0U << (vs->info.num_outputs + 1)); compiler.SetHwInputOutput = &set_vertex_inputs_outputs; /* Insert the WPOS output. 
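The RequiredOutputs change swaps ~0 for ~0U for the same signed-shift reason; the expression builds a mask of the low num_outputs + 1 bits. A worked instance (valid while num_outputs + 1 < 32, since a shift by 32 would itself be undefined):

/* Example: num_outputs == 2
 *   ~0U << 3     == 0xFFFFFFF8
 *   ~(~0U << 3)  == 0x00000007   -> bits 0..2 set */
static uint32_t required_outputs_mask(unsigned num_outputs)
{
   return ~(~0U << (num_outputs + 1));
}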
*/ diff --git a/mesa 3D driver/src/gallium/drivers/r600/evergreen_state.c b/mesa 3D driver/src/gallium/drivers/r600/evergreen_state.c index 83a521102a..9d9d40ae62 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/evergreen_state.c +++ b/mesa 3D driver/src/gallium/drivers/r600/evergreen_state.c @@ -3997,6 +3997,13 @@ static void evergreen_set_tess_state(struct pipe_context *ctx, rctx->driver_consts[PIPE_SHADER_TESS_CTRL].tcs_default_levels_dirty = true; } +static void evergreen_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + rctx->patch_vertices = patch_vertices; +} + static void evergreen_setup_immed_buffer(struct r600_context *rctx, struct r600_image_view *rview, enum pipe_format pformat) @@ -4489,6 +4496,7 @@ void evergreen_init_state_functions(struct r600_context *rctx) rctx->b.b.set_polygon_stipple = evergreen_set_polygon_stipple; rctx->b.b.set_min_samples = evergreen_set_min_samples; rctx->b.b.set_tess_state = evergreen_set_tess_state; + rctx->b.b.set_patch_vertices = evergreen_set_patch_vertices; rctx->b.b.set_hw_atomic_buffers = evergreen_set_hw_atomic_buffers; rctx->b.b.set_shader_images = evergreen_set_shader_images; rctx->b.b.set_shader_buffers = evergreen_set_shader_buffers; @@ -4524,7 +4532,7 @@ void evergreen_setup_tess_constants(struct r600_context *rctx, const struct pipe struct pipe_constant_buffer constbuf = {0}; struct r600_pipe_shader_selector *tcs = rctx->tcs_shader ? rctx->tcs_shader : rctx->tes_shader; struct r600_pipe_shader_selector *ls = rctx->vs_shader; - unsigned num_tcs_input_cp = info->vertices_per_patch; + unsigned num_tcs_input_cp = rctx->patch_vertices; unsigned num_tcs_outputs; unsigned num_tcs_output_cp; unsigned num_tcs_patch_outputs; @@ -4624,10 +4632,10 @@ uint32_t evergreen_get_ls_hs_config(struct r600_context *rctx, num_output_cp = rctx->tcs_shader ? 
rctx->tcs_shader->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT] : - info->vertices_per_patch; + rctx->patch_vertices; return S_028B58_NUM_PATCHES(num_patches) | - S_028B58_HS_NUM_INPUT_CP(info->vertices_per_patch) | + S_028B58_HS_NUM_INPUT_CP(rctx->patch_vertices) | S_028B58_HS_NUM_OUTPUT_CP(num_output_cp); } diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_blit.c b/mesa 3D driver/src/gallium/drivers/r600/r600_blit.c index b8924f8267..aca3773023 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_blit.c +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_blit.c @@ -813,7 +813,7 @@ void r600_resource_copy_region(struct pipe_context *ctx, util_blitter_blit_generic(rctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, - FALSE); + FALSE, FALSE); r600_blitter_end(ctx); pipe_surface_reference(&dst_view, NULL); diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_buffer_common.c b/mesa 3D driver/src/gallium/drivers/r600/r600_buffer_common.c index ba645298f8..2035456e4f 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_buffer_common.c +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_buffer_common.c @@ -584,7 +584,7 @@ r600_alloc_buffer_struct(struct pipe_screen *screen, pipe_reference_init(&rbuffer->b.b.reference, 1); rbuffer->b.b.screen = screen; - threaded_resource_init(&rbuffer->b.b); + threaded_resource_init(&rbuffer->b.b, false, 0); rbuffer->buf = NULL; rbuffer->bind_history = 0; diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_dump.c b/mesa 3D driver/src/gallium/drivers/r600/r600_dump.c index 76b56bc7d5..3fcf13aef8 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_dump.c +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_dump.c @@ -181,7 +181,6 @@ void print_pipe_info(FILE *f, struct tgsi_shader_info *shader) PRINT_UINT_ARRAY_MEMBER(input_interpolate, i); PRINT_UINT_ARRAY_MEMBER(input_interpolate_loc, i); PRINT_UINT_ARRAY_MEMBER(input_usage_mask, i); - PRINT_UINT_ARRAY_MEMBER(input_cylindrical_wrap, i); } for (int i = 0; i < shader->num_inputs; ++i) { diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_hw_context.c b/mesa 3D driver/src/gallium/drivers/r600/r600_hw_context.c index de032c6dcf..c3aef0f4be 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_hw_context.c +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_hw_context.c @@ -84,7 +84,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += 10; /* Flush if there's not enough space. 
*/ - if (!ctx->b.ws->cs_check_space(&ctx->b.gfx.cs, num_dw, false)) { + if (!ctx->b.ws->cs_check_space(&ctx->b.gfx.cs, num_dw)) { ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL); } } diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_pipe.h b/mesa 3D driver/src/gallium/drivers/r600/r600_pipe.h index 3cb171a0d5..60e4ef3e00 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_pipe.h +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_pipe.h @@ -614,6 +614,7 @@ struct r600_context { struct r600_resource *trace_buf; unsigned trace_id; + uint8_t patch_vertices; bool cmd_buf_is_compute; struct pipe_resource *append_fence; uint32_t append_fence_id; diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_pipe_common.c b/mesa 3D driver/src/gallium/drivers/r600/r600_pipe_common.c index a6af8149dc..c7c77eacc3 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_pipe_common.c +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_pipe_common.c @@ -263,7 +263,7 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, * engine busy while uploads are being submitted. */ num_dw++; /* for emit_wait_idle below */ - if (!ctx->ws->cs_check_space(&ctx->dma.cs, num_dw, false) || + if (!ctx->ws->cs_check_space(&ctx->dma.cs, num_dw) || ctx->dma.cs.used_vram_kb + ctx->dma.cs.used_gart_kb > 64 * 1024 || !radeon_cs_memory_below_limit(ctx->screen, &ctx->dma.cs, vram, gtt)) { ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL); diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_state_common.c b/mesa 3D driver/src/gallium/drivers/r600/r600_state_common.c index 0a70d2d967..b8c446820e 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_state_common.c +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_state_common.c @@ -641,6 +641,7 @@ static void r600_set_sampler_views(struct pipe_context *pipe, enum pipe_shader_type shader, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct r600_context *rctx = (struct r600_context *) pipe; @@ -674,6 +675,10 @@ static void r600_set_sampler_views(struct pipe_context *pipe, for (i = 0; i < count; i++) { if (rviews[i] == dst->views.views[i]) { + if (take_ownership) { + struct pipe_sampler_view *view = views[i]; + pipe_sampler_view_reference(&view, NULL); + } continue; } @@ -704,7 +709,12 @@ static void r600_set_sampler_views(struct pipe_context *pipe, dirty_sampler_states_mask |= 1 << i; } - pipe_sampler_view_reference((struct pipe_sampler_view **)&dst->views.views[i], views[i]); + if (take_ownership) { + pipe_sampler_view_reference((struct pipe_sampler_view **)&dst->views.views[i], NULL); + dst->views.views[i] = (struct r600_pipe_sampler_view*)views[i]; + } else { + pipe_sampler_view_reference((struct pipe_sampler_view **)&dst->views.views[i], views[i]); + } new_mask |= 1 << i; r600_context_add_resource_size(pipe, views[i]->texture); } else { diff --git a/mesa 3D driver/src/gallium/drivers/r600/r600_texture.c b/mesa 3D driver/src/gallium/drivers/r600/r600_texture.c index 0c34a8db4e..2be8c90ebd 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/r600_texture.c +++ b/mesa 3D driver/src/gallium/drivers/r600/r600_texture.c @@ -1124,7 +1124,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen return NULL; buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, - rscreen->info.max_alignment); + rscreen->info.max_alignment, false); if (!buf) return NULL; @@ -1844,7 +1844,7 @@ r600_memobj_from_handle(struct pipe_screen *screen, return NULL; 
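r600_memobj_from_handle below picks up the same winsys change seen throughout this series: buffer_from_handle gains a trailing bool and every existing caller passes false. The updated declaration, as it appears in the radeon_winsys.h hunk further down:

struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws,
                                        struct winsys_handle *whandle,
                                        unsigned vm_alignment,
                                        bool is_prime_linear_buffer);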
buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, - rscreen->info.max_alignment); + rscreen->info.max_alignment, false); if (!buf) { free(memobj); return NULL; diff --git a/mesa 3D driver/src/gallium/drivers/r600/sfn/sfn_nir_vectorize_vs_inputs.c b/mesa 3D driver/src/gallium/drivers/r600/sfn/sfn_nir_vectorize_vs_inputs.c index 35fd5cfffa..583eb12d89 100644 --- a/mesa 3D driver/src/gallium/drivers/r600/sfn/sfn_nir_vectorize_vs_inputs.c +++ b/mesa 3D driver/src/gallium/drivers/r600/sfn/sfn_nir_vectorize_vs_inputs.c @@ -159,7 +159,7 @@ r600_create_new_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *va if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset || intr->intrinsic == nir_intrinsic_interp_deref_at_sample) - nir_src_copy(&new_intr->src[1], &intr->src[1], &new_intr->instr); + nir_src_copy(&new_intr->src[1], &intr->src[1]); nir_builder_instr_insert(b, &new_intr->instr); diff --git a/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.c b/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.c index d8e4d66bdd..ee965b7c7a 100644 --- a/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.c +++ b/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.c @@ -467,6 +467,9 @@ static rvcn_dec_message_vp9_t get_vp9_msg(struct radeon_decoder *dec, RDECODE_FRAME_HDR_INFO_VP9_USE_PREV_IN_FIND_MV_REFS_MASK; dec->show_frame = pic->picture_parameter.pic_fields.show_frame; + result.frame_header_flags |= (1 << RDECODE_FRAME_HDR_INFO_VP9_USE_UNCOMPRESSED_HEADER_SHIFT) & + RDECODE_FRAME_HDR_INFO_VP9_USE_UNCOMPRESSED_HEADER_MASK; + result.interp_filter = pic->picture_parameter.pic_fields.mcomp_filter_type; result.frame_context_idx = pic->picture_parameter.pic_fields.frame_context_idx; @@ -1336,7 +1339,7 @@ static void rvcn_dec_message_create(struct radeon_decoder *dec) static unsigned rvcn_dec_dynamic_dpb_t2_message(struct radeon_decoder *dec, rvcn_dec_message_decode_t *decode, rvcn_dec_message_dynamic_dpb_t2_t *dynamic_dpb_t2) { - struct rvcn_dec_dynamic_dpb_t2 *dpb = NULL; + struct rvcn_dec_dynamic_dpb_t2 *dpb = NULL, *dummy = NULL; unsigned width, height, size; uint64_t addr; int i; @@ -1350,7 +1353,14 @@ static unsigned rvcn_dec_dynamic_dpb_t2_message(struct radeon_decoder *dec, rvcn list_for_each_entry_safe(struct rvcn_dec_dynamic_dpb_t2, d, &dec->dpb_ref_list, list) { for (i = 0; i < dec->ref_codec.ref_size; ++i) { if ((dec->ref_codec.ref_list[i] != 0x7f) && (d->index == (dec->ref_codec.ref_list[i] & 0x7f))) { + if (!dummy) + dummy = d; + addr = dec->ws->buffer_get_virtual_address(d->dpb.res->buf); + if (!addr && dummy) { + RVID_ERR("Ref list from application is incorrect, using dummy buffer instead.\n"); + addr = dec->ws->buffer_get_virtual_address(dummy->dpb.res->buf); + } dynamic_dpb_t2->dpbAddrLo[i] = addr; dynamic_dpb_t2->dpbAddrHi[i] = addr >> 32; ++dynamic_dpb_t2->dpbArraySize; @@ -1358,8 +1368,12 @@ static unsigned rvcn_dec_dynamic_dpb_t2_message(struct radeon_decoder *dec, rvcn } } if (i == dec->ref_codec.ref_size) { - list_del(&d->list); - list_addtail(&d->list, &dec->dpb_unref_list); + if (d->dpb.res->b.b.width0 * d->dpb.res->b.b.height0 != size) { + list_del(&d->list); + list_addtail(&d->list, &dec->dpb_unref_list); + } else { + d->index = 0x7f; + } } } @@ -1371,11 +1385,9 @@ static unsigned rvcn_dec_dynamic_dpb_t2_message(struct radeon_decoder *dec, rvcn } if (!dpb) { - list_for_each_entry_safe(struct rvcn_dec_dynamic_dpb_t2, d, &dec->dpb_unref_list, list) { - if (d->dpb.res->b.b.width0 * d->dpb.res->b.b.height0 == size) { + 
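The new VP9 define pair introduced in radeon_vcn_dec.h stays internally consistent: 1 << 14 is 0x00004000, so the masked-or in get_vp9_msg sets exactly bit 14 of frame_header_flags. A condensed restatement of that assignment:

/* frame_header_flags bit 14: the uncompressed VP9 header is in use. */
result.frame_header_flags |=
   (1 << RDECODE_FRAME_HDR_INFO_VP9_USE_UNCOMPRESSED_HEADER_SHIFT) &
   RDECODE_FRAME_HDR_INFO_VP9_USE_UNCOMPRESSED_HEADER_MASK;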
list_for_each_entry_safe(struct rvcn_dec_dynamic_dpb_t2, d, &dec->dpb_ref_list, list) { + if (d->index == 0x7f) { d->index = dec->ref_codec.index; - list_del(&d->list); - list_addtail(&d->list, &dec->dpb_ref_list); dpb = d; break; } diff --git a/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.h b/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.h index 7ce902ed68..70309e5343 100644 --- a/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.h +++ b/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_dec.h @@ -224,6 +224,7 @@ #define TYPE7 7 /* VP9 Frame header flags */ +#define RDECODE_FRAME_HDR_INFO_VP9_USE_UNCOMPRESSED_HEADER_SHIFT (14) #define RDECODE_FRAME_HDR_INFO_VP9_USE_PREV_IN_FIND_MV_REFS_SHIFT (13) #define RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_UPDATE_SHIFT (12) #define RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_ENABLED_SHIFT (11) @@ -239,6 +240,8 @@ #define RDECODE_FRAME_HDR_INFO_VP9_FRAME_TYPE_SHIFT (1) #define RDECODE_FRAME_HDR_INFO_VP9_SHOW_EXISTING_FRAME_SHIFT (0) + +#define RDECODE_FRAME_HDR_INFO_VP9_USE_UNCOMPRESSED_HEADER_MASK (0x00004000) #define RDECODE_FRAME_HDR_INFO_VP9_USE_PREV_IN_FIND_MV_REFS_MASK (0x00002000) #define RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_UPDATE_MASK (0x00001000) #define RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_ENABLED_MASK (0x00000800) diff --git a/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_enc.c b/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_enc.c index f02def3a82..ab13dad3c6 100644 --- a/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_enc.c +++ b/mesa 3D driver/src/gallium/drivers/radeon/radeon_vcn_enc.c @@ -61,7 +61,7 @@ static void radeon_vcn_enc_get_param(struct radeon_encoder *enc, struct pipe_pic enc->enc_pic.crop_top = 0; enc->enc_pic.crop_bottom = (align(enc->base.height, 16) - enc->base.height) / 2; } - enc->enc_pic.num_temporal_layers = pic->num_temporal_layers; + enc->enc_pic.num_temporal_layers = pic->num_temporal_layers ? pic->num_temporal_layers : 1; enc->enc_pic.temporal_id = 0; for (int i = 0; i < enc->enc_pic.num_temporal_layers; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/radeon/radeon_winsys.h b/mesa 3D driver/src/gallium/drivers/radeon/radeon_winsys.h index 6b72bfb800..55c6b11602 100644 --- a/mesa 3D driver/src/gallium/drivers/radeon/radeon_winsys.h +++ b/mesa 3D driver/src/gallium/drivers/radeon/radeon_winsys.h @@ -80,9 +80,6 @@ enum radeon_bo_flag enum radeon_dependency_flag { - /* Add the dependency to the parallel compute IB only. */ - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0, - /* Instead of waiting for a job to finish execution, the dependency will * be signaled when the job starts execution. */ @@ -386,7 +383,7 @@ struct radeon_winsys { * tracker. */ struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws, struct winsys_handle *whandle, - unsigned vm_alignment); + unsigned vm_alignment, bool is_prime_linear_buffer); /** * Get a winsys buffer from a user pointer. The resulting buffer can't @@ -512,26 +509,6 @@ struct radeon_winsys { struct pipe_fence_handle **fence), void *flush_ctx, bool stop_exec_on_failure); - /** - * Add a parallel compute IB to a gfx IB. It will share the buffer list - * and fence dependencies with the gfx IB. The gfx flush call will submit - * both IBs at the same time. - * - * The compute IB doesn't have an output fence, so the primary IB has - * to use a wait packet for synchronization. - * - * The returned IB is only a stream for writing packets to the new - * IB. 
The only function that can be used on the compute cs is cs_check_space. - * - * \param compute_cs The returned structure of the command stream. - * \param gfx_cs Gfx IB - * - * \return true on success - */ - bool (*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *compute_cs, - struct radeon_cmdbuf *gfx_cs, - bool uses_gds_ordered_append); - /** * Set up and enable mid command buffer preemption for the command stream. * @@ -592,11 +569,9 @@ struct radeon_winsys { * * \param cs A command stream. * \param dw Number of CS dwords requested by the caller. - * \param force_chaining Chain the IB into a new buffer now to discard - * the CP prefetch cache (to emulate PKT3_REWIND) * \return true if there is enough space */ - bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw, bool force_chaining); + bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw); /** * Return the buffer list. diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/gitlab-ci.yml index 0d47b11565..b2d396ff56 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/gitlab-ci.yml @@ -9,7 +9,7 @@ KERNEL_IMAGE_TYPE: "" HWCI_KERNEL_MODULES: amdgpu DRIVER_NAME: radeonsi - DEQP_PARALLEL: 4 + FDO_CI_CONCURRENT: 4 DEQP_EXPECTED_RENDERER: STONEY HWCI_FREQ_MAX: "true" tags: @@ -33,7 +33,7 @@ radeonsi-stoney-gles3:amd64: parallel: 2 variables: DEQP_VER: gles3 - DEQP_PARALLEL: 4 + FDO_CI_CONCURRENT: 4 DEQP_EXPECTED_RENDERER: STONEY DEQP_FRACTION: 2 @@ -45,7 +45,7 @@ radeonsi-stoney-gles31:amd64: parallel: 2 variables: DEQP_VER: gles31 - DEQP_PARALLEL: 4 + FDO_CI_CONCURRENT: 4 DEQP_EXPECTED_RENDERER: STONEY DEQP_FRACTION: 2 diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/navi10-piglit-quick-fail.csv b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/navi10-piglit-quick-fail.csv index c6a74c7f50..b937a49faa 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/navi10-piglit-quick-fail.csv +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/navi10-piglit-quick-fail.csv @@ -39,9 +39,6 @@ spec@!opengl 3.2@gl-3.2-adj-prims line cull-front pv-first,Fail spec@!opengl 3.2@gl-3.2-adj-prims pv-first,Fail spec@arb_bindless_texture@compiler@images@arith-bound-image.frag,Crash spec@arb_bindless_texture@compiler@samplers@arith-bound-sampler-texture2d.frag,Crash -spec@arb_bindless_texture@illegal,Fail -spec@arb_bindless_texture@illegal@Call glCopyTexImage* when a texture handle is referenced,Fail -spec@arb_bindless_texture@illegal@Call glTexImage* when a texture handle is referenced,Fail spec@arb_direct_state_access@gettextureimage-formats,Crash spec@arb_enhanced_layouts@compiler@block-member-locations@arrayed-block-member-location.frag,Fail spec@arb_enhanced_layouts@compiler@block-member-locations@arrayed-block-member-location.vert,Fail @@ -175,7 +172,6 @@ spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp@sRGB decode full precision,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail -spec@nv_compute_shader_derivatives@compiler@new_functions.comp,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageLoad,Fail 
spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageStore,Fail diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/piglit-radeonsi-stoney-fails.txt b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/piglit-radeonsi-stoney-fails.txt index 33f2ae1474..3c002828f5 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/piglit-radeonsi-stoney-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/piglit-radeonsi-stoney-fails.txt @@ -3,7 +3,6 @@ glx@glx_arb_create_context_robustness@invalid reset notification strategy,Fail glx@glx_ext_import_context@get context id,Fail glx@glx_ext_import_context@import context- single process,Fail glx@glx-visuals-stencil -pixmap,Crash -spec@arb_bindless_texture@illegal,Fail spec@arb_direct_state_access@gettextureimage-formats,Crash spec@arb_es2_compatibility@texwrap formats bordercolor-swizzled,Fail spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail @@ -49,6 +48,7 @@ spec@arb_texture_rg@texwrap formats-int bordercolor-swizzled@GL_RG8UI- swizzled- spec@arb_timer_query@timestamp-get,Fail spec@egl 1.4@egl-copy-buffers,Crash spec@egl 1.4@eglterminate then unbind context,Fail +spec@egl_chromium_sync_control@conformance@eglGetSyncValuesCHROMIUM_msc_and_sbc_test,Fail spec@egl_chromium_sync_control@conformance,Fail spec@egl_ext_protected_content@conformance,Fail spec@egl_khr_surfaceless_context@viewport,Fail @@ -95,6 +95,7 @@ spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail spec@khr_texture_compression_astc@miptree-gl srgb-fp,Fail spec@khr_texture_compression_astc@miptree-gl srgb-fp@sRGB decode full precision,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Fail diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/radeonsi-run-tests.py b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/radeonsi-run-tests.py index 5b386b26be..81f55a6844 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/radeonsi-run-tests.py +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/radeonsi-run-tests.py @@ -47,7 +47,13 @@ def print_yellow(txt, end_line=True, prefix=None): print("\033[1;33m{}\033[0m".format(txt), end="\n" if end_line else " ") -parser = argparse.ArgumentParser(description="radeonsi tester") +def print_green(txt, end_line=True, prefix=None): + if prefix: + print(prefix, end="") + print("\033[1;32m{}\033[0m".format(txt), end="\n" if end_line else " ") + + +parser = argparse.ArgumentParser(description="radeonsi tester", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--jobs", "-j", @@ -73,7 +79,11 @@ parser.add_argument( default=[], help="Only run the test matching this expression. 
This can only be a filename containing a list of failing tests to re-run.",
 )
-
+parser.add_argument(
+    "--baseline",
+    dest="baseline",
+    help="Folder containing expected results files",
+    default=os.path.dirname(__file__))
 parser.add_argument(
     "--no-piglit", dest="piglit", help="Disable piglit tests", action="store_false"
 )
@@ -115,10 +125,26 @@ parser.set_defaults(deqp_gles2=True)
 parser.set_defaults(deqp_gles3=True)
 parser.set_defaults(deqp_gles31=True)
-parser.add_argument("output_folder", help="Output folder (logs, etc)")
+parser.add_argument(
+    "output_folder",
+    nargs="?",
+    help="Output folder (logs, etc)",
+    default=os.path.join(tempfile.gettempdir(), datetime.now().strftime('%Y-%m-%d-%H-%M-%S')))
+
+available_gpus = []
+for f in os.listdir("/dev/dri/by-path"):
+    idx = f.find("-render")
+    if idx < 0:
+        continue
+    # gbm name is the full path, but DRI_PRIME expects a different
+    # format
+    available_gpus += [(os.path.join("/dev/dri/by-path", f),
+                        f[:idx].replace(':', '_').replace('.', '_'))]
+
+if len(available_gpus) > 1:
+    parser.add_argument('--gpu', type=int, dest="gpu", default=0, help='Select GPU (0..{})'.format(len(available_gpus) - 1))
 args = parser.parse_args(sys.argv[1:])
-
 piglit_path = args.piglit_path
 glcts_path = args.glcts_path
 deqp_path = args.deqp_path
@@ -135,26 +161,64 @@ else:
     parser.print_help()
     sys.exit(0)
-base = os.path.dirname(__file__)
-skips = os.path.join(base, "skips.csv")
+base = args.baseline
+skips = os.path.join(os.path.dirname(__file__), "skips.csv")
+
+env = os.environ.copy()
+
+if "DISPLAY" not in env:
+    print_red("DISPLAY environment variable missing.")
+    sys.exit(1)
+p = subprocess.run(
+    ["deqp-runner", "--version"],
+    capture_output=True,
+    check=True,
+    env=env
+)
+for line in p.stdout.decode().split("\n"):
+    if line.find("deqp-runner") >= 0:
+        s = line.split(" ")[1].split(".")
+        if args.verbose > 1:
+            print("Checking deqp-version ({})".format(s))
+        # We want at least 0.9.0
+        if not (int(s[0]) > 0 or int(s[1]) >= 9):
+            print("Expecting deqp-runner 0.9.0+ version (got {})".format(".".join(s)))
+            sys.exit(1)
+
+env["PIGLIT_PLATFORM"] = "gbm"
+
+if "DRI_PRIME" in env:
+    print("Don't use DRI_PRIME. Instead use --gpu N")
+    del env["DRI_PRIME"]
+if "gpu" in args:
+    env["DRI_PRIME"] = available_gpus[args.gpu][1]
+    env["WAFFLE_GBM_DEVICE"] = available_gpus[args.gpu][0]
 # Use piglit's glinfo to determine the GPU name
 gpu_name = "unknown"
+gpu_name_full = ""
+
 p = subprocess.run(
     ["./glinfo"],
     capture_output=True,
     cwd=os.path.join(piglit_path, "bin"),
     check=True,
+    env=env
 )
 for line in p.stdout.decode().split("\n"):
     if "GL_RENDER" in line:
+        line = line.split("=")[1]
+        gpu_name_full = '('.join(line.split("(")[:-1]).strip()
         gpu_name = line.replace("(TM)", "").split("(")[1].split(",")[0].lower()
         break
 output_folder = args.output_folder
+print_green("Tested GPU: '{}' ({})".format(gpu_name_full, gpu_name))
+print_green("Output folder: '{}'".format(output_folder))
+
 count = 1
 while os.path.exists(output_folder):
-    output_folder = "{}.{}".format(args.output_folder, count)
+    output_folder = "{}.{}".format(os.path.abspath(args.output_folder), count)
     count += 1
 os.mkdir(output_folder)
@@ -166,7 +230,7 @@ logfile = open(os.path.join(output_folder, "{}-run-tests.log".format(gpu_name)),
 spin = itertools.cycle("-\\|/")
-def run_cmd(args, verbosity, env=None):
+def run_cmd(args, verbosity):
     if verbosity > 1:
         print_yellow(
             "| Command line argument '"
@@ -258,9 +322,8 @@ if args.piglit:
     if os.path.exists(baseline):
         cmd += ["--baseline", baseline]
-    env = os.environ.copy()
-    env["PIGLIT_PLATFORM"] = "gbm"
-    run_cmd(cmd, args.verbose, env)
+    print_yellow("[baseline {}]".format(baseline), args.verbose > 0)
+    run_cmd(cmd, args.verbose)
     shutil.copy(os.path.join(out, "failures.csv"), new_baseline)
     verify_results(baseline, new_baseline)
@@ -299,27 +362,28 @@ if args.glcts:
     if os.path.exists(baseline):
         cmd += ["--baseline", baseline]
+    print_yellow("[baseline {}]".format(baseline), args.verbose > 0)
     cmd += deqp_args
     run_cmd(cmd, args.verbose)
     shutil.copy(os.path.join(out, "failures.csv"), new_baseline)
     verify_results(baseline, new_baseline)
 if args.deqp:
-    if args.include_tests:
-        print_yellow("dEQP tests cannot be run with the -t/--include-tests option yet.")
-        sys.exit(0)
-
     print_yellow("Running dEQP tests", args.verbose > 0)
     # Generate a test-suite file
+    out = os.path.join(output_folder, "deqp")
     suite_filename = os.path.join(output_folder, "deqp-suite.toml")
     suite = open(suite_filename, "w")
-    os.mkdir(os.path.join(output_folder, "deqp"))
+    os.mkdir(out)
     baseline = os.path.join(base, "{}-deqp-fail.csv".format(gpu_name))
     new_baseline = os.path.join(
         new_baseline_folder, "{}-deqp-fail.csv".format(gpu_name)
     )
+    if os.path.exists(baseline):
+        print_yellow("[baseline {}]".format(baseline), args.verbose > 0)
+
     deqp_tests = {
         "egl": args.deqp_egl,
         "gles2": args.deqp_gles2,
         "gles3": args.deqp_gles3,
         "gles31": args.deqp_gles31,
@@ -362,7 +426,7 @@
         os.path.join(output_folder, "deqp"),
         "--suite",
         suite_filename,
-    ]
+    ] + filters_args
     run_cmd(cmd, args.verbose)
     shutil.copy(os.path.join(out, "failures.csv"), new_baseline)
     verify_results(baseline, new_baseline)
diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-deqp-fail.csv b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-deqp-fail.csv
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-glcts-fail.csv b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-glcts-fail.csv
new file mode 100644
index 0000000000..6193859d38
--- /dev/null
+++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-glcts-fail.csv
@@ -0,0 +1,28 @@
+KHR-GL46.gl_spirv.spirv_glsl_to_spirv_builtin_functions_test,Fail
+KHR-GL46.packed_pixels.pbo_rectangle.r16_snorm,Fail +KHR-GL46.packed_pixels.pbo_rectangle.r8_snorm,Fail +KHR-GL46.packed_pixels.pbo_rectangle.rg16_snorm,Fail +KHR-GL46.packed_pixels.pbo_rectangle.rg8_snorm,Fail +KHR-GL46.packed_pixels.pbo_rectangle.rgb16_snorm,Fail +KHR-GL46.packed_pixels.pbo_rectangle.rgb8_snorm,Fail +KHR-GL46.packed_pixels.pbo_rectangle.rgba16_snorm,Fail +KHR-GL46.packed_pixels.pbo_rectangle.rgba8_snorm,Fail +KHR-GL46.packed_pixels.rectangle.r16_snorm,Fail +KHR-GL46.packed_pixels.rectangle.r8_snorm,Fail +KHR-GL46.packed_pixels.rectangle.rg16_snorm,Fail +KHR-GL46.packed_pixels.rectangle.rg8_snorm,Fail +KHR-GL46.packed_pixels.rectangle.rgb16_snorm,Fail +KHR-GL46.packed_pixels.rectangle.rgb8_snorm,Fail +KHR-GL46.packed_pixels.rectangle.rgba16_snorm,Fail +KHR-GL46.packed_pixels.rectangle.rgba8_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.r16_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.r8_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.rg16_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.rg8_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.rgb16_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.rgb8_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.rgba16_snorm,Fail +KHR-GL46.packed_pixels.varied_rectangle.rgba8_snorm,Fail +KHR-GL46.shader_ballot_tests.ShaderBallotFunctionRead,Fail +KHR-GL46.shader_image_load_store.basic-api-bind,Fail +KHR-GL46.sparse_buffer_tests.BufferStorageTest,Fail diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-piglit-quick-fail.csv b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-piglit-quick-fail.csv new file mode 100644 index 0000000000..fe4bde09be --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/raven-piglit-quick-fail.csv @@ -0,0 +1,271 @@ +glx@glx-make-current,Crash +glx@glx-multi-window-single-context,Fail +glx@glx-query-drawable-glx_fbconfig_id-window,Fail +glx@glx-visuals-depth -pixmap,Crash +glx@glx-visuals-stencil -pixmap,Crash +glx@glx_arb_create_context_es2_profile@invalid opengl es version,Fail +glx@glx_arb_create_context_no_error@no error,Fail +glx@glx_arb_create_context_robustness@invalid reset notification strategy,Fail +glx@glx_ext_import_context@free context,Fail +glx@glx_ext_import_context@get context id,Fail +glx@glx_ext_import_context@get current display,Fail +glx@glx_ext_import_context@import context- multi process,Fail +glx@glx_ext_import_context@import context- single process,Fail +glx@glx_ext_import_context@imported context has same context id,Fail +glx@glx_ext_import_context@make current- multi process,Fail +glx@glx_ext_import_context@make current- single process,Fail +glx@glx_ext_import_context@query context info,Fail +glx@glx_ext_no_config_context@no fbconfig,Fail +spec@!opengl 1.0@gl-1.0-swapbuffers-behavior,Fail +spec@!opengl 1.0@gl-1.0-user-clip-all-planes,Fail +spec@!opengl 1.0@rasterpos,Fail +spec@!opengl 1.0@rasterpos@glsl_vs_gs_linked,Fail +spec@!opengl 1.0@rasterpos@glsl_vs_tes_linked,Fail +spec@!opengl 1.1@read-front,Fail +spec@!opengl 1.1@read-front clear-front-first,Fail +spec@!opengl 1.1@read-front clear-front-first samples=2,Fail +spec@!opengl 1.1@read-front clear-front-first samples=4,Fail +spec@!opengl 1.1@read-front clear-front-first samples=6,Fail +spec@!opengl 1.1@read-front clear-front-first samples=8,Fail +spec@!opengl 1.1@read-front samples=2,Fail +spec@!opengl 1.1@read-front samples=4,Fail +spec@!opengl 1.1@read-front samples=6,Fail +spec@!opengl 1.1@read-front samples=8,Fail +spec@!opengl 1.1@teximage-colors gl_r16_snorm,Fail 
+spec@!opengl 1.1@teximage-colors gl_r16_snorm@GL_R16_SNORM texture with GL_BGR and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_r16_snorm@GL_R16_SNORM texture with GL_BGRA and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_r16_snorm@GL_R16_SNORM texture with GL_RGB and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rg16_snorm,Fail +spec@!opengl 1.1@teximage-colors gl_rg16_snorm@GL_RG16_SNORM texture with GL_BGR and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rg16_snorm@GL_RG16_SNORM texture with GL_BGRA and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rg16_snorm@GL_RG16_SNORM texture with GL_GREEN and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rg16_snorm@GL_RG16_SNORM texture with GL_RGB and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgb16_snorm,Fail +spec@!opengl 1.1@teximage-colors gl_rgb16_snorm@GL_RGB16_SNORM texture with GL_BGR and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgb16_snorm@GL_RGB16_SNORM texture with GL_BGRA and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgb16_snorm@GL_RGB16_SNORM texture with GL_BLUE and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgb16_snorm@GL_RGB16_SNORM texture with GL_GREEN and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgb16_snorm@GL_RGB16_SNORM texture with GL_RGB and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgba16_snorm,Fail +spec@!opengl 1.1@teximage-colors gl_rgba16_snorm@GL_RGBA16_SNORM texture with GL_BGR and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgba16_snorm@GL_RGBA16_SNORM texture with GL_BGRA and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgba16_snorm@GL_RGBA16_SNORM texture with GL_BLUE and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgba16_snorm@GL_RGBA16_SNORM texture with GL_GREEN and GL_BYTE,Fail +spec@!opengl 1.1@teximage-colors gl_rgba16_snorm@GL_RGBA16_SNORM texture with GL_RGB and GL_BYTE,Fail +spec@!opengl 1.1@windowoverlap,Fail +spec@arb_bindless_texture@compiler@images@arith-bound-image.frag,Crash +spec@arb_bindless_texture@compiler@samplers@arith-bound-sampler-texture2d.frag,Crash +spec@arb_bindless_texture@illegal,Fail +spec@arb_bindless_texture@illegal@Call glCopyTexImage* when a texture handle is referenced,Fail +spec@arb_bindless_texture@illegal@Call glTexImage* when a texture handle is referenced,Fail +spec@arb_direct_state_access@gettextureimage-formats,Crash +spec@arb_enhanced_layouts@compiler@block-member-locations@arrayed-block-member-location.frag,Fail +spec@arb_enhanced_layouts@compiler@block-member-locations@arrayed-block-member-location.vert,Fail +spec@arb_gl_spirv@execution@ssbo@aoa,Fail +spec@arb_gl_spirv@execution@ssbo@aoa-2,Fail +spec@arb_gl_spirv@execution@ssbo@array,Fail +spec@arb_gl_spirv@execution@ssbo@array-indirect,Fail +spec@arb_gl_spirv@execution@ssbo@array-inside-ssbo,Fail +spec@arb_gl_spirv@execution@ssbo@array-of-arrays-inside-ssbo,Fail +spec@arb_gl_spirv@execution@ssbo@matrix@column-major,Fail +spec@arb_gl_spirv@execution@ssbo@matrix@column-vs-row,Fail +spec@arb_gl_spirv@execution@ssbo@matrix@complex,Fail +spec@arb_gl_spirv@execution@ssbo@matrix@indirect-column-major,Fail +spec@arb_gl_spirv@execution@ssbo@matrix@indirect-row-major,Fail +spec@arb_gl_spirv@execution@ssbo@matrix@row-major,Fail +spec@arb_gl_spirv@execution@ssbo@simple,Fail +spec@arb_gl_spirv@execution@ssbo@two-ssbo,Fail +spec@arb_gl_spirv@execution@ssbo@two-ssbo-different-layouts,Fail +spec@arb_gl_spirv@execution@ssbo@two-stages,Fail +spec@arb_gl_spirv@execution@ssbo@unsized-array,Fail 
+spec@arb_gl_spirv@execution@ssbo@unsized-array-length,Fail +spec@arb_gl_spirv@execution@ubo@aoa,Fail +spec@arb_gl_spirv@execution@ubo@aoa-2,Fail +spec@arb_gl_spirv@execution@ubo@array,Fail +spec@arb_gl_spirv@execution@ubo@array-complex,Fail +spec@arb_gl_spirv@execution@ubo@array-different-array-stride-ubo,Fail +spec@arb_gl_spirv@execution@ubo@array-indirect,Fail +spec@arb_gl_spirv@execution@ubo@array-inside-ubo,Fail +spec@arb_gl_spirv@execution@ubo@array-inside-ubo-copy,Fail +spec@arb_gl_spirv@execution@ubo@array-of-arrays-inside-ubo,Fail +spec@arb_gl_spirv@execution@ubo@explicit-offset,Fail +spec@arb_gl_spirv@execution@ubo@explicit-offset-nested-struct,Fail +spec@arb_gl_spirv@execution@ubo@location-0-crash,Fail +spec@arb_gl_spirv@execution@ubo@matrix@column-major,Fail +spec@arb_gl_spirv@execution@ubo@matrix@column-vs-row,Fail +spec@arb_gl_spirv@execution@ubo@matrix@complex,Fail +spec@arb_gl_spirv@execution@ubo@matrix@different-matrix-stride,Fail +spec@arb_gl_spirv@execution@ubo@matrix@indirect-column-major,Fail +spec@arb_gl_spirv@execution@ubo@matrix@indirect-row-major,Fail +spec@arb_gl_spirv@execution@ubo@matrix@row-major,Fail +spec@arb_gl_spirv@execution@ubo@simple,Fail +spec@arb_gl_spirv@execution@ubo@two-stages,Fail +spec@arb_gl_spirv@execution@ubo@two-ubos,Fail +spec@arb_gl_spirv@execution@uniform@array,Fail +spec@arb_gl_spirv@execution@uniform@arrays-of-arrays,Fail +spec@arb_gl_spirv@execution@uniform@atomic-uint-aoa-cs,Fail +spec@arb_gl_spirv@execution@uniform@atomic-uint-aoa-fs,Fail +spec@arb_gl_spirv@execution@uniform@atomic-uint-array-cs,Fail +spec@arb_gl_spirv@execution@uniform@atomic-uint-array-fs,Fail +spec@arb_gl_spirv@execution@uniform@atomic-uint-cs,Fail +spec@arb_gl_spirv@execution@uniform@atomic-uint-mixing-with-normal-uniforms,Fail +spec@arb_gl_spirv@execution@uniform@atomic-uint-several-slots,Fail +spec@arb_gl_spirv@execution@uniform@embedded-structs,Fail +spec@arb_gl_spirv@execution@uniform@index-matches-location,Fail +spec@arb_gl_spirv@execution@uniform@initializer,Fail +spec@arb_gl_spirv@execution@uniform@initializer-complex,Fail +spec@arb_gl_spirv@execution@uniform@initializer-dvec4,Fail +spec@arb_gl_spirv@execution@uniform@initializer-mat4x3,Fail +spec@arb_gl_spirv@execution@uniform@nonsequential-locations,Fail +spec@arb_gl_spirv@execution@uniform@sampler2d,Fail +spec@arb_gl_spirv@execution@uniform@sampler2d-binding,Fail +spec@arb_gl_spirv@execution@uniform@sampler2d-binding-array,Fail +spec@arb_gl_spirv@execution@uniform@sampler2d-nonconst-nested-array,Fail +spec@arb_gl_spirv@execution@uniform@sampler2d-struct,Fail +spec@arb_gl_spirv@execution@uniform@simple,Fail +spec@arb_gl_spirv@execution@uniform@simple-without-names,Fail +spec@arb_gl_spirv@execution@uniform@struct,Fail +spec@arb_gl_spirv@execution@uniform@struct-array,Fail +spec@arb_gl_spirv@execution@uniform@two-uniforms,Fail +spec@arb_gl_spirv@execution@va64-simple,Fail +spec@arb_gl_spirv@execution@vs-ps-simple,Fail +spec@arb_gl_spirv@execution@vs-ps-specializations,Fail +spec@arb_gl_spirv@execution@xfb@vs_aoa,Fail +spec@arb_gl_spirv@execution@xfb@vs_block,Fail +spec@arb_gl_spirv@execution@xfb@vs_block_array,Fail +spec@arb_gl_spirv@execution@xfb@vs_block_array_offset_per_member,Fail +spec@arb_gl_spirv@execution@xfb@vs_double,Fail +spec@arb_gl_spirv@execution@xfb@vs_lines,Fail +spec@arb_gl_spirv@execution@xfb@vs_simple,Fail +spec@arb_gl_spirv@execution@xfb@vs_simple_multiple_samples,Fail +spec@arb_gl_spirv@execution@xfb@vs_struct,Fail +spec@arb_gl_spirv@execution@xfb@vs_struct_array,Fail 
+spec@arb_gl_spirv@execution@xfb@vs_triangles,Fail +spec@arb_gl_spirv@execution@xfb@vs_two_block,Fail +spec@arb_gl_spirv@execution@xfb@vs_two_sets,Fail +spec@arb_gl_spirv@execution@xfb@vs_two_sets_ifc,Fail +spec@arb_gl_spirv@execution@xfb@vs_two_sets_struct,Fail +spec@arb_gl_spirv@linker@uniform@multisampler,Fail +spec@arb_gl_spirv@linker@uniform@multisampler-array,Fail +spec@arb_gpu_shader5@arb_gpu_shader5-xfb-streams-without-invocations spirv,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat2-mat2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat2x3-mat2x3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat2x4-mat2x4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat3-mat3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat3x2-mat3x2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat3x4-mat3x4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat4-mat4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat4x2-mat4x2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dmat4x3-mat4x3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-double-float,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dvec2-vec2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dvec3-vec3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@frag-conversion-explicit-dvec4-vec4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat2-mat2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat2x3-mat2x3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat2x4-mat2x4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat3-mat3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat3x2-mat3x2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat3x4-mat3x4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat4-mat4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat4x2-mat4x2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dmat4x3-mat4x3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-double-float,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dvec2-vec2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dvec3-vec3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@geom-conversion-explicit-dvec4-vec4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat2-mat2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat2x3-mat2x3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat2x4-mat2x4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat3-mat3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat3x2-mat3x2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat3x4-mat3x4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat4-mat4,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat4x2-mat4x2,Fail 
+spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dmat4x3-mat4x3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-double-float,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec2-vec2,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail +spec@arb_gpu_shader_fp64@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail +spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex,Fail +spec@arb_program_interface_query@arb_program_interface_query-getprogramresourceindex@'vs_input2[1][0]' on GL_PROGRAM_INPUT,Fail +spec@arb_shader_texture_lod@execution@arb_shader_texture_lod-texgradcube,Fail +spec@arb_shading_language_packing@execution@built-in-functions@fs-packhalf2x16,Fail +spec@arb_shading_language_packing@execution@built-in-functions@vs-packhalf2x16,Fail +spec@arb_sparse_buffer@buffer-data,Fail +spec@arb_sparse_buffer@commit,Fail +spec@arb_texture_float@multisample-formats 2 gl_arb_texture_float,Fail +spec@arb_texture_rg@multisample-formats 2 gl_arb_texture_rg-float,Fail +spec@egl 1.4@egl-copy-buffers,Crash +spec@egl_chromium_sync_control@conformance,Fail +spec@egl_chromium_sync_control@conformance@eglGetSyncValuesCHROMIUM_msc_and_sbc_test,Fail +spec@egl_ext_protected_content@conformance,Fail +spec@ext_framebuffer_blit@fbo-blit-check-limits,Fail +spec@ext_framebuffer_multisample@turn-on-off 2,Fail +spec@ext_framebuffer_multisample@turn-on-off 4,Fail +spec@ext_framebuffer_multisample@turn-on-off 6,Fail +spec@ext_framebuffer_multisample@turn-on-off 8,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_uyvy,Fail +spec@ext_image_dma_buf_import@ext_image_dma_buf_import-sample_yuyv,Fail +spec@ext_texture_integer@fbo-integer,Fail +spec@ext_texture_snorm@multisample-formats 2 gl_ext_texture_snorm,Fail +spec@glsl-1.20@compiler@invalid-vec4-array-to-vec3-array-conversion.vert,Fail +spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip_adjacency ffs,Fail +spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip_adjacency other,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat2-mat2,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat2x3-mat2x3,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat2x4-mat2x4,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat3-mat3,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat3x2-mat3x2,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat3x4-mat3x4,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat4-mat4,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat4x2-mat4x2,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dmat4x3-mat4x3,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-double-float,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dvec2-vec2,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dvec3-vec3,Fail +spec@glsl-4.00@execution@conversion@frag-conversion-explicit-dvec4-vec4,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat2-mat2,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat2x3-mat2x3,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat2x4-mat2x4,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat3-mat3,Fail 
+spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat3x2-mat3x2,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat3x4-mat3x4,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat4-mat4,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat4x2-mat4x2,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dmat4x3-mat4x3,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-double-float,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dvec2-vec2,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dvec3-vec3,Fail +spec@glsl-4.00@execution@conversion@geom-conversion-explicit-dvec4-vec4,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat2-mat2,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat2x3-mat2x3,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat2x4-mat2x4,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat3-mat3,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat3x2-mat3x2,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat3x4-mat3x4,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat4-mat4,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat4x2-mat4x2,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dmat4x3-mat4x3,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-double-float,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dvec2-vec2,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dvec3-vec3,Fail +spec@glsl-4.00@execution@conversion@vert-conversion-explicit-dvec4-vec4,Fail +spec@glsl-es-3.00@execution@built-in-functions@fs-packhalf2x16,Fail +spec@glsl-es-3.00@execution@built-in-functions@vs-packhalf2x16,Fail +spec@khr_texture_compression_astc@miptree-gl srgb-fp,Fail +spec@khr_texture_compression_astc@miptree-gl srgb-fp@sRGB decode full precision,Fail +spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail +spec@khr_texture_compression_astc@miptree-gles srgb-fp@sRGB decode full precision,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp@sRGB decode full precision,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail +spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail +spec@nv_compute_shader_derivatives@compiler@new_functions.comp,Fail +spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Fail +spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageLoad,Fail +spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageStore,Fail +spec@oes_shader_io_blocks@compiler@layout-location-aliasing.vert,Fail +wgl@wgl-multi-context-single-window,Fail +wgl@wgl-multi-window-single-context,Fail +wgl@wgl-sanity,Fail diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/sienna_cichlid-piglit-quick-fail.csv b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/sienna_cichlid-piglit-quick-fail.csv index 12775d52c8..6a8e45a45a 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/sienna_cichlid-piglit-quick-fail.csv +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/sienna_cichlid-piglit-quick-fail.csv @@ -39,9 +39,6 @@ spec@!opengl 3.2@gl-3.2-adj-prims line cull-front pv-first,Fail spec@!opengl 
3.2@gl-3.2-adj-prims pv-first,Fail spec@arb_bindless_texture@compiler@images@arith-bound-image.frag,Crash spec@arb_bindless_texture@compiler@samplers@arith-bound-sampler-texture2d.frag,Crash -spec@arb_bindless_texture@illegal,Fail -spec@arb_bindless_texture@illegal@Call glCopyTexImage* when a texture handle is referenced,Fail -spec@arb_bindless_texture@illegal@Call glTexImage* when a texture handle is referenced,Fail spec@arb_direct_state_access@gettextureimage-formats,Crash spec@arb_enhanced_layouts@compiler@block-member-locations@arrayed-block-member-location.frag,Fail spec@arb_enhanced_layouts@compiler@block-member-locations@arrayed-block-member-location.vert,Fail @@ -180,7 +177,6 @@ spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gl srgb-fp@sRGB decode full precision,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp,Fail spec@khr_texture_compression_astc@sliced-3d-miptree-gles srgb-fp@sRGB decode full precision,Fail -spec@nv_compute_shader_derivatives@compiler@new_functions.comp,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageLoad,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageStore,Fail diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml index 60620f784c..b2e46f3e81 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/ci/traces-radeonsi.yml @@ -21,11 +21,11 @@ traces: - path: pathfinder/demo.trace expectations: - device: gl-radeonsi-stoney - checksum: c81c85f9b247dd1b06c3dd5b669cc283 + checksum: 8ff636268dfa0d54b6f15d70d15e354d - path: pathfinder/canvas_moire.trace expectations: - device: gl-radeonsi-stoney - checksum: 78dd2357ad6e5ffc049a75bfb11c5497 + checksum: 505b9cad6e65c13463a0786944f8b679 - path: pathfinder/canvas_text_v2.trace expectations: - device: gl-radeonsi-stoney @@ -37,7 +37,7 @@ traces: - path: gputest/pixmark-piano.trace expectations: - device: gl-radeonsi-stoney - checksum: 86ebe6ff8038975de8724fa9536edb7e + checksum: 58a86d233d03e2a174cb79c16028f916 - path: gputest/triangle.trace expectations: - device: gl-radeonsi-stoney @@ -133,7 +133,7 @@ traces: - path: glmark2/refract.trace expectations: - device: gl-radeonsi-stoney - checksum: 9d0a2d2fce0b80a265fbcee5107c9e82 + checksum: 41d105bdd10a354f6d161c67f715b7f9 - path: glmark2/shading:shading=blinn-phong-inf.trace expectations: - device: gl-radeonsi-stoney @@ -173,11 +173,11 @@ traces: - path: godot/Material Testers.x86_64_2020.04.08_13.38_frame799.rdc expectations: - device: gl-radeonsi-stoney - checksum: 02f654ad77c0c1106e1b31e1c86c93bb + checksum: 4df1fbfc346851fe9e086a0708afde21 - path: gputest/gimark.trace expectations: - device: gl-radeonsi-stoney - checksum: 4442dbd44a9704c499da4817fffce306 + checksum: 52f76e6db877111845990ee128552082 - path: gputest/pixmark-julia-fp32.trace expectations: - device: gl-radeonsi-stoney @@ -193,7 +193,7 @@ traces: - path: gputest/plot3d.trace expectations: - device: gl-radeonsi-stoney - checksum: ce101e0808b9a2e0092d1668e653a32b + checksum: a62be186a3e0a33ecbd520edd3873eb1 - path: gputest/tessmark.trace expectations: - device: gl-radeonsi-stoney @@ -201,7 +201,7 @@ traces: - path: humus/AmbientAperture.trace expectations: - device: 
gl-radeonsi-stoney - checksum: 664ea58a62b27737b7d0ae9e86ab85c0 + checksum: 7ad498c94dcfbf22ef56f115648be86d - path: humus/CelShading.trace expectations: - device: gl-radeonsi-stoney diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/driinfo_radeonsi.h b/mesa 3D driver/src/gallium/drivers/radeonsi/driinfo_radeonsi.h index 1e1ca86c2d..3957f9288a 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/driinfo_radeonsi.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/driinfo_radeonsi.h @@ -8,6 +8,7 @@ DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG #define OPT_BOOL(name, dflt, description) DRI_CONF_OPT_B(radeonsi_##name, dflt, description) +#define OPT_INT(name, dflt, description) DRI_CONF_OPT_I(radeonsi_##name, dflt, INT_MIN, INT_MAX, description) #include "radeonsi/si_debug_options.h" DRI_CONF_SECTION_END diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/mesa 3D driver/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index edfe0ee364..745dd90eb2 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -70,17 +70,6 @@ static LLVMValueRef ngg_get_query_buf(struct si_shader_context *ctx) LLVMConstInt(ctx->ac.i32, GFX10_GS_QUERY_BUF, false)); } -static LLVMValueRef ngg_get_initial_edgeflag(struct si_shader_context *ctx, unsigned index) -{ - if (ctx->stage == MESA_SHADER_VERTEX) { - LLVMValueRef tmp; - tmp = LLVMBuildLShr(ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args.gs_invocation_id), - LLVMConstInt(ctx->ac.i32, 8 + index, false), ""); - return LLVMBuildTrunc(ctx->ac.builder, tmp, ctx->ac.i1, ""); - } - return ctx->ac.i1false; -} - /** * Return the number of vertices as a constant in \p num_vertices, * and return a more precise value as LLVMValueRef from the function. @@ -94,11 +83,13 @@ static LLVMValueRef ngg_get_vertices_per_prim(struct si_shader_context *ctx, uns /* Blits always use axis-aligned rectangles with 3 vertices. */ *num_vertices = 3; return LLVMConstInt(ctx->ac.i32, 3, 0); + } else if (ctx->shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES) { + *num_vertices = 2; + return LLVMConstInt(ctx->ac.i32, 2, 0); } else { /* We always build up all three indices for the prim export * independent of the primitive type. The additional garbage - * data shouldn't hurt. This number doesn't matter with - * NGG passthrough. + * data shouldn't hurt. This is used by exports and streamout. 
*/ *num_vertices = 3; @@ -124,9 +115,10 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader) { struct si_shader_selector *sel = shader->selector; - assert(shader->key.as_ngg && !shader->key.as_es); + assert(shader->key.ge.as_ngg && !shader->key.ge.as_es); - return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->info.writes_edgeflag; + return sel->info.stage != MESA_SHADER_GEOMETRY && + !gfx10_ngg_writes_user_edgeflags(shader); } void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx) @@ -145,7 +137,7 @@ void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef use { LLVMBuilderRef builder = ctx->ac.builder; - if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.opt.ngg_culling) { + if (gfx10_is_ngg_passthrough(ctx->shader) || ctx->shader->key.ge.opt.ngg_culling) { ac_build_ifcc(&ctx->ac, si_is_gs_thread(ctx), 6001); { struct ac_ngg_prim prim = {}; @@ -153,12 +145,12 @@ void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef use if (prim_passthrough) prim.passthrough = prim_passthrough; else - prim.passthrough = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); + prim.passthrough = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[0]); /* This is only used with NGG culling, which returns the NGG * passthrough prim export encoding. */ - if (ctx->shader->selector->info.writes_edgeflag) { + if (gfx10_ngg_writes_user_edgeflags(ctx->shader)) { unsigned all_bits_no_edgeflags = ~SI_NGG_PRIM_EDGE_FLAG_BITS; LLVMValueRef edgeflags = LLVMConstInt(ctx->ac.i32, all_bits_no_edgeflags, 0); @@ -190,20 +182,27 @@ void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef use ngg_get_vertices_per_prim(ctx, &prim.num_vertices); prim.isnull = ctx->ac.i1false; - prim.index[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - prim.index[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - prim.index[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - for (unsigned i = 0; i < prim.num_vertices; ++i) { - prim.edgeflag[i] = ngg_get_initial_edgeflag(ctx, i); + if (gfx10_edgeflags_have_effect(ctx->shader)) + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + else + prim.edgeflags = ctx->ac.i32_0; - if (ctx->shader->selector->info.writes_edgeflag) { + for (unsigned i = 0; i < prim.num_vertices; ++i) + prim.index[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); + + if (gfx10_ngg_writes_user_edgeflags(ctx->shader)) { + LLVMValueRef edgeflags = ctx->ac.i32_0; + + for (unsigned i = 0; i < prim.num_vertices; ++i) { LLVMValueRef edge; edge = LLVMBuildLoad(ctx->ac.builder, user_edgeflags[i], ""); - edge = LLVMBuildAnd(ctx->ac.builder, prim.edgeflag[i], edge, ""); - prim.edgeflag[i] = edge; + edge = LLVMBuildZExt(ctx->ac.builder, edge, ctx->ac.i32, ""); + edge = LLVMBuildShl(ctx->ac.builder, edge, LLVMConstInt(ctx->ac.i32, 9 + i*10, 0), ""); + edgeflags = LLVMBuildOr(ctx->ac.builder, edgeflags, edge, ""); } + prim.edgeflags = LLVMBuildAnd(ctx->ac.builder, prim.edgeflags, edgeflags, ""); } ac_build_export_prim(&ctx->ac, &prim); @@ -607,7 +606,7 @@ static unsigned ngg_nogs_vertex_size(struct si_shader *shader) * used for padding to reduce LDS bank conflicts. */ if (shader->selector->so.num_outputs) lds_vertex_size = 4 * shader->selector->info.num_outputs + 1; - if (shader->selector->info.writes_edgeflag) + if (gfx10_ngg_writes_user_edgeflags(shader)) lds_vertex_size = MAX2(lds_vertex_size, 1); /* LDS size for passing data from GS to ES. 
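
The 9 + i*10 shifts above, together with the ~SI_NGG_PRIM_EDGE_FLAG_BITS mask, come from the packed NGG prim-export encoding: each of the three vertex slots gets 10 bits, a 9-bit vertex index at bit 10*i with its edge flag one bit above it. Below is a minimal CPU-side sketch of that packing; the helper name and scalar types are illustrative only, not part of the patch:

#include <stdbool.h>
#include <stdint.h>

// Pack three 9-bit vertex indices and their edge flags into one dword,
// mirroring the prim-export layout the code above assumes.
static uint32_t pack_ngg_prim(const uint16_t idx[3], const bool edge[3])
{
   uint32_t prim = 0;
   for (unsigned i = 0; i < 3; i++) {
      prim |= (uint32_t)(idx[i] & 0x1ff) << (10 * i); // index in bits 10*i .. 10*i+8
      prim |= (uint32_t)edge[i] << (10 * i + 9);      // edge flag just above it
   }
   return prim;
}

ANDing the hardware-provided flags with the user flags, as the LLVMBuildAnd above does, keeps an edge only when both agree it should be drawn.
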
@@ -615,17 +614,17 @@ static unsigned ngg_nogs_vertex_size(struct si_shader *shader) * to the ES thread of the provoking vertex. All ES threads * load and export PrimitiveID for their thread. */ - if (shader->selector->info.stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id) + if (shader->selector->info.stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id) lds_vertex_size = MAX2(lds_vertex_size, 1); - if (shader->key.opt.ngg_culling) { + if (shader->key.ge.opt.ngg_culling) { if (shader->selector->info.stage == MESA_SHADER_VERTEX) { STATIC_ASSERT(lds_instance_id + 1 == 7); lds_vertex_size = MAX2(lds_vertex_size, 7); } else { assert(shader->selector->info.stage == MESA_SHADER_TESS_EVAL); - if (shader->selector->info.uses_primid || shader->key.mono.u.vs_export_prim_id) { + if (shader->selector->info.uses_primid || shader->key.ge.mono.u.vs_export_prim_id) { STATIC_ASSERT(lds_tes_patch_id + 2 == 9); /* +1 for LDS padding */ lds_vertex_size = MAX2(lds_vertex_size, 9); } else { @@ -796,9 +795,12 @@ static void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValue LLVMValueRef gs_accepted = params[0]; LLVMValueRef *gs_vtxptr = (LLVMValueRef *)params[1]; + unsigned num_vertices; + ngg_get_vertices_per_prim(ctx, &num_vertices); + ac_build_ifcc(&ctx->ac, accepted, 0); LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_1, gs_accepted); - for (unsigned vtx = 0; vtx < 3; vtx++) { + for (unsigned vtx = 0; vtx < num_vertices; vtx++) { LLVMBuildStore(ctx->ac.builder, ctx->ac.i8_1, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte0_accept_flag)); } @@ -811,20 +813,20 @@ static void gfx10_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValue * Also return the position, which is passed to the shader as an input, * so that we don't compute it twice. */ -void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader *shader = ctx->shader; struct si_shader_selector *sel = shader->selector; struct si_shader_info *info = &sel->info; LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef *addrs = abi->outputs; unsigned max_waves = DIV_ROUND_UP(ctx->screen->ngg_subgroup_size, ctx->ac.wave_size); - assert(shader->key.opt.ngg_culling); - assert(shader->key.as_ngg); + assert(shader->key.ge.opt.ngg_culling); + assert(shader->key.ge.as_ngg); assert(sel->info.stage == MESA_SHADER_VERTEX || - (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es)); + (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es)); LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); unsigned pos_index = 0; @@ -838,8 +840,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out * the position. This is useful for analyzing maximum theoretical * performance without VS input loads. 
*/ - if (shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE && - shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE) { + if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE && + shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE) { for (unsigned j = 0; j < 4; j++) LLVMBuildStore(builder, LLVMGetUndef(ctx->ac.f32), addrs[4 * i + j]); break; @@ -875,6 +877,9 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out LLVMValueRef tid = ac_get_thread_id(&ctx->ac); + unsigned num_vertices; + ngg_get_vertices_per_prim(ctx, &num_vertices); + /* The hardware requires that there are no holes between unculled vertices, * which means we have to pack ES threads, i.e. reduce the ES thread count * and move ES input VGPRs to lower threads. The upside is that varyings @@ -904,23 +909,13 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out */ LLVMValueRef vtxindex[3]; - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) { - /* For the GS fast launch, the VS prolog simply puts the Vertex IDs - * into these VGPRs. - */ - vtxindex[0] = ac_get_arg(&ctx->ac, ctx->gs_vtx01_offset); - vtxindex[1] = ac_get_arg(&ctx->ac, ctx->gs_vtx23_offset); - vtxindex[2] = ac_get_arg(&ctx->ac, ctx->gs_vtx45_offset); - } else { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); - }; - LLVMValueRef gs_vtxptr[] = { - ngg_nogs_vertex_ptr(ctx, vtxindex[0]), - ngg_nogs_vertex_ptr(ctx, vtxindex[1]), - ngg_nogs_vertex_ptr(ctx, vtxindex[2]), - }; + for (unsigned i = 0; i < num_vertices; ++i) + vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); + + LLVMValueRef gs_vtxptr[3]; + for (unsigned i = 0; i < num_vertices; i++) + gs_vtxptr[i] = ngg_nogs_vertex_ptr(ctx, vtxindex[i]); + es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); /* Adding these optimization barriers improves the generated code as follows. Crazy right? @@ -948,9 +943,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out * - v_mul_u32_u24_e32 v17, 28, v11 * - v_mul_u32_u24_e32 v18, 28, v10 */ - ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[0], false); - ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[1], false); - ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[2], false); + for (unsigned i = 0; i < num_vertices; i++) + ac_build_optimization_barrier(&ctx->ac, &gs_vtxptr[i], false); LLVMValueRef gs_accepted = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); @@ -959,7 +953,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out { /* Load positions. */ LLVMValueRef pos[3][4] = {}; - for (unsigned vtx = 0; vtx < 3; vtx++) { + for (unsigned vtx = 0; vtx < num_vertices; vtx++) { for (unsigned chan = 0; chan < 4; chan++) { unsigned index; if (chan == 0 || chan == 1) @@ -996,21 +990,30 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out /* Execute culling code. 
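
The thread packing described earlier in this hunk (no holes between unculled vertices) is a stream compaction: an exclusive prefix sum over per-thread accept flags gives every surviving vertex its new, densely packed thread index, and the total becomes the reduced ES thread count. A CPU-side sketch of the same idea, with hypothetical names, purely for illustration:

#include <stdint.h>

// accepted[i] is 0 or 1 for ES thread i; new_index[i] receives the packed
// slot of thread i; the return value is the surviving thread count.
static unsigned compact_threads(const uint8_t *accepted, unsigned *new_index,
                                unsigned num_threads)
{
   unsigned count = 0;
   for (unsigned i = 0; i < num_threads; i++) {
      new_index[i] = count;  // exclusive prefix sum: excludes thread i itself
      count += accepted[i];
   }
   return count;
}

On the GPU this sum runs in parallel across waves rather than as a serial loop.
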
*/ struct ac_cull_options options = {}; - options.cull_front = shader->key.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; - options.cull_back = shader->key.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; - options.cull_view_xy = shader->key.opt.ngg_culling & SI_NGG_CULL_VIEW_SMALLPRIMS; - options.cull_small_prims = options.cull_view_xy; - options.cull_zero_area = options.cull_front || options.cull_back; + options.cull_view_xy = true; options.cull_w = true; + if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES) { + options.num_vertices = 2; + + assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE)); + assert(!(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE)); + } else { + options.num_vertices = 3; + options.cull_front = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE; + options.cull_back = shader->key.ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE; + options.cull_small_prims = true; /* this would only be false with conservative rasterization */ + options.cull_zero_area = options.cull_front || options.cull_back; + } + /* Tell ES threads whether their vertex survived. */ LLVMValueRef params[] = { gs_accepted, (void*)gs_vtxptr, }; - ac_cull_triangle(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate, - small_prim_precision, &options, - gfx10_build_primitive_accepted, params); + ac_cull_primitive(&ctx->ac, pos, ctx->ac.i1true, vp_scale, vp_translate, + small_prim_precision, &options, + gfx10_build_primitive_accepted, params); } ac_build_endif(&ctx->ac, 16002); ac_build_s_barrier(&ctx->ac); @@ -1052,10 +1055,10 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out bool uses_instance_id = ctx->stage == MESA_SHADER_VERTEX && (sel->info.uses_instanceid || - shader->key.part.vs.prolog.instance_divisor_is_one || - shader->key.part.vs.prolog.instance_divisor_is_fetched); + shader->key.ge.part.vs.prolog.instance_divisor_is_one || + shader->key.ge.part.vs.prolog.instance_divisor_is_fetched); bool uses_tes_prim_id = ctx->stage == MESA_SHADER_TESS_EVAL && - (sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id); + (sel->info.uses_primid || shader->key.ge.mono.u.vs_export_prim_id); /* ES threads compute their prefix sum, which is the new ES thread ID. * Then they write the vertex position and input VGPRs into the LDS address @@ -1160,14 +1163,18 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out ac_build_ifcc(&ctx->ac, LLVMBuildTrunc(builder, gs_accepted, ctx->ac.i1, ""), 16011); { struct ac_ngg_prim prim = {}; - prim.num_vertices = 3; + prim.num_vertices = num_vertices; prim.isnull = ctx->ac.i1false; - for (unsigned vtx = 0; vtx < 3; vtx++) { + if (gfx10_edgeflags_have_effect(shader)) + prim.edgeflags = ac_pack_edgeflags_for_export(&ctx->ac, &ctx->args); + else + prim.edgeflags = ctx->ac.i32_0; + + for (unsigned vtx = 0; vtx < num_vertices; vtx++) { prim.index[vtx] = LLVMBuildLoad( builder, si_build_gep_i8(ctx, gs_vtxptr[vtx], lds_byte1_new_thread_id), ""); prim.index[vtx] = LLVMBuildZExt(builder, prim.index[vtx], ctx->ac.i32, ""); - prim.edgeflag[vtx] = ngg_get_initial_edgeflag(ctx, vtx); } /* Set the new GS input VGPR. 
*/ @@ -1237,11 +1244,11 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out val = LLVMBuildLoad(builder, new_vgpr0, ""); ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, ""); - vgpr++; /* gs_vtx23_offset */ + vgpr++; /* gs_vtx_offset[1] = offsets of vertices 2-3 */ ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); - vgpr++; /* gs_vtx45_offset */ + vgpr++; /* gs_vtx_offset[2] = offsets of vertices 4-5 */ /* Set the input VPGRs to the corresponding LDS addresses where the VGPR values are * stored. The VS prolog will load them. @@ -1270,8 +1277,8 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out } /* These two also use LDS. */ - if (sel->info.writes_edgeflag || - (ctx->stage == MESA_SHADER_VERTEX && shader->key.mono.u.vs_export_prim_id)) + if (gfx10_ngg_writes_user_edgeflags(shader) || + (ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) ac_build_s_barrier(&ctx->ac); ctx->return_value = ret; @@ -1280,21 +1287,22 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out /** * Emit the epilogue of an API VS or TES shader compiled as ESGS shader. */ -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader_selector *sel = ctx->shader->selector; struct si_shader_info *info = &sel->info; struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS]; LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef *addrs = abi->outputs; LLVMValueRef tmp, tmp2; assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); + assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS); LLVMValueRef vertex_ptr = NULL; - if (sel->so.num_outputs || sel->info.writes_edgeflag) + if (sel->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); for (unsigned i = 0; i < info->num_outputs; i++) { @@ -1315,7 +1323,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL } /* Store the edgeflag at the end (if streamout is enabled) */ - if (info->output_semantic[i] == VARYING_SLOT_EDGE && sel->info.writes_edgeflag) { + if (info->output_semantic[i] == VARYING_SLOT_EDGE && gfx10_ngg_writes_user_edgeflags(ctx->shader)) { LLVMValueRef edgeflag = LLVMBuildLoad(builder, addrs[4 * i], ""); /* The output is a float, but the hw expects a 1-bit integer. 
*/ edgeflag = LLVMBuildFPToUI(ctx->ac.builder, edgeflag, ctx->ac.i32, ""); @@ -1328,9 +1336,9 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL } bool unterminated_es_if_block = - !sel->so.num_outputs && !sel->info.writes_edgeflag && + !sel->so.num_outputs && !gfx10_ngg_writes_user_edgeflags(ctx->shader) && !ctx->screen->use_ngg_streamout && /* no query buffer */ - (ctx->stage != MESA_SHADER_VERTEX || !ctx->shader->key.mono.u.vs_export_prim_id); + (ctx->stage != MESA_SHADER_VERTEX || !ctx->shader->key.ge.mono.u.vs_export_prim_id); if (!unterminated_es_if_block) ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); @@ -1339,14 +1347,12 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL LLVMValueRef is_es_thread = si_is_es_thread(ctx); LLVMValueRef vtxindex[3]; - if (ctx->shader->key.opt.ngg_culling) { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 9); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 10, 9); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 20, 9); + if (ctx->shader->key.ge.opt.ngg_culling || gfx10_is_ngg_passthrough(ctx->shader)) { + for (unsigned i = 0; i < 3; ++i) + vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[0], 10 * i, 9); } else { - vtxindex[0] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 0, 16); - vtxindex[1] = si_unpack_param(ctx, ctx->gs_vtx01_offset, 16, 16); - vtxindex[2] = si_unpack_param(ctx, ctx->gs_vtx23_offset, 0, 16); + for (unsigned i = 0; i < 3; ++i) + vtxindex[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); } /* Determine the number of vertices per primitive. */ @@ -1372,7 +1378,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL LLVMValueRef user_edgeflags[3] = {}; - if (sel->info.writes_edgeflag) { + if (gfx10_ngg_writes_user_edgeflags(ctx->shader)) { assert(!unterminated_es_if_block); /* Streamout already inserted the barrier, so don't insert it again. */ @@ -1396,11 +1402,11 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL /* Copy Primitive IDs from GS threads to the LDS address corresponding * to the ES thread of the provoking vertex. */ - if (ctx->stage == MESA_SHADER_VERTEX && ctx->shader->key.mono.u.vs_export_prim_id) { + if (ctx->stage == MESA_SHADER_VERTEX && ctx->shader->key.ge.mono.u.vs_export_prim_id) { assert(!unterminated_es_if_block); /* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */ - if (sel->so.num_outputs || sel->info.writes_edgeflag) + if (sel->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader)) ac_build_s_barrier(&ctx->ac); ac_build_ifcc(&ctx->ac, is_gs_thread, 5400); @@ -1473,7 +1479,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL * load it from LDS. 
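
The two unpackings above reflect two VGPR encodings for the vertex indices. With NGG culling or passthrough, all three 9-bit indices share gs_vtx_offset[0], 10 bits apart (matching the prim-export layout); the legacy ESGS path packs two 16-bit offsets per VGPR, so vertex i lives in the low or high half of gs_vtx_offset[i / 2]. A scalar sketch of both decoders, with a hypothetical helper name:

#include <stdbool.h>
#include <stdint.h>

// Decode vertex index i (0..2) from the gs_vtx_offset VGPRs.
static uint32_t decode_vtx_index(const uint32_t vgpr[3], unsigned i,
                                 bool packed_passthrough)
{
   if (packed_passthrough)
      return (vgpr[0] >> (10 * i)) & 0x1ff;           // 9 bits at bit 10*i
   return (vgpr[i / 2] >> ((i & 1) * 16)) & 0xffff;   // 16-bit halves
}
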
*/ if (info->output_semantic[i] == VARYING_SLOT_POS && - ctx->shader->key.opt.ngg_culling) { + ctx->shader->key.ge.opt.ngg_culling) { vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx)); for (unsigned j = 0; j < 4; j++) { @@ -1489,7 +1495,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LL } } - if (ctx->shader->key.mono.u.vs_export_prim_id) { + if (ctx->shader->key.ge.mono.u.vs_export_prim_id) { outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID; if (ctx->stage == MESA_SHADER_VERTEX) { @@ -1913,11 +1919,11 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx) tmp = ngg_gs_vertex_ptr(ctx, tid); flags = LLVMBuildLoad(builder, ngg_gs_get_emit_primflag_ptr(ctx, tmp, 0), ""); prim.isnull = LLVMBuildNot(builder, LLVMBuildTrunc(builder, flags, ctx->ac.i1, ""), ""); + prim.edgeflags = ctx->ac.i32_0; for (unsigned i = 0; i < verts_per_prim; ++i) { prim.index[i] = LLVMBuildSub(builder, vertlive_scan.result_exclusive, LLVMConstInt(ctx->ac.i32, verts_per_prim - i - 1, false), ""); - prim.edgeflag[i] = ctx->ac.i1false; } /* Geometry shaders output triangle strips, but NGG expects triangles. */ @@ -1994,7 +2000,7 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel; const gl_shader_stage gs_stage = gs_sel->info.stage; const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); - const unsigned input_prim = si_get_input_prim(gs_sel); + const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key); const bool use_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; const unsigned max_verts_per_prim = u_vertices_per_prim(input_prim); @@ -2014,14 +2020,6 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) unsigned max_gsprims_base = gs_sel->screen->ngg_subgroup_size; /* default prim group size clamp */ unsigned max_esverts_base = gs_sel->screen->ngg_subgroup_size; - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - /* All lanes are filled in wave32. */ - max_gsprims_base = ROUND_DOWN_TO(max_gsprims_base / 3, 32); - max_esverts_base = max_gsprims_base * 3; - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - max_gsprims_base = max_esverts_base - 2; - } - if (gs_stage == MESA_SHADER_GEOMETRY) { bool force_multi_cycling = false; unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations; @@ -2151,28 +2149,6 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) prim_amp_factor = gs_sel->info.base.gs.vertices_out; } - /* Fix up the thread counts for fast launch. */ - if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST) { - /* The vertex count must be a multiple of 3. */ - max_esverts -= max_esverts % 3; - /* We can only decrease the size, not increase it. */ - if (max_gsprims * 3 < max_esverts) { - max_esverts = max_gsprims * 3; - } else { - max_gsprims = max_esverts / 3; - } - } else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP) { - /* The primitive count must be even to get correct winding for triangle strips. 
*/ - max_gsprims &= ~1; - if (max_gsprims - 2 < max_esverts) { - max_esverts = max_gsprims + 2; - } else { - max_gsprims = max_esverts - 2; - max_gsprims &= ~1; - max_esverts = max_gsprims + 2; - } - } - shader->ngg.hw_max_esverts = max_esverts; shader->ngg.max_gsprims = max_gsprims; shader->ngg.max_out_verts = max_out_vertices; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/meson.build b/mesa 3D driver/src/gallium/drivers/radeonsi/meson.build index 4b734d2b1e..88d8ccc9d8 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/meson.build +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/meson.build @@ -27,7 +27,6 @@ files_libradeonsi = files( 'si_build_pm4.h', 'si_clear.c', 'si_compute.c', - 'si_compute_prim_discard.c', 'si_compute.h', 'si_compute_blit.c', 'si_cp_dma.c', @@ -47,6 +46,7 @@ files_libradeonsi = files( 'si_query.c', 'si_query.h', 'si_nir_optim.c', + 'si_sdma_copy_image.c', 'si_shader.c', 'si_shader.h', 'si_shader_internal.h', @@ -64,7 +64,7 @@ files_libradeonsi = files( 'si_state.h', 'si_state_binning.c', 'si_state_msaa.c', - 'si_state_shaders.c', + 'si_state_shaders.cpp', 'si_state_streamout.c', 'si_state_viewport.c', 'si_test_blit.c', diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_blit.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_blit.c index bf95fa6c7c..a51ff789b5 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_blit.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_blit.c @@ -98,11 +98,13 @@ void si_blitter_end(struct si_context *sctx) /* Restore shader pointers because the VS blit shader changed all * non-global VS user SGPRs. */ sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX); + + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen); sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL && sctx->num_vertex_elements > - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; sctx->vertex_buffer_user_sgprs_dirty = sctx->num_vertex_elements > 0 && - sctx->screen->num_vbos_in_user_sgprs; + num_vbos_in_user_sgprs; si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); } @@ -1027,7 +1029,7 @@ void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst /* Copy. */ si_blitter_begin(sctx, SI_COPY); util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox, src_view, src_box, src_width0, - src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false); + src_height0, PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, false, false); si_blitter_end(sctx); pipe_surface_reference(&dst_view, NULL); @@ -1203,11 +1205,47 @@ static bool do_hardware_msaa_resolve(struct pipe_context *ctx, const struct pipe static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) { struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *sdst = (struct si_texture *)info->dst.resource; if (do_hardware_msaa_resolve(ctx, info)) { return; } + if ((info->dst.resource->bind & PIPE_BIND_DRI_PRIME) && sdst->surface.is_linear && + sctx->chip_class >= GFX7 && sdst->surface.flags & RADEON_SURF_IMPORTED) { + struct si_texture *ssrc = (struct si_texture *)info->src.resource; + /* Use SDMA or async compute when copying to a DRI_PRIME imported linear surface. 
*/ + bool async_copy = info->dst.box.x == 0 && info->dst.box.y == 0 && info->dst.box.z == 0 && + info->src.box.x == 0 && info->src.box.y == 0 && info->src.box.z == 0 && + info->dst.level == 0 && info->src.level == 0 && + info->src.box.width == info->dst.resource->width0 && + info->src.box.height == info->dst.resource->height0 && + info->src.box.depth == 1 && util_can_blit_via_copy_region(info, true); + /* Try SDMA first... */ + if (async_copy && si_sdma_copy_image(sctx, sdst, ssrc)) + return; + + /* ... and use async compute as the fallback. */ + if (async_copy) { + struct si_screen *sscreen = sctx->screen; + + simple_mtx_lock(&sscreen->async_compute_context_lock); + if (!sscreen->async_compute_context) + si_init_aux_async_compute_ctx(sscreen); + + if (sscreen->async_compute_context) { + si_compute_copy_image((struct si_context*)sctx->screen->async_compute_context, + info->dst.resource, 0, info->src.resource, 0, 0, 0, 0, + &info->src.box, false, 0); + si_flush_gfx_cs((struct si_context*)sctx->screen->async_compute_context, 0, NULL); + simple_mtx_unlock(&sscreen->async_compute_context_lock); + return; + } + + simple_mtx_unlock(&sscreen->async_compute_context_lock); + } + } + if (unlikely(sctx->thread_trace_enabled)) sctx->sqtt_next_event = EventCmdCopyImage; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_buffer.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_buffer.c index c958ab8acd..027304a376 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_buffer.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_buffer.c @@ -145,25 +145,18 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, res->flags |= RADEON_FLAG_UNCACHED; /* Set expected VRAM and GART usage for the buffer. */ - res->vram_usage_kb = 0; - res->gart_usage_kb = 0; + res->memory_usage_kb = MAX2(1, size / 1024); if (res->domains & RADEON_DOMAIN_VRAM) { - res->vram_usage_kb = MAX2(1, size / 1024); - /* We don't want to evict buffers from VRAM by mapping them for CPU access, * because they might never be moved back again. If a buffer is large enough, - * upload data by copying from a temporary GTT buffer. 8K might not seem much, - * but there can be 100000 buffers. - * - * This tweak improves performance for viewperf creo & snx. + * upload data by copying from a temporary GTT buffer. */ if (!sscreen->info.smart_access_memory && sscreen->info.has_dedicated_vram && - size >= 8196) + !res->b.cpu_storage && /* TODO: The CPU storage breaks this. 
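
Restated as a standalone predicate (a sketch; SI_MAX_VRAM_MAP_SIZE is taken as a parameter here since only its name appears in this hunk): a VRAM buffer is flagged PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY, forcing uploads to stage through GTT, only when the board has dedicated VRAM without Smart Access Memory, threaded-context CPU storage is not in use, and the buffer is large enough for a direct map to risk evicting it from VRAM for good.

#include <stdbool.h>
#include <stdint.h>

// Mirrors the condition below; all inputs are plain values for illustration.
static bool avoid_direct_vram_map(bool smart_access_memory, bool dedicated_vram,
                                  bool cpu_storage, uint64_t size,
                                  uint64_t max_vram_map_size)
{
   return !smart_access_memory && dedicated_vram && !cpu_storage &&
          size >= max_vram_map_size;
}
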
*/ + size >= SI_MAX_VRAM_MAP_SIZE) res->b.b.flags |= PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY; - } else if (res->domains & RADEON_DOMAIN_GTT) { - res->gart_usage_kb = MAX2(1, size / 1024); } } @@ -223,12 +216,12 @@ static void si_resource_destroy(struct pipe_screen *screen, struct pipe_resource util_range_destroy(&buffer->valid_buffer_range); radeon_bo_reference(((struct si_screen*)screen)->ws, &buffer->buf, NULL); util_idalloc_mt_free(&sscreen->buffer_ids, buffer->b.buffer_id_unique); - FREE(buffer); + FREE_CL(buffer); } else if (buf->flags & SI_RESOURCE_AUX_PLANE) { struct si_auxiliary_texture *tex = (struct si_auxiliary_texture *)buf; radeon_bo_reference(((struct si_screen*)screen)->ws, &tex->buffer, NULL); - FREE(tex); + FREE_CL(tex); } else { struct si_texture *tex = (struct si_texture *)buf; struct si_resource *resource = &tex->buffer; @@ -239,7 +232,7 @@ static void si_resource_destroy(struct pipe_screen *screen, struct pipe_resource si_resource_reference(&tex->cmask_buffer, NULL); } radeon_bo_reference(((struct si_screen*)screen)->ws, &resource->buf, NULL); - FREE(tex); + FREE_CL(tex); } } @@ -292,8 +285,7 @@ void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *d sdst->b.b.bind = ssrc->b.b.bind; sdst->flags = ssrc->flags; - assert(sdst->vram_usage_kb == ssrc->vram_usage_kb); - assert(sdst->gart_usage_kb == ssrc->gart_usage_kb); + assert(sdst->memory_usage_kb == ssrc->memory_usage_kb); assert(sdst->bo_size == ssrc->bo_size); assert(sdst->bo_alignment_log2 == ssrc->bo_alignment_log2); assert(sdst->domains == ssrc->domains); @@ -564,18 +556,17 @@ static void si_buffer_subdata(struct pipe_context *ctx, struct pipe_resource *bu } static struct si_resource *si_alloc_buffer_struct(struct pipe_screen *screen, - const struct pipe_resource *templ) + const struct pipe_resource *templ, + bool allow_cpu_storage) { - struct si_resource *buf; - - buf = MALLOC_STRUCT(si_resource); + struct si_resource *buf = MALLOC_STRUCT_CL(si_resource); buf->b.b = *templ; buf->b.b.next = NULL; pipe_reference_init(&buf->b.b.reference, 1); buf->b.b.screen = screen; - threaded_resource_init(&buf->b.b); + threaded_resource_init(&buf->b.b, allow_cpu_storage, SI_MAP_BUFFER_ALIGNMENT); buf->buf = NULL; buf->bind_history = 0; @@ -588,7 +579,9 @@ static struct pipe_resource *si_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ, unsigned alignment) { struct si_screen *sscreen = (struct si_screen *)screen; - struct si_resource *buf = si_alloc_buffer_struct(screen, templ); + struct si_resource *buf = + si_alloc_buffer_struct(screen, templ, + templ->width0 <= sscreen->options.tc_max_cpu_storage_size); if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE; @@ -598,13 +591,13 @@ static struct pipe_resource *si_buffer_create(struct pipe_screen *screen, if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE) buf->flags |= RADEON_FLAG_SPARSE; + buf->b.buffer_id_unique = util_idalloc_mt_alloc(&sscreen->buffer_ids); + if (!si_alloc_resource(sscreen, buf)) { - threaded_resource_deinit(&buf->b.b); - FREE(buf); + si_resource_destroy(screen, &buf->b.b); return NULL; } - buf->b.buffer_id_unique = util_idalloc_mt_alloc(&sscreen->buffer_ids); return &buf->b.b; } @@ -638,7 +631,7 @@ static struct pipe_resource *si_buffer_from_user_memory(struct pipe_screen *scre { struct si_screen *sscreen = (struct si_screen *)screen; struct radeon_winsys *ws = sscreen->ws; - struct si_resource *buf = si_alloc_buffer_struct(screen, templ); + struct si_resource *buf = 
si_alloc_buffer_struct(screen, templ, false); buf->domains = RADEON_DOMAIN_GTT; buf->flags = 0; @@ -646,18 +639,17 @@ static struct pipe_resource *si_buffer_from_user_memory(struct pipe_screen *scre util_range_add(&buf->b.b, &buf->valid_buffer_range, 0, templ->width0); util_range_add(&buf->b.b, &buf->b.valid_buffer_range, 0, templ->width0); + buf->b.buffer_id_unique = util_idalloc_mt_alloc(&sscreen->buffer_ids); + /* Convert a user pointer to a buffer. */ buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0); if (!buf->buf) { - threaded_resource_deinit(&buf->b.b); - FREE(buf); + si_resource_destroy(screen, &buf->b.b); return NULL; } buf->gpu_address = ws->buffer_get_virtual_address(buf->buf); - buf->vram_usage_kb = 0; - buf->gart_usage_kb = templ->width0 / 1024; - buf->b.buffer_id_unique = util_idalloc_mt_alloc(&sscreen->buffer_ids); + buf->memory_usage_kb = templ->width0 / 1024; return &buf->b.b; } @@ -667,7 +659,7 @@ struct pipe_resource *si_buffer_from_winsys_buffer(struct pipe_screen *screen, bool dedicated) { struct si_screen *sscreen = (struct si_screen *)screen; - struct si_resource *res = si_alloc_buffer_struct(screen, templ); + struct si_resource *res = si_alloc_buffer_struct(screen, templ, false); if (!res) return 0; @@ -678,10 +670,7 @@ struct pipe_resource *si_buffer_from_winsys_buffer(struct pipe_screen *screen, res->bo_alignment_log2 = imported_buf->alignment_log2; res->domains = sscreen->ws->buffer_get_initial_domain(res->buf); - if (res->domains & RADEON_DOMAIN_VRAM) - res->vram_usage_kb = MAX2(1, res->bo_size / 1024); - else if (res->domains & RADEON_DOMAIN_GTT) - res->gart_usage_kb = MAX2(1, res->bo_size / 1024); + res->memory_usage_kb = MAX2(1, res->bo_size / 1024); if (sscreen->ws->buffer_get_flags) res->flags = sscreen->ws->buffer_get_flags(res->buf); diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_build_pm4.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_build_pm4.h index e08ffe2f30..66589d6a0b 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -58,7 +58,7 @@ __cs = NULL; \ } while (0) -#define radeon_emit(cs, value) __cs_buf[__cs_num++] = (value) +#define radeon_emit(value) __cs_buf[__cs_num++] = (value) #define radeon_packets_added() (__cs_num != __cs_num_initial) #define radeon_end_update_context_roll(sctx) do { \ @@ -67,79 +67,79 @@ (sctx)->context_roll = true; \ } while (0) -#define radeon_emit_array(cs, values, num) do { \ +#define radeon_emit_array(values, num) do { \ unsigned __n = (num); \ memcpy(__cs_buf + __cs_num, (values), __n * 4); \ __cs_num += __n; \ } while (0) -#define radeon_set_config_reg_seq(cs, reg, num) do { \ +#define radeon_set_config_reg_seq(reg, num) do { \ SI_CHECK_SHADOWED_REGS(reg, num); \ assert((reg) < SI_CONTEXT_REG_OFFSET); \ - radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); \ - radeon_emit(cs, ((reg) - SI_CONFIG_REG_OFFSET) >> 2); \ + radeon_emit(PKT3(PKT3_SET_CONFIG_REG, num, 0)); \ + radeon_emit(((reg) - SI_CONFIG_REG_OFFSET) >> 2); \ } while (0) -#define radeon_set_config_reg(cs, reg, value) do { \ - radeon_set_config_reg_seq(cs, reg, 1); \ - radeon_emit(cs, value); \ +#define radeon_set_config_reg(reg, value) do { \ + radeon_set_config_reg_seq(reg, 1); \ + radeon_emit(value); \ } while (0) -#define radeon_set_context_reg_seq(cs, reg, num) do { \ +#define radeon_set_context_reg_seq(reg, num) do { \ SI_CHECK_SHADOWED_REGS(reg, num); \ assert((reg) >= SI_CONTEXT_REG_OFFSET); \ - radeon_emit(cs, 
PKT3(PKT3_SET_CONTEXT_REG, num, 0)); \ - radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \ + radeon_emit(PKT3(PKT3_SET_CONTEXT_REG, num, 0)); \ + radeon_emit(((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \ } while (0) -#define radeon_set_context_reg(cs, reg, value) do { \ - radeon_set_context_reg_seq(cs, reg, 1); \ - radeon_emit(cs, value); \ +#define radeon_set_context_reg(reg, value) do { \ + radeon_set_context_reg_seq(reg, 1); \ + radeon_emit(value); \ } while (0) -#define radeon_set_context_reg_seq_array(cs, reg, num, values) do { \ - radeon_set_context_reg_seq(cs, reg, num); \ - radeon_emit_array(cs, values, num); \ +#define radeon_set_context_reg_seq_array(reg, num, values) do { \ + radeon_set_context_reg_seq(reg, num); \ + radeon_emit_array(values, num); \ } while (0) -#define radeon_set_context_reg_idx(cs, reg, idx, value) do { \ +#define radeon_set_context_reg_idx(reg, idx, value) do { \ SI_CHECK_SHADOWED_REGS(reg, 1); \ assert((reg) >= SI_CONTEXT_REG_OFFSET); \ - radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); \ - radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2 | ((idx) << 28)); \ - radeon_emit(cs, value); \ + radeon_emit(PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); \ + radeon_emit(((reg) - SI_CONTEXT_REG_OFFSET) >> 2 | ((idx) << 28)); \ + radeon_emit(value); \ } while (0) -#define radeon_set_sh_reg_seq(cs, reg, num) do { \ +#define radeon_set_sh_reg_seq(reg, num) do { \ SI_CHECK_SHADOWED_REGS(reg, num); \ assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \ - radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); \ - radeon_emit(cs, ((reg) - SI_SH_REG_OFFSET) >> 2); \ + radeon_emit(PKT3(PKT3_SET_SH_REG, num, 0)); \ + radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \ } while (0) -#define radeon_set_sh_reg(cs, reg, value) do { \ - radeon_set_sh_reg_seq(cs, reg, 1); \ - radeon_emit(cs, value); \ +#define radeon_set_sh_reg(reg, value) do { \ + radeon_set_sh_reg_seq(reg, 1); \ + radeon_emit(value); \ } while (0) -#define radeon_set_uconfig_reg_seq(cs, reg, num, perfctr) do { \ +#define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \ SI_CHECK_SHADOWED_REGS(reg, num); \ assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \ - radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \ - radeon_emit(cs, ((reg) - CIK_UCONFIG_REG_OFFSET) >> 2); \ + radeon_emit(PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \ + radeon_emit(((reg) - CIK_UCONFIG_REG_OFFSET) >> 2); \ } while (0) -#define radeon_set_uconfig_reg(cs, reg, value) do { \ - radeon_set_uconfig_reg_seq(cs, reg, 1, false); \ - radeon_emit(cs, value); \ +#define radeon_set_uconfig_reg(reg, value) do { \ + radeon_set_uconfig_reg_seq(reg, 1, false); \ + radeon_emit(value); \ } while (0) -#define radeon_set_uconfig_reg_perfctr(cs, reg, value) do { \ - radeon_set_uconfig_reg_seq(cs, reg, 1, true); \ - radeon_emit(cs, value); \ +#define radeon_set_uconfig_reg_perfctr(reg, value) do { \ + radeon_set_uconfig_reg_seq(reg, 1, true); \ + radeon_emit(value); \ } while (0) -#define radeon_set_uconfig_reg_idx(cs, screen, chip_class, reg, idx, value) do { \ +#define radeon_set_uconfig_reg_idx(screen, chip_class, reg, idx, value) do { \ SI_CHECK_SHADOWED_REGS(reg, 1); \ assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \ assert((idx) != 0); \ @@ -147,31 +147,9 @@ if ((chip_class) < GFX9 || \ ((chip_class) == GFX9 && (screen)->info.me_fw_version < 26)) \ __opcode = PKT3_SET_UCONFIG_REG; \ - radeon_emit(cs, PKT3(__opcode, 1, 0)); \ - radeon_emit(cs, ((reg) - CIK_UCONFIG_REG_OFFSET) >> 2 | ((idx) << 28)); \ - 
radeon_emit(cs, value); \ -} while (0) - -#define radeon_set_context_reg_rmw(cs, reg, value, mask) do { \ - SI_CHECK_SHADOWED_REGS(reg, 1); \ - assert((reg) >= SI_CONTEXT_REG_OFFSET); \ - radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); \ - radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \ - radeon_emit(cs, mask); \ - radeon_emit(cs, value); \ -} while (0) - -/* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */ -#define radeon_opt_set_context_reg_rmw(sctx, offset, reg, val, mask) do { \ - unsigned __value = (val); \ - assert((__value & ~mask) == 0); \ - __value &= mask; \ - if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.reg_value[reg] != __value) { \ - radeon_set_context_reg_rmw(&sctx->gfx_cs, offset, __value, mask); \ - sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \ - sctx->tracked_regs.reg_value[reg] = __value; \ - } \ + radeon_emit(PKT3(__opcode, 1, 0)); \ + radeon_emit(((reg) - CIK_UCONFIG_REG_OFFSET) >> 2 | ((idx) << 28)); \ + radeon_emit(value); \ } while (0) /* Emit PKT3_SET_CONTEXT_REG if the register value is different. */ @@ -179,7 +157,7 @@ unsigned __value = val; \ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ sctx->tracked_regs.reg_value[reg] != __value) { \ - radeon_set_context_reg(&sctx->gfx_cs, offset, __value); \ + radeon_set_context_reg(offset, __value); \ sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \ sctx->tracked_regs.reg_value[reg] = __value; \ } \ @@ -196,9 +174,9 @@ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x3) != 0x3 || \ sctx->tracked_regs.reg_value[reg] != __value1 || \ sctx->tracked_regs.reg_value[(reg) + 1] != __value2) { \ - radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 2); \ - radeon_emit(cs, __value1); \ - radeon_emit(cs, __value2); \ + radeon_set_context_reg_seq(offset, 2); \ + radeon_emit(__value1); \ + radeon_emit(__value2); \ sctx->tracked_regs.reg_value[reg] = __value1; \ sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \ sctx->tracked_regs.reg_saved |= 0x3ull << (reg); \ @@ -214,10 +192,10 @@ sctx->tracked_regs.reg_value[reg] != __value1 || \ sctx->tracked_regs.reg_value[(reg) + 1] != __value2 || \ sctx->tracked_regs.reg_value[(reg) + 2] != __value3) { \ - radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 3); \ - radeon_emit(cs, __value1); \ - radeon_emit(cs, __value2); \ - radeon_emit(cs, __value3); \ + radeon_set_context_reg_seq(offset, 3); \ + radeon_emit(__value1); \ + radeon_emit(__value2); \ + radeon_emit(__value3); \ sctx->tracked_regs.reg_value[reg] = __value1; \ sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \ sctx->tracked_regs.reg_value[(reg) + 2] = __value3; \ @@ -235,11 +213,11 @@ sctx->tracked_regs.reg_value[(reg) + 1] != __value2 || \ sctx->tracked_regs.reg_value[(reg) + 2] != __value3 || \ sctx->tracked_regs.reg_value[(reg) + 3] != __value4) { \ - radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 4); \ - radeon_emit(cs, __value1); \ - radeon_emit(cs, __value2); \ - radeon_emit(cs, __value3); \ - radeon_emit(cs, __value4); \ + radeon_set_context_reg_seq(offset, 4); \ + radeon_emit(__value1); \ + radeon_emit(__value2); \ + radeon_emit(__value3); \ + radeon_emit(__value4); \ sctx->tracked_regs.reg_value[reg] = __value1; \ sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \ sctx->tracked_regs.reg_value[(reg) + 2] = __value3; \ @@ -252,37 +230,53 @@ * Set consecutive registers if any register's value is different.
*/ #define radeon_opt_set_context_regn(sctx, offset, value, saved_val, num) do { \ - for (unsigned i = 0; i < (num); i++) { \ - if ((saved_val)[i] != (value)[i]) { \ - radeon_set_context_reg_seq(&(sctx)->gfx_cs, offset, num); \ - for (unsigned j = 0; j < (num); j++) \ - radeon_emit(cs, value[j]); \ - memcpy(saved_val, value, sizeof(uint32_t) * (num)); \ - break; \ - } \ + if (memcmp(value, saved_val, sizeof(uint32_t) * (num))) { \ + radeon_set_context_reg_seq(offset, num); \ + radeon_emit_array(value, num); \ + memcpy(saved_val, value, sizeof(uint32_t) * (num)); \ } \ } while (0) -#define radeon_set_privileged_config_reg(cs, reg, value) do { \ - assert((reg) < CIK_UCONFIG_REG_OFFSET); \ - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \ - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | \ - COPY_DATA_DST_SEL(COPY_DATA_PERF)); \ - radeon_emit(cs, value); \ - radeon_emit(cs, 0); /* unused */ \ - radeon_emit(cs, (reg) >> 2); \ - radeon_emit(cs, 0); /* unused */ \ +#define radeon_opt_set_sh_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_sh_reg(offset, __value); \ + sctx->tracked_regs.reg_saved |= BITFIELD64_BIT(reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ } while (0) -#define radeon_emit_32bit_pointer(sscreen, cs, va) do { \ - radeon_emit(cs, va); \ +#define radeon_opt_set_uconfig_reg(sctx, offset, reg, val) do { \ + unsigned __value = val; \ + if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \ + sctx->tracked_regs.reg_value[reg] != __value) { \ + radeon_set_uconfig_reg(offset, __value); \ + sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \ + sctx->tracked_regs.reg_value[reg] = __value; \ + } \ +} while (0) + +#define radeon_set_privileged_config_reg(reg, value) do { \ + assert((reg) < CIK_UCONFIG_REG_OFFSET); \ + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); \ + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | \ + COPY_DATA_DST_SEL(COPY_DATA_PERF)); \ + radeon_emit(value); \ + radeon_emit(0); /* unused */ \ + radeon_emit((reg) >> 2); \ + radeon_emit(0); /* unused */ \ +} while (0) + +#define radeon_emit_32bit_pointer(sscreen, va) do { \ + radeon_emit(va); \ assert((va) == 0 || ((va) >> 32) == sscreen->info.address32_hi); \ } while (0) #define radeon_emit_one_32bit_pointer(sctx, desc, sh_base) do { \ unsigned sh_offset = (sh_base) + (desc)->shader_userdata_offset; \ - radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, 1); \ - radeon_emit_32bit_pointer(sctx->screen, cs, (desc)->gpu_address); \ + radeon_set_sh_reg_seq(sh_offset, 1); \ + radeon_emit_32bit_pointer(sctx->screen, (desc)->gpu_address); \ } while (0) /* This should be evaluated at compile time if all parameters are constants. 
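
Call sites throughout this patch drop the cs argument because the emit helpers above now write through __cs_buf and __cs_num, locals that radeon_begin() evidently sets up and radeon_end() flushes back (the __cs_num_initial bookkeeping in radeon_packets_added suggests as much). A usage sketch under that assumption; the register and field values are placeholders borrowed from call sites elsewhere in this patch:

// Assumes si_build_pm4.h and the usual radeonsi headers are in scope.
static void emit_tmpring_example(struct si_context *sctx)
{
   radeon_begin(&sctx->gfx_cs);  // declares the __cs_buf / __cs_num locals
   radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE,   // header + offset + value
                     S_00B860_WAVES(0) | S_00B860_WAVESIZE(0));
   radeon_end();                 // writes the final dword count back
}

One effect of the change is that every emit inside a radeon_begin/radeon_end bracket now targets the same buffer by construction.
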
*/ diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_clear.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_clear.c index 48283c0340..b8cf49a6b4 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_clear.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_clear.c @@ -1074,7 +1074,8 @@ static void si_clear_render_target(struct pipe_context *ctx, struct pipe_surface struct si_context *sctx = (struct si_context *)ctx; struct si_texture *sdst = (struct si_texture *)dst->texture; - if (dst->texture->nr_samples <= 1 && !vi_dcc_enabled(sdst, dst->u.tex.level)) { + if (dst->texture->nr_samples <= 1 && + (sctx->chip_class >= GFX10 || !vi_dcc_enabled(sdst, dst->u.tex.level))) { si_compute_clear_render_target(ctx, dst, color, dstx, dsty, width, height, render_condition_enabled); return; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute.c index 37dc9cd0cb..0ae232db27 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute.c @@ -367,11 +367,14 @@ static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsi void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs) { radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); + radeon_set_sh_reg(R_00B834_COMPUTE_PGM_HI, + S_00B834_DATA(sctx->screen->info.address32_hi >> 8)); + + radeon_set_sh_reg_seq(R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2); /* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1, * renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */ - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); if (sctx->chip_class == GFX6) { /* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID @@ -381,25 +384,25 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf * TODO: This should be: * (number of compute units) * 4 * (waves per simd) - 1 */ - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); + radeon_set_sh_reg(R_00B82C_COMPUTE_MAX_WAVE_ID, 0x190 /* Default value */); if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) { uint64_t bc_va = sctx->border_color_buffer->gpu_address; - radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); + radeon_set_config_reg(R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8); } } if (sctx->chip_class >= GFX7) { /* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */ - radeon_set_sh_reg_seq(cs, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); - radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_set_sh_reg_seq(R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); + radeon_emit(S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff)); /* Disable profiling on compute queues. 
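
The register-write consolidations in this file (for example the five COMPUTE_USER_ACCUM_0..COMPUTE_PGM_RSRC3 writes folded into one radeon_set_sh_reg_seq just below) save command-buffer space: with the PKT3 layout the macros above emit, each packet costs one header dword plus one register-offset dword before its values. A toy helper for that arithmetic, illustration only:

// Dwords needed to set num_values consecutive registers using num_packets
// SET_*_REG packets: two overhead dwords per packet, one dword per value.
static unsigned pkt3_set_reg_dwords(unsigned num_packets, unsigned num_values)
{
   return 2 * num_packets + num_values;
}

// Five separate writes:  pkt3_set_reg_dwords(5, 5) == 15 dwords.
// One consolidated seq:  pkt3_set_reg_dwords(1, 5) ==  7 dwords.
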
*/ if (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics) { - radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); - radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); + radeon_set_sh_reg(R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0); + radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, 0); } /* Set the pointer to border colors. */ @@ -407,9 +410,9 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf if (sctx->border_color_buffer) { uint64_t bc_va = sctx->border_color_buffer->gpu_address; - radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2, false); - radeon_emit(cs, bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ - radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ + radeon_set_uconfig_reg_seq(R_030E00_TA_CS_BC_BASE_ADDR, 2, false); + radeon_emit(bc_va >> 8); /* R_030E00_TA_CS_BC_BASE_ADDR */ + radeon_emit(S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */ } } @@ -418,17 +421,19 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf */ if (sctx->chip_class >= GFX9 && (cs != &sctx->gfx_cs || !sctx->screen->info.has_graphics)) { - radeon_set_uconfig_reg(cs, R_0301EC_CP_COHER_START_DELAY, + radeon_set_uconfig_reg(R_0301EC_CP_COHER_START_DELAY, sctx->chip_class >= GFX10 ? 0x20 : 0); } if (sctx->chip_class >= GFX10) { - radeon_set_sh_reg(cs, R_00B890_COMPUTE_USER_ACCUM_0, 0); - radeon_set_sh_reg(cs, R_00B894_COMPUTE_USER_ACCUM_1, 0); - radeon_set_sh_reg(cs, R_00B898_COMPUTE_USER_ACCUM_2, 0); - radeon_set_sh_reg(cs, R_00B89C_COMPUTE_USER_ACCUM_3, 0); - radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0); - radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); + radeon_set_sh_reg_seq(R_00B890_COMPUTE_USER_ACCUM_0, 5); + radeon_emit(0); /* R_00B890_COMPUTE_USER_ACCUM_0 */ + radeon_emit(0); /* R_00B894_COMPUTE_USER_ACCUM_1 */ + radeon_emit(0); /* R_00B898_COMPUTE_USER_ACCUM_2 */ + radeon_emit(0); /* R_00B89C_COMPUTE_USER_ACCUM_3 */ + radeon_emit(0); /* R_00B8A0_COMPUTE_PGM_RSRC3 */ + + radeon_set_sh_reg(R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0); } radeon_end(); } @@ -533,13 +538,11 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute RADEON_PRIO_SHADER_BINARY); radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + radeon_set_sh_reg(R_00B830_COMPUTE_PGM_LO, shader_va >> 8); - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit(cs, config->rsrc1); - radeon_emit(cs, config->rsrc2); + radeon_set_sh_reg_seq(R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(config->rsrc1); + radeon_emit(config->rsrc2); COMPUTE_DBG(sctx->screen, "COMPUTE_PGM_RSRC1: 0x%08x " @@ -549,7 +552,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute sctx->max_seen_compute_scratch_bytes_per_wave = MAX2(sctx->max_seen_compute_scratch_bytes_per_wave, config->scratch_bytes_per_wave); - radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + radeon_set_sh_reg(R_00B860_COMPUTE_TMPRING_SIZE, S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10)); radeon_end(); @@ -592,11 +595,11 @@ static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx, } radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); - radeon_emit(cs, scratch_dword0); - radeon_emit(cs, scratch_dword1); - radeon_emit(cs, scratch_dword2); - 
radeon_emit(cs, scratch_dword3); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); + radeon_emit(scratch_dword0); + radeon_emit(scratch_dword1); + radeon_emit(scratch_dword2); + radeon_emit(scratch_dword3); radeon_end(); } @@ -656,9 +659,9 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ dispatch_va = dispatch_buf->gpu_address + dispatch_offset; - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(cs, dispatch_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(dispatch_va); + radeon_emit(S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); si_resource_reference(&dispatch_buf, NULL); user_sgpr += 2; @@ -666,16 +669,16 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_ if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(cs, kernel_args_va); - radeon_emit(cs, S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); + radeon_emit(kernel_args_va); + radeon_emit(S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); user_sgpr += 2; } for (i = 0; i < 3 && user_sgpr < 16; i++) { if (code_object->code_properties & workgroup_count_masks[i]) { - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); - radeon_emit(cs, info->grid[i]); + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); + radeon_emit(info->grid[i]); user_sgpr += 1; } } @@ -740,21 +743,21 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr } radeon_begin_again(cs); } else { - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); + radeon_set_sh_reg_seq(grid_size_reg, 3); + radeon_emit(info->grid[0]); + radeon_emit(info->grid[1]); + radeon_emit(info->grid[2]); } } if (sel->info.uses_variable_block_size) { - radeon_set_sh_reg(cs, block_size_reg, + radeon_set_sh_reg(block_size_reg, info->block[0] | (info->block[1] << 10) | (info->block[2] << 20)); } if (sel->info.base.cs.user_data_components_amd) { - radeon_set_sh_reg_seq(cs, cs_user_data_reg, sel->info.base.cs.user_data_components_amd); - radeon_emit_array(cs, sctx->cs_user_data, sel->info.base.cs.user_data_components_amd); + radeon_set_sh_reg_seq(cs_user_data_reg, sel->info.base.cs.user_data_components_amd); + radeon_emit_array(sctx->cs_user_data, sel->info.base.cs.user_data_components_amd); } radeon_end(); } @@ -780,7 +783,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ radeon_begin(cs); radeon_set_sh_reg( - cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + R_00B854_COMPUTE_RESOURCE_LIMITS, ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup, sctx->cs_max_waves_per_sh, threadgroups_per_cu)); @@ -793,7 +796,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ const uint *last_block = info->last_block; bool partial_block_en = last_block[0] || last_block[1] || last_block[2]; - radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); + radeon_set_sh_reg_seq(R_00B81C_COMPUTE_NUM_THREAD_X, 3); if (partial_block_en) { unsigned 
partial[3]; @@ -803,18 +806,18 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ partial[1] = last_block[1] ? last_block[1] : info->block[1]; partial[2] = last_block[2] ? last_block[2] : info->block[2]; - radeon_emit( - cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) | S_00B81C_NUM_THREAD_PARTIAL(partial[0])); - radeon_emit( - cs, S_00B820_NUM_THREAD_FULL(info->block[1]) | S_00B820_NUM_THREAD_PARTIAL(partial[1])); - radeon_emit( - cs, S_00B824_NUM_THREAD_FULL(info->block[2]) | S_00B824_NUM_THREAD_PARTIAL(partial[2])); + radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0]) | + S_00B81C_NUM_THREAD_PARTIAL(partial[0])); + radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1]) | + S_00B820_NUM_THREAD_PARTIAL(partial[1])); + radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2]) | + S_00B824_NUM_THREAD_PARTIAL(partial[2])); dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1); } else { - radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0])); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1])); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2])); + radeon_emit(S_00B81C_NUM_THREAD_FULL(info->block[0])); + radeon_emit(S_00B820_NUM_THREAD_FULL(info->block[1])); + radeon_emit(S_00B824_NUM_THREAD_FULL(info->block[2])); } if (info->indirect) { @@ -823,25 +826,25 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(info->indirect), RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, 1); - radeon_emit(cs, base_va); - radeon_emit(cs, base_va >> 32); + radeon_emit(PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(1); + radeon_emit(base_va); + radeon_emit(base_va >> 32); - radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->indirect_offset); - radeon_emit(cs, dispatch_initiator); + radeon_emit(PKT3(PKT3_DISPATCH_INDIRECT, 1, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(info->indirect_offset); + radeon_emit(dispatch_initiator); } else { - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, info->grid[0]); - radeon_emit(cs, info->grid[1]); - radeon_emit(cs, info->grid[2]); - radeon_emit(cs, dispatch_initiator); + radeon_emit(PKT3(PKT3_DISPATCH_DIRECT, 3, render_cond_bit) | PKT3_SHADER_TYPE_S(1)); + radeon_emit(info->grid[0]); + radeon_emit(info->grid[1]); + radeon_emit(info->grid[2]); + radeon_emit(dispatch_initiator); } if (unlikely(sctx->thread_trace_enabled && sctx->chip_class >= GFX9)) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); } radeon_end(); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute_blit.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute_blit.c index 66b3ed5036..b499b76665 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -59,11 +59,58 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, } } +static bool si_is_buffer_idle(struct si_context *sctx, struct si_resource *buf, + enum radeon_bo_usage usage) +{ + return !si_cs_is_buffer_referenced(sctx, buf->buf, usage) && + 
sctx->ws->buffer_wait(sctx->ws, buf->buf, 0, usage); +} + +static void si_improve_sync_flags(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, unsigned *flags) +{ + if (dst->target != PIPE_BUFFER || (src && src->target != PIPE_BUFFER)) + return; + + if (si_is_buffer_idle(sctx, si_resource(dst), RADEON_USAGE_READWRITE) && + (!src || si_is_buffer_idle(sctx, si_resource(src), RADEON_USAGE_WRITE))) { + /* Idle buffers don't have to sync. */ + *flags &= ~(SI_OP_SYNC_GE_BEFORE | SI_OP_SYNC_PS_BEFORE | SI_OP_SYNC_CS_BEFORE | + SI_OP_SYNC_CPDMA_BEFORE); + return; + } + + const unsigned cs_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_COMPUTE) | + SI_BIND_SHADER_BUFFER(PIPE_SHADER_COMPUTE) | + SI_BIND_IMAGE_BUFFER(PIPE_SHADER_COMPUTE) | + SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_COMPUTE); + + const unsigned ps_mask = SI_BIND_CONSTANT_BUFFER(PIPE_SHADER_FRAGMENT) | + SI_BIND_SHADER_BUFFER(PIPE_SHADER_FRAGMENT) | + SI_BIND_IMAGE_BUFFER(PIPE_SHADER_FRAGMENT) | + SI_BIND_SAMPLER_BUFFER(PIPE_SHADER_FRAGMENT); + + unsigned bind_history = si_resource(dst)->bind_history | + (src ? si_resource(src)->bind_history : 0); + + /* Clear SI_OP_SYNC_CS_BEFORE if the buffer has never been used with a CS. */ + if (*flags & SI_OP_SYNC_CS_BEFORE && !(bind_history & cs_mask)) + *flags &= ~SI_OP_SYNC_CS_BEFORE; + + /* Clear SI_OP_SYNC_PS_BEFORE if the buffer has never been used with a PS. */ + if (*flags & SI_OP_SYNC_PS_BEFORE && !(bind_history & ps_mask)) { + *flags &= ~SI_OP_SYNC_PS_BEFORE; + *flags |= SI_OP_SYNC_GE_BEFORE; + } +} + void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info, void *shader, unsigned flags) { - /* Wait for previous shaders to finish. */ + if (flags & SI_OP_SYNC_GE_BEFORE) + sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH; + if (flags & SI_OP_SYNC_PS_BEFORE) sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; @@ -136,8 +183,9 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_inf } /* Bind buffers and launch compute. */ - sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers, - writeable_bitmask); + si_set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, num_buffers, buffers, + writeable_bitmask, + true /* don't update bind_history to prevent unnecessary syncs later */); si_launch_grid_internal(sctx, info, shader, flags); /* Do cache flushing at the end. */ @@ -315,6 +363,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, if (!size) return; + si_improve_sync_flags(sctx, dst, NULL, &flags); + ASSERTED unsigned clear_alignment = MIN2(clear_value_size, 4); assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */ @@ -404,6 +454,8 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); uint64_t compute_min_size = 8 * 1024; + si_improve_sync_flags(sctx, dst, src, &flags); + /* Only use compute for VRAM copies on dGPUs. 
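To see what si_improve_sync_flags above buys, consider clearing a buffer that no shader has ever touched. A hypothetical caller-side illustration; si_clear_buffer's trailing parameters are not shown in this hunk, so the sync-flag, coherency, and method names in the last three arguments are assumed spellings:

    /* Hypothetical: buf was just created, so it has no bind_history and no
     * pending GPU work. si_is_buffer_idle() reports it idle, and
     * si_improve_sync_flags() strips every SI_OP_SYNC_*_BEFORE bit, so the
     * compute clear is recorded without any partial flush. */
    struct pipe_resource *buf =
       pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, 4096);
    uint32_t zero = 0;
    si_clear_buffer(sctx, buf, /*offset*/ 0, /*size*/ 4096, &zero, 4,
                    SI_OP_SYNC_BEFORE_AFTER, SI_COHERENCY_SHADER,
                    SI_AUTO_SELECT_CLEAR_METHOD);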
*/ if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM && si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size && @@ -422,18 +474,19 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u bool is_dcc_decompress, unsigned flags) { struct pipe_context *ctx = &sctx->b; + struct si_texture *ssrc = (struct si_texture*)src; + struct si_texture *sdst = (struct si_texture*)dst; unsigned width = src_box->width; unsigned height = src_box->height; unsigned depth = src_box->depth; enum pipe_format src_format = util_format_linear(src->format); enum pipe_format dst_format = util_format_linear(dst->format); - bool is_linear = ((struct si_texture*)src)->surface.is_linear || - ((struct si_texture*)dst)->surface.is_linear; + bool is_linear = ssrc->surface.is_linear || sdst->surface.is_linear; assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format)); - if (!vi_dcc_enabled((struct si_texture*)src, src_level) && - !vi_dcc_enabled((struct si_texture*)dst, dst_level) && + if (!vi_dcc_enabled(ssrc, src_level) && + !vi_dcc_enabled(sdst, dst_level) && src_format == dst_format && util_format_is_float(src_format) && !util_format_is_compressed(src_format)) { @@ -477,8 +530,12 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u /* src and dst have the same number of samples. */ si_make_CB_shader_coherent(sctx, src->nr_samples, true, - /* Only src can have DCC.*/ - ((struct si_texture *)src)->surface.u.gfx9.color.dcc.pipe_aligned); + ssrc->surface.u.gfx9.color.dcc.pipe_aligned); + if (sctx->chip_class >= GFX10) { + /* GFX10+ uses DCC stores so si_make_CB_shader_coherent is required for dst too */ + si_make_CB_shader_coherent(sctx, dst->nr_samples, true, + sdst->surface.u.gfx9.color.dcc.pipe_aligned); + } struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE]; struct pipe_image_view saved_image[2] = {0}; @@ -511,7 +568,7 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u if (is_dcc_decompress) image[1].access |= SI_IMAGE_ACCESS_DCC_OFF; else if (sctx->chip_class >= GFX10) - image[1].access |= SI_IMAGE_ACCESS_DCC_WRITE; + image[1].access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE; ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, image); @@ -529,7 +586,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u * the DCC block size or a multiple thereof. The shader uses a barrier * between loads and stores to safely overwrite each DCC block of pixels. */ - struct si_texture *tex = (struct si_texture*)src; unsigned dim[3] = {src_box->width, src_box->height, src_box->depth}; assert(src == dst); @@ -538,9 +594,9 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u if (!sctx->cs_dcc_decompress) sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx); - info.block[0] = tex->surface.u.gfx9.color.dcc_block_width; - info.block[1] = tex->surface.u.gfx9.color.dcc_block_height; - info.block[2] = tex->surface.u.gfx9.color.dcc_block_depth; + info.block[0] = ssrc->surface.u.gfx9.color.dcc_block_width; + info.block[1] = ssrc->surface.u.gfx9.color.dcc_block_height; + info.block[2] = ssrc->surface.u.gfx9.color.dcc_block_depth; /* Make sure the block size is at least the same as wave size. 
*/ while (info.block[0] * info.block[1] * info.block[2] < @@ -613,16 +669,12 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) sctx->cs_user_data[2] = (tex->surface.u.gfx9.color.display_dcc_pitch_max + 1) | (tex->surface.u.gfx9.color.display_dcc_height << 16); - /* There is only 1 shader variant because ac_surface only supports displayable DCC - * with one swizzle mode and 32bpp. - */ + /* We have only 1 variant per bpp for now, so expect 32 bpp. */ assert(tex->surface.bpe == 4); - assert(sctx->chip_class != GFX9 || tex->surface.u.gfx9.swizzle_mode == 25); /* 64KB_S_X */ - assert(sctx->chip_class != GFX10 || tex->surface.u.gfx9.swizzle_mode == 27); /* 64KB_R_X */ - assert(sctx->chip_class != GFX10_3 || tex->surface.u.gfx9.swizzle_mode == 27); /* 64KB_R_X */ - if (!sctx->cs_dcc_retile) - sctx->cs_dcc_retile = si_create_dcc_retile_cs(sctx, &tex->surface); + void **shader = &sctx->cs_dcc_retile[tex->surface.u.gfx9.swizzle_mode]; + if (!*shader) + *shader = si_create_dcc_retile_cs(sctx, &tex->surface); /* Dispatch compute. */ unsigned width = DIV_ROUND_UP(tex->buffer.b.b.width0, tex->surface.u.gfx9.color.dcc_block_width); @@ -638,7 +690,7 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) info.grid[1] = DIV_ROUND_UP(height, info.block[1]); info.grid[2] = 1; - si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_dcc_retile, SI_OP_SYNC_BEFORE, + si_launch_grid_internal_ssbos(sctx, &info, *shader, SI_OP_SYNC_BEFORE, SI_COHERENCY_CB_META, 1, &sb, 0x1); /* Don't flush caches. L2 will be flushed by the kernel fence. */ @@ -706,7 +758,7 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex return; si_make_CB_shader_coherent(sctx, tex->nr_samples, true, - true /* DCC is not possible with image stores */); + ((struct si_texture*)tex)->surface.u.gfx9.color.dcc.pipe_aligned); /* Save states. 
*/ struct pipe_image_view saved_image = {0}; @@ -777,6 +829,7 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac bool render_condition_enabled) { struct si_context *sctx = (struct si_context *)ctx; + struct si_texture *tex = (struct si_texture*)dstsurf->texture; unsigned num_layers = dstsurf->u.tex.last_layer - dstsurf->u.tex.first_layer + 1; unsigned data[4 + sizeof(color->ui)] = {dstx, dsty, dstsurf->u.tex.first_layer, 0}; @@ -798,7 +851,7 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac } si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true, - true /* DCC is not possible with image stores */); + tex->surface.u.gfx9.color.dcc.pipe_aligned); struct pipe_constant_buffer saved_cb = {}; si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); @@ -814,7 +867,7 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac struct pipe_image_view image = {0}; image.resource = dstsurf->texture; - image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE; + image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_ALLOW_DCC_STORE; image.format = util_format_linear(dstsurf->format); image.u.tex.level = dstsurf->u.tex.level; image.u.tex.first_layer = 0; /* 3D images ignore first_layer (BASE_ARRAY) */ diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_dma.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_dma.c index b7aece5646..f37f74a32b 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -100,22 +100,22 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui radeon_begin(cs); if (sctx->chip_class >= GFX7) { - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(header); + radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(src_va >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit(dst_va >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(command); } else { header |= S_411_SRC_ADDR_HI(src_va >> 32); - radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); - radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, header); /* SRC_ADDR_HI [15:0] + flags. */ - radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_CP_DMA, 4, 0)); + radeon_emit(src_va); /* SRC_ADDR_LO [31:0] */ + radeon_emit(header); /* SRC_ADDR_HI [15:0] + flags. */ + radeon_emit(dst_va); /* DST_ADDR_LO [31:0] */ + radeon_emit((dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ + radeon_emit(command); } /* CP DMA is executed in ME, but index buffers are read by PFP. @@ -124,8 +124,8 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui * should precede it. 
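Most of the churn in this file is mechanical: radeon_begin() now latches the target command buffer, so radeon_emit(), radeon_emit_array() and the radeon_set_*_reg() helpers drop their cs argument. A plausible shape for that macro family, offered as a sketch only; the real definitions live in si_build_pm4.h, outside this patch, and the local names (__cs, __cs_num) are assumptions:

    #define radeon_begin(cs) \
       struct radeon_cmdbuf *__cs = (cs); \
       unsigned __cs_num = __cs->current.cdw

    #define radeon_emit(value) ((void)(__cs->current.buf[__cs_num++] = (value)))

    #define radeon_end() do { \
       __cs->current.cdw = __cs_num; \
       assert(__cs->current.cdw <= __cs->current.max_dw); \
    } while (0)

Binding the buffer once per scope also explains the radeon_begin_again(cs) call seen earlier in si_compute.c: after a helper closes the local emission scope, it has to be reopened on the same cs.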
*/ if (sctx->has_graphics && flags & CP_DMA_PFP_SYNC_ME) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); + radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(0); } radeon_end(); } @@ -196,6 +196,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, assert(size && size % 4 == 0); + if (user_flags & SI_OP_SYNC_GE_BEFORE) + sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; + if (user_flags & SI_OP_SYNC_CS_BEFORE) sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; @@ -230,10 +233,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) { + if (coher == SI_COHERENCY_SHADER) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } /** @@ -339,6 +340,9 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, } } + if (user_flags & SI_OP_SYNC_GE_BEFORE) + sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; + if (user_flags & SI_OP_SYNC_CS_BEFORE) sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME; @@ -387,10 +391,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, si_resource(dst)->TC_L2_dirty = true; /* If it's not a prefetch or GDS copy... */ - if (dst && src && (dst != src || dst_offset != src_offset)) { + if (dst && src && (dst != src || dst_offset != src_offset)) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, @@ -423,13 +425,13 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, header); - radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */ - radeon_emit(cs, address >> 32); /* SRC_ADDR_HI [31:0] */ - radeon_emit(cs, address); /* DST_ADDR_LO [31:0] */ - radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, command); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(header); + radeon_emit(address); /* SRC_ADDR_LO [31:0] */ + radeon_emit(address >> 32); /* SRC_ADDR_HI [31:0] */ + radeon_emit(address); /* DST_ADDR_LO [31:0] */ + radeon_emit(address >> 32); /* DST_ADDR_HI [31:0] */ + radeon_emit(command); radeon_end(); } @@ -495,11 +497,11 @@ void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned uint64_t va = buf->gpu_address + offset; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); - radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit_array(cs, (const uint32_t *)data, size / 4); + radeon_emit(PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0)); + radeon_emit(S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine)); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit_array((const uint32_t *)data, size / 4); radeon_end(); } @@ -519,11 +521,11 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned uint64_t src_va = (src ? 
src->gpu_address : 0ull) + src_offset; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); - radeon_emit(cs, src_va); - radeon_emit(cs, src_va >> 32); - radeon_emit(cs, dst_va); - radeon_emit(cs, dst_va >> 32); + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM); + radeon_emit(src_va); + radeon_emit(src_va >> 32); + radeon_emit(dst_va); + radeon_emit(dst_va >> 32); radeon_end(); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index f21f43373d..d9b741b3ec 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -70,12 +70,6 @@ si_create_shadowing_ib_preamble(struct si_context *sctx) { struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); - if (sctx->chip_class == GFX10) { - /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */ - si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); - si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0)); - } - if (sctx->screen->dpbb_allowed) { si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); @@ -148,8 +142,8 @@ static void si_set_context_reg_array(struct radeon_cmdbuf *cs, unsigned reg, uns const uint32_t *values) { radeon_begin(cs); - radeon_set_context_reg_seq(cs, reg, num); - radeon_emit_array(cs, values, num); + radeon_set_context_reg_seq(reg, num); + radeon_emit_array(values, num); radeon_end(); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug.c index bcc8baa933..540206c152 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug.c @@ -344,7 +344,6 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; - unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -390,13 +389,18 @@ static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begi fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end); } +void si_print_current_ib(struct si_context *sctx, FILE *f) +{ + si_parse_current_ib(f, &sctx->gfx_cs, 0, sctx->gfx_cs.prev_dw + sctx->gfx_cs.current.cdw, + NULL, 0, "GFX", sctx->chip_class); +} + static void si_log_chunk_type_cs_print(void *data, FILE *f) { struct si_log_chunk_cs *chunk = data; struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; - int last_compute_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. 
@@ -404,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) */ uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL, PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ); - if (map) { + if (map) last_trace_id = map[0]; - last_compute_trace_id = map[1]; - } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -429,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } - if (chunk->compute_end != chunk->compute_begin) { - assert(ctx->prim_discard_compute_cs.priv); - - if (scs->flushed) { - ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, - chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 1 : 0, - "Compute IB", ctx->chip_class, NULL, NULL); - } else { - si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin, - chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB", - ctx->chip_class); - } - } - if (chunk->dump_bo_list) { fprintf(f, "Flushing. Time: "); util_dump_ns(f, scs->time_flush); @@ -462,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw; - unsigned compute_cur = 0; - if (ctx->prim_discard_compute_cs.priv) - compute_cur = - ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw; - - if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw) + if (!dump_bo_list && gfx_cur == scs->gfx_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -481,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; - chunk->compute_begin = scs->compute_last_dw; - chunk->compute_end = compute_cur; - scs->compute_last_dw = compute_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug_options.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug_options.h index 81880be595..9f8302f7f4 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug_options.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_debug_options.h @@ -10,10 +10,11 @@ OPT_BOOL(vs_fetch_always_opencode, false, OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for triangle strips") OPT_BOOL(no_infinite_interp, false, "Kill PS with infinite interp coeff") OPT_BOOL(clamp_div_by_zero, false, "Clamp div by zero (x / 0 becomes FLT_MAX instead of NaN)") -OPT_BOOL(shader_culling, false, "Cull primitives in shaders when benefical (without tess and GS)") OPT_BOOL(vrs2x2, false, "Enable 2x2 coarse shading for non-GUI elements") OPT_BOOL(enable_sam, false, "Enable Smart Access Memory with Above 4G Decoding for unvalidated platforms.") OPT_BOOL(disable_sam, false, "Disable Smart Access Memory.") OPT_BOOL(fp16, false, "Enable FP16 for mediump.") +OPT_INT(tc_max_cpu_storage_size, 0, "Enable the CPU storage for pipelined buffer uploads in TC.") #undef OPT_BOOL +#undef OPT_INT diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_descriptors.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_descriptors.c index 2391a9355b..89c09db0da 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_descriptors.c @@ -360,7 +360,17 @@ void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture state[6] |= 
S_00A018_META_PIPE_ALIGNED(meta.pipe_aligned) | S_00A018_META_DATA_ADDRESS_LO(meta_va >> 8) | - S_00A018_WRITE_COMPRESS_ENABLE((access & SI_IMAGE_ACCESS_DCC_WRITE) != 0); + /* DCC image stores require the following settings: + * - INDEPENDENT_64B_BLOCKS = 0 + * - INDEPENDENT_128B_BLOCKS = 1 + * - MAX_COMPRESSED_BLOCK_SIZE = 128B + * - MAX_UNCOMPRESSED_BLOCK_SIZE = 256B (always used) + * + * The same limitations apply to SDMA compressed stores because + * SDMA uses the same DCC codec. + */ + S_00A018_WRITE_COMPRESS_ENABLE(ac_surface_supports_dcc_image_stores(sscreen->info.chip_class, &tex->surface) && + (access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE)); } state[7] = meta_va >> 16; @@ -505,7 +515,7 @@ static void si_reset_sampler_view_slot(struct si_samplers *samplers, unsigned sl static void si_set_sampler_views(struct si_context *sctx, unsigned shader, unsigned start_slot, unsigned count, unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views, + bool take_ownership, struct pipe_sampler_view **views, bool disallow_early_out) { struct si_samplers *samplers = &sctx->samplers[shader]; @@ -520,8 +530,13 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader, /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */ uint32_t *restrict desc = descs->list + desc_slot * 16; - if (samplers->views[slot] == &sview->base && !disallow_early_out) + if (samplers->views[slot] == &sview->base && !disallow_early_out) { + if (take_ownership) { + struct pipe_sampler_view *view = views[i]; + pipe_sampler_view_reference(&view, NULL); + } continue; + } if (sview) { struct si_texture *tex = (struct si_texture *)sview->base.texture; @@ -529,7 +544,7 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader, si_set_sampler_view_desc(sctx, sview, samplers->sampler_states[slot], desc); if (tex->buffer.b.b.target == PIPE_BUFFER) { - tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW; + tex->buffer.bind_history |= SI_BIND_SAMPLER_BUFFER(shader); samplers->needs_depth_decompress_mask &= ~(1u << slot); samplers->needs_color_decompress_mask &= ~(1u << slot); } else { @@ -549,7 +564,12 @@ static void si_set_sampler_views(struct si_context *sctx, unsigned shader, sctx->need_check_render_feedback = true; } - pipe_sampler_view_reference(&samplers->views[slot], &sview->base); + if (take_ownership) { + pipe_sampler_view_reference(&samplers->views[slot], NULL); + samplers->views[slot] = &sview->base; + } else { + pipe_sampler_view_reference(&samplers->views[slot], &sview->base); + } samplers->enabled_mask |= 1u << slot; /* Since this can flush, it must be done after enabled_mask is @@ -598,7 +618,7 @@ static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsi static void si_pipe_set_sampler_views(struct pipe_context *ctx, enum pipe_shader_type shader, unsigned start, unsigned count, unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) + bool take_ownership, struct pipe_sampler_view **views) { struct si_context *sctx = (struct si_context *)ctx; @@ -606,7 +626,7 @@ static void si_pipe_set_sampler_views(struct pipe_context *ctx, enum pipe_shader return; si_set_sampler_views(sctx, shader, start, count, unbind_num_trailing_slots, - views, false); + take_ownership, views, false); si_update_shader_needs_decompress_mask(sctx, shader); } @@ -728,12 +748,15 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i bool uses_dcc = vi_dcc_enabled(tex, level); unsigned access = view->access; + if (uses_dcc && 
screen->always_allow_dcc_stores) + access |= SI_IMAGE_ACCESS_ALLOW_DCC_STORE; + assert(!tex->is_depth); assert(fmask_desc || tex->surface.fmask_offset == 0); if (uses_dcc && !skip_decompress && !(access & SI_IMAGE_ACCESS_DCC_OFF) && - ((!(access & SI_IMAGE_ACCESS_DCC_WRITE) && (access & PIPE_IMAGE_ACCESS_WRITE)) || + ((!(access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE) && (access & PIPE_IMAGE_ACCESS_WRITE)) || !vi_dcc_formats_compatible(screen, res->b.b.format, view->format))) { /* If DCC can't be disabled, at least decompress it. * The decompression is relatively cheap if the surface @@ -769,7 +792,7 @@ static void si_set_shader_image_desc(struct si_context *ctx, const struct pipe_i view->u.tex.first_layer, view->u.tex.last_layer, width, height, depth, desc, fmask_desc); si_set_mutable_tex_desc_fields(screen, tex, &tex->surface.u.legacy.level[level], level, level, util_format_get_blockwidth(view->format), - false, view->access, desc); + false, access, desc); } } @@ -796,7 +819,7 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne if (res->b.b.target == PIPE_BUFFER) { images->needs_color_decompress_mask &= ~(1 << slot); images->display_dcc_store_mask &= ~(1u << slot); - res->bind_history |= PIPE_BIND_SHADER_IMAGE; + res->bind_history |= SI_BIND_IMAGE_BUFFER(shader); } else { struct si_texture *tex = (struct si_texture *)res; unsigned level = view->u.tex.level; @@ -807,10 +830,15 @@ static void si_set_shader_image(struct si_context *ctx, unsigned shader, unsigne images->needs_color_decompress_mask &= ~(1 << slot); } - if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) + if (tex->surface.display_dcc_offset && view->access & PIPE_IMAGE_ACCESS_WRITE) { images->display_dcc_store_mask |= 1u << slot; - else + + /* Set displayable_dcc_dirty for non-compute stages conservatively (before draw calls). 
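Tying this hunk back to the blit changes earlier in the patch: SI_IMAGE_ACCESS_DCC_WRITE becomes the opt-in SI_IMAGE_ACCESS_ALLOW_DCC_STORE, and the descriptor only enables compressed writes when the surface itself qualifies. A condensed restatement of the gating, using only names that appear in these hunks (not the verbatim code):

    /* A writable image view gets WRITE_COMPRESS_ENABLE only if both hold:
     *  1. someone opted in: si_compute_copy_image(),
     *     si_compute_clear_render_target(), or screen->always_allow_dcc_stores;
     *  2. the surface layout supports DCC image stores at all. */
    bool dcc_store_ok =
       ac_surface_supports_dcc_image_stores(sscreen->info.chip_class, &tex->surface) &&
       (access & SI_IMAGE_ACCESS_ALLOW_DCC_STORE);
    state[6] |= S_00A018_WRITE_COMPRESS_ENABLE(dcc_store_ok);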
*/ + if (shader != PIPE_SHADER_COMPUTE) + tex->displayable_dcc_dirty = true; + } else { images->display_dcc_store_mask &= ~(1u << slot); + } if (vi_dcc_enabled(tex, level) && p_atomic_read(&tex->framebuffers_bound)) ctx->need_check_render_feedback = true; @@ -1194,6 +1222,38 @@ static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_res sctx->descriptors_dirty |= 1u << descriptors_idx; } +void si_get_inline_uniform_state(union si_shader_key *key, enum pipe_shader_type shader, + bool *inline_uniforms, uint32_t **inlined_values) +{ + if (shader == PIPE_SHADER_FRAGMENT) { + *inline_uniforms = key->ps.opt.inline_uniforms; + *inlined_values = key->ps.opt.inlined_uniform_values; + } else { + *inline_uniforms = key->ge.opt.inline_uniforms; + *inlined_values = key->ge.opt.inlined_uniform_values; + } +} + +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader) +{ + if (shader == PIPE_SHADER_COMPUTE) + return; + + bool inline_uniforms; + uint32_t *inlined_values; + si_get_inline_uniform_state(&sctx->shaders[shader].key, shader, &inline_uniforms, &inlined_values); + + if (inline_uniforms) { + if (shader == PIPE_SHADER_FRAGMENT) + sctx->shaders[shader].key.ps.opt.inline_uniforms = false; + else + sctx->shaders[shader].key.ge.opt.inline_uniforms = false; + + memset(inlined_values, 0, MAX_INLINABLE_UNIFORMS * 4); + sctx->do_update_shaders = true; + } +} + static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shader_type shader, uint slot, bool take_ownership, const struct pipe_constant_buffer *input) @@ -1210,13 +1270,11 @@ static void si_pipe_set_constant_buffer(struct pipe_context *ctx, enum pipe_shad assert(!"constant buffer 0 must have a 32-bit VM address, use const_uploader"); return; } - si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER; + si_resource(input->buffer)->bind_history |= SI_BIND_CONSTANT_BUFFER(shader); } - if (slot == 0) { - /* Invalidate current inlinable uniforms. */ - sctx->inlinable_uniforms_valid_mask &= ~(1 << shader); - } + if (slot == 0) + si_invalidate_inlinable_uniforms(sctx, shader); } slot = si_get_constbuf_slot(slot); @@ -1231,10 +1289,21 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, { struct si_context *sctx = (struct si_context *)ctx; - if (!(sctx->inlinable_uniforms_valid_mask & BITFIELD_BIT(shader))) { + if (shader == PIPE_SHADER_COMPUTE) + return; + + bool inline_uniforms; + uint32_t *inlined_values; + si_get_inline_uniform_state(&sctx->shaders[shader].key, shader, &inline_uniforms, &inlined_values); + + if (!inline_uniforms) { /* It's the first time we set the constants. Always update shaders. */ - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); - sctx->inlinable_uniforms_valid_mask |= BITFIELD_BIT(shader); + if (shader == PIPE_SHADER_FRAGMENT) + sctx->shaders[shader].key.ps.opt.inline_uniforms = true; + else + sctx->shaders[shader].key.ge.opt.inline_uniforms = true; + + memcpy(inlined_values, values, num_values * 4); sctx->do_update_shaders = true; return; } @@ -1242,8 +1311,8 @@ static void si_set_inlinable_constants(struct pipe_context *ctx, /* We have already set inlinable constants for this shader. Update the shader only if * the constants are being changed so as not to update shaders needlessly. 
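This memcmp() early-out (the code follows just below) is what makes the hook cheap to call repeatedly. A hypothetical caller sequence through the pipe interface; the hook is installed as sctx->b.set_inlinable_constants further down, and the values are example data only:

    uint32_t values[2] = {16, 0x3f800000}; /* example data only */
    /* First call: flips key.*.opt.inline_uniforms on and schedules a recompile. */
    ctx->set_inlinable_constants(ctx, PIPE_SHADER_FRAGMENT, 2, values);
    /* Same data again: memcmp() matches, do_update_shaders stays untouched. */
    ctx->set_inlinable_constants(ctx, PIPE_SHADER_FRAGMENT, 2, values);
    /* Changed data: copied into the key, the shader variant is rebuilt. */
    values[0] = 32;
    ctx->set_inlinable_constants(ctx, PIPE_SHADER_FRAGMENT, 2, values);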
*/ - if (memcmp(sctx->inlinable_uniforms[shader], values, num_values * 4)) { - memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4); + if (memcmp(inlined_values, values, num_values * 4)) { + memcpy(inlined_values, values, num_values * 4); sctx->do_update_shaders = true; } } @@ -1300,10 +1369,10 @@ static void si_set_shader_buffer(struct si_context *sctx, struct si_buffer_resou sbuffer->buffer_offset + sbuffer->buffer_size); } -static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader, - unsigned start_slot, unsigned count, - const struct pipe_shader_buffer *sbuffers, - unsigned writable_bitmask) +void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_shader_buffer *sbuffers, + unsigned writable_bitmask, bool internal_blit) { struct si_context *sctx = (struct si_context *)ctx; struct si_buffer_resources *buffers = &sctx->const_and_shader_buffers[shader]; @@ -1321,14 +1390,25 @@ static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_typ const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; unsigned slot = si_get_shaderbuf_slot(start_slot + i); - if (sbuffer && sbuffer->buffer) - si_resource(sbuffer->buffer)->bind_history |= PIPE_BIND_SHADER_BUFFER; + /* Don't track bind history for internal blits, such as clear_buffer and copy_buffer + * to prevent unnecessary synchronization before compute blits later. + */ + if (!internal_blit && sbuffer && sbuffer->buffer) + si_resource(sbuffer->buffer)->bind_history |= SI_BIND_SHADER_BUFFER(shader); si_set_shader_buffer(sctx, buffers, descriptors_idx, slot, sbuffer, !!(writable_bitmask & (1u << i)), buffers->priority); } } +static void si_pipe_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_shader_buffer *sbuffers, + unsigned writable_bitmask) +{ + si_set_shader_buffers(ctx, shader, start_slot, count, sbuffers, writable_bitmask, false); +} + void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, uint count, struct pipe_shader_buffer *sbuf) { @@ -1561,7 +1641,7 @@ static bool si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) { struct si_resource *buffer = si_resource(buf); - unsigned i, shader; + unsigned i; unsigned num_elems = sctx->num_vertex_elements; /* We changed the buffer, now we need to bind it where the old one @@ -1573,7 +1653,7 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) /* Vertex buffers. */ if (!buffer) { sctx->vertex_buffers_dirty = num_elems > 0; - } else if (buffer->bind_history & PIPE_BIND_VERTEX_BUFFER) { + } else if (buffer->bind_history & SI_BIND_VERTEX_BUFFER) { for (i = 0; i < num_elems; i++) { int vb = sctx->vertex_elements->vertex_buffer_index[i]; @@ -1590,7 +1670,7 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) } /* Streamout buffers. 
(other internal buffers can't be invalidated) */ - if (!buffer || buffer->bind_history & PIPE_BIND_STREAM_OUTPUT) { + if (!buffer || buffer->bind_history & SI_BIND_STREAMOUT_BUFFER) { for (i = SI_VS_STREAMOUT_BUF0; i <= SI_VS_STREAMOUT_BUF3; i++) { struct si_buffer_resources *buffers = &sctx->internal_bindings; struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_INTERNAL]; @@ -1614,16 +1694,21 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) } /* Constant and shader buffers. */ - if (!buffer || buffer->bind_history & PIPE_BIND_CONSTANT_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) + if (!buffer || buffer->bind_history & SI_BIND_CONSTANT_BUFFER_ALL) { + unsigned mask = buffer ? (buffer->bind_history & SI_BIND_CONSTANT_BUFFER_ALL) >> + SI_BIND_CONSTANT_BUFFER_SHIFT : BITFIELD_MASK(SI_NUM_SHADERS); + u_foreach_bit(shader, mask) { si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], si_const_and_shader_buffer_descriptors_idx(shader), u_bit_consecutive64(SI_NUM_SHADER_BUFFERS, SI_NUM_CONST_BUFFERS), buf, sctx->const_and_shader_buffers[shader].priority_constbuf); + } } - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) { - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + if (!buffer || buffer->bind_history & SI_BIND_SHADER_BUFFER_ALL) { + unsigned mask = buffer ? (buffer->bind_history & SI_BIND_SHADER_BUFFER_ALL) >> + SI_BIND_SHADER_BUFFER_SHIFT : BITFIELD_MASK(SI_NUM_SHADERS); + u_foreach_bit(shader, mask) { if (si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader], si_const_and_shader_buffer_descriptors_idx(shader), u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS), buf, @@ -1634,9 +1719,11 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) } } - if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) { + if (!buffer || buffer->bind_history & SI_BIND_SAMPLER_BUFFER_ALL) { + unsigned mask = buffer ? (buffer->bind_history & SI_BIND_SAMPLER_BUFFER_ALL) >> + SI_BIND_SAMPLER_BUFFER_SHIFT : BITFIELD_MASK(SI_NUM_SHADERS); /* Texture buffers - update bindings. */ - for (shader = 0; shader < SI_NUM_SHADERS; shader++) { + u_foreach_bit(shader, mask) { struct si_samplers *samplers = &sctx->samplers[shader]; struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader); unsigned mask = samplers->enabled_mask; @@ -1660,8 +1747,10 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf) } /* Shader images */ - if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_IMAGE) { - for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { + if (!buffer || buffer->bind_history & SI_BIND_IMAGE_BUFFER_ALL) { + unsigned mask = buffer ? 
(buffer->bind_history & SI_BIND_IMAGE_BUFFER_ALL) >>
+ SI_BIND_IMAGE_BUFFER_SHIFT : BITFIELD_MASK(SI_NUM_SHADERS);
+ u_foreach_bit(shader, mask) {
 struct si_images *images = &sctx->images[shader];
 struct si_descriptors *descs = si_sampler_and_image_descriptors(sctx, shader);
 unsigned mask = images->enabled_mask;
@@ -1891,7 +1980,7 @@ void si_update_all_texture_descriptors(struct si_context *sctx)
 if (!view || !view->texture || view->texture->target == PIPE_BUFFER)
 continue;
- si_set_sampler_views(sctx, shader, i, 1, 0, &samplers->views[i], true);
+ si_set_sampler_views(sctx, shader, i, 1, 0, false, &samplers->views[i], true);
 }
 si_update_shader_needs_decompress_mask(sctx, shader);
@@ -1909,11 +1998,13 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad
 u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS, SI_NUM_SHADER_DESCS);
 if (shader == PIPE_SHADER_VERTEX) {
+ unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
+
 sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
 sctx->num_vertex_elements >
- sctx->screen->num_vbos_in_user_sgprs;
+ num_vbos_in_user_sgprs;
 sctx->vertex_buffer_user_sgprs_dirty =
- sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+ sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
 }
 si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
@@ -1921,12 +2012,14 @@ static void si_mark_shader_pointers_dirty(struct si_context *sctx, unsigned shad
 void si_shader_pointers_mark_dirty(struct si_context *sctx)
 {
+ unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sctx->screen);
+
 sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
 sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL &&
 sctx->num_vertex_elements >
- sctx->screen->num_vbos_in_user_sgprs;
+ num_vbos_in_user_sgprs;
 sctx->vertex_buffer_user_sgprs_dirty =
- sctx->num_vertex_elements > 0 && sctx->screen->num_vbos_in_user_sgprs;
+ sctx->num_vertex_elements > 0 && num_vbos_in_user_sgprs;
 si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
 sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
@@ -1975,6 +2068,36 @@ void si_shader_change_notify(struct si_context *sctx)
 sctx->shader.gs.cso ? GS_ON : GS_OFF, sctx->ngg ? NGG_ON : NGG_OFF,
 PIPE_SHADER_TESS_EVAL));
+
+ /* Update as_* flags in shader keys. Ignore disabled shader stages.
+ * as_ls = VS before TCS
+ * as_es = VS before GS or TES before GS
+ * as_ngg = NGG enabled for the last geometry stage.
+ * If GS sets as_ngg, the previous stage must set as_ngg too. 
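Spelled out as a truth table, derived directly from the branches just below ("ngg" meaning sctx->ngg):

    enabled stages    vs.as_ls  vs.as_es  tes.as_es  as_ngg = ngg set on
    VS+TCS+TES+GS     1         0         1          TES, GS
    VS+TCS+TES        1         0         0          TES
    VS+GS             0         1         -          VS, GS
    VS only           0         0         -          VS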
+ */ + if (sctx->shader.tes.cso) { + sctx->shader.vs.key.ge.as_ls = 1; + sctx->shader.vs.key.ge.as_es = 0; + sctx->shader.vs.key.ge.as_ngg = 0; + + if (sctx->shader.gs.cso) { + sctx->shader.tes.key.ge.as_es = 1; + sctx->shader.tes.key.ge.as_ngg = sctx->ngg; + sctx->shader.gs.key.ge.as_ngg = sctx->ngg; + } else { + sctx->shader.tes.key.ge.as_es = 0; + sctx->shader.tes.key.ge.as_ngg = sctx->ngg; + } + } else if (sctx->shader.gs.cso) { + sctx->shader.vs.key.ge.as_ls = 0; + sctx->shader.vs.key.ge.as_es = 1; + sctx->shader.vs.key.ge.as_ngg = sctx->ngg; + sctx->shader.gs.key.ge.as_ngg = sctx->ngg; + } else { + sctx->shader.vs.key.ge.as_ls = 0; + sctx->shader.vs.key.ge.as_es = 0; + sctx->shader.vs.key.ge.as_ngg = sctx->ngg; + } } #define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \ @@ -1989,9 +2112,9 @@ void si_shader_change_notify(struct si_context *sctx) struct si_descriptors *descs = &sctx->descriptors[start]; \ unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \ \ - radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, count); \ + radeon_set_sh_reg_seq(sh_offset, count); \ for (int i = 0; i < count; i++) \ - radeon_emit_32bit_pointer(sctx->screen, cs, descs[i].gpu_address); \ + radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \ } \ } \ } while (0) @@ -2082,12 +2205,12 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) { struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + shader->cs_shaderbufs_sgpr_index * 4, num_shaderbufs * 4); for (unsigned i = 0; i < num_shaderbufs; i++) - radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4); + radeon_emit_array(&desc->list[si_get_shaderbuf_slot(i) * 4], 4); sctx->compute_shaderbuf_sgprs_dirty = false; } @@ -2097,7 +2220,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) if (num_images && sctx->compute_image_sgprs_dirty) { struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE); - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + + radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + shader->cs_images_sgpr_index * 4, shader->cs_images_num_sgprs); @@ -2111,7 +2234,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) num_sgprs = 4; } - radeon_emit_array(cs, &desc->list[desc_offset], num_sgprs); + radeon_emit_array(&desc->list[desc_offset], num_sgprs); } sctx->compute_image_sgprs_dirty = false; @@ -2587,7 +2710,7 @@ void si_init_all_descriptors(struct si_context *sctx) sctx->b.set_shader_images = si_set_shader_images; sctx->b.set_constant_buffer = si_pipe_set_constant_buffer; sctx->b.set_inlinable_constants = si_set_inlinable_constants; - sctx->b.set_shader_buffers = si_set_shader_buffers; + sctx->b.set_shader_buffers = si_pipe_set_shader_buffers; sctx->b.set_sampler_views = si_pipe_set_sampler_views; sctx->b.create_texture_handle = si_create_texture_handle; sctx->b.delete_texture_handle = si_delete_texture_handle; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_fence.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_fence.c index 7b82aa3abd..efc4fede68 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_fence.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_fence.c @@ -73,7 +73,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne 
EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) | event_flags; unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); - bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs; + bool compute_ib = !ctx->has_graphics; radeon_begin(cs); @@ -92,24 +92,24 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne ctx->eop_bug_scratch_tmz : ctx->eop_bug_scratch; assert(16 * ctx->screen->info.max_render_backends <= scratch->b.b.width0); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, scratch->gpu_address); - radeon_emit(cs, scratch->gpu_address >> 32); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(scratch->gpu_address); + radeon_emit(scratch->gpu_address >> 32); radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, scratch, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); } - radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0)); - radeon_emit(cs, op); - radeon_emit(cs, sel); - radeon_emit(cs, va); /* address lo */ - radeon_emit(cs, va >> 32); /* address hi */ - radeon_emit(cs, new_fence); /* immediate data lo */ - radeon_emit(cs, 0); /* immediate data hi */ + radeon_emit(PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0)); + radeon_emit(op); + radeon_emit(sel); + radeon_emit(va); /* address lo */ + radeon_emit(va >> 32); /* address hi */ + radeon_emit(new_fence); /* immediate data lo */ + radeon_emit(0); /* immediate data hi */ if (ctx->chip_class >= GFX9) - radeon_emit(cs, 0); /* unused */ + radeon_emit(0); /* unused */ } else { if (ctx->chip_class == GFX7 || ctx->chip_class == GFX8) { struct si_resource *scratch = ctx->eop_bug_scratch; @@ -119,23 +119,23 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne * (and optional cache flushes executed) before the timestamp * is written. 
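For orientation, si_cp_release_mem above and si_cp_wait_mem below are two halves of one idiom: RELEASE_MEM writes a fence value to memory once prior work drains, and WAIT_REG_MEM stalls the CP until that value appears. A hedged sketch of the pairing; si_cp_release_mem takes more parameters than this hunk displays, so the argument list and selector names here are indicative only:

    /* Indicative only: write fence = 1 at bottom-of-pipe, then poll for it. */
    uint64_t va = fence_buf->gpu_address;
    si_cp_release_mem(ctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0,
                      EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                      EOP_DATA_SEL_VALUE_32BIT, fence_buf, va,
                      /*new_fence*/ 1, PIPE_QUERY_GPU_FINISHED);
    si_cp_wait_mem(ctx, cs, va, /*ref*/ 1, /*mask*/ 0xffffffff, WAIT_REG_MEM_EQUAL);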
*/ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, ((va >> 32) & 0xffff) | sel); - radeon_emit(cs, 0); /* immediate data */ - radeon_emit(cs, 0); /* unused */ + radeon_emit(PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(op); + radeon_emit(va); + radeon_emit(((va >> 32) & 0xffff) | sel); + radeon_emit(0); /* immediate data */ + radeon_emit(0); /* unused */ radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, scratch, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); } - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, ((va >> 32) & 0xffff) | sel); - radeon_emit(cs, new_fence); /* immediate data */ - radeon_emit(cs, 0); /* unused */ + radeon_emit(PKT3(PKT3_EVENT_WRITE_EOP, 4, 0)); + radeon_emit(op); + radeon_emit(va); + radeon_emit(((va >> 32) & 0xffff) | sel); + radeon_emit(new_fence); /* immediate data */ + radeon_emit(0); /* unused */ } radeon_end(); @@ -159,13 +159,13 @@ void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t v uint32_t mask, unsigned flags) { radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, ref); /* reference value */ - radeon_emit(cs, mask); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_MEM_SPACE(1) | flags); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(ref); /* reference value */ + radeon_emit(mask); /* mask */ + radeon_emit(4); /* poll interval */ radeon_end(); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_get.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_get.c index 7968f7dbaf..75f6c8a178 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_get.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_get.c @@ -164,6 +164,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_ATOMINC_WRAP: return 1; + case PIPE_CAP_DRAW_VERTEX_STATE: + return !(sscreen->debug_flags & DBG(NO_FAST_DISPLAY_LIST)); + case PIPE_CAP_GLSL_ZERO_INIT: return 2; @@ -232,6 +235,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_PREFER_BACK_BUFFER_REUSE: return 0; case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: @@ -943,7 +947,7 @@ static void si_init_renderer_string(struct si_screen *sscreen) if (sscreen->info.marketing_name) { snprintf(first_name, sizeof(first_name), "%s", sscreen->info.marketing_name); - snprintf(second_name, sizeof(second_name), "%s, ", sscreen->info.name); + snprintf(second_name, sizeof(second_name), "%s, ", sscreen->info.lowercase_name); } else { snprintf(first_name, sizeof(first_name), "AMD %s", sscreen->info.name); } @@ -952,9 +956,8 @@ static void si_init_renderer_string(struct si_screen *sscreen) snprintf(kernel_version, sizeof(kernel_version), ", %s", uname_data.release); snprintf(sscreen->renderer_string, sizeof(sscreen->renderer_string), - "%s (%sDRM %i.%i.%i%s, LLVM " MESA_LLVM_VERSION_STRING ")", first_name, second_name, - sscreen->info.drm_major, sscreen->info.drm_minor, sscreen->info.drm_patchlevel, - kernel_version); + "%s (%sLLVM " MESA_LLVM_VERSION_STRING ", DRM %i.%i%s)", first_name, second_name, + 
sscreen->info.drm_major, sscreen->info.drm_minor, kernel_version); } void si_init_screen_get_functions(struct si_screen *sscreen) @@ -1033,12 +1036,22 @@ void si_init_screen_get_functions(struct si_screen *sscreen) .lower_insert_word = true, .lower_rotate = true, .lower_to_scalar = true, + .has_dot_4x8 = sscreen->info.has_accelerated_dot_product, + .has_dot_2x16 = sscreen->info.has_accelerated_dot_product, .optimize_sample_mask_in = true, .max_unroll_iterations = 32, + .max_unroll_iterations_aggressive = 128, .use_interpolated_input_intrinsics = true, .lower_uniforms_to_ubo = true, .support_16bit_alu = sscreen->options.fp16, .vectorize_vec2_16bit = sscreen->options.fp16, + .pack_varying_options = + nir_pack_varying_interp_mode_none | + nir_pack_varying_interp_mode_smooth | + nir_pack_varying_interp_mode_noperspective | + nir_pack_varying_interp_loc_center | + nir_pack_varying_interp_loc_sample | + nir_pack_varying_interp_loc_centroid, }; sscreen->nir_options = nir_options; } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_gfx_cs.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_gfx_cs.c index 1e001989c0..e38b18f71f 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -31,29 +31,6 @@ #include "util/u_upload_mgr.h" #include "ac_debug.h" -/* initialize */ -void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws) -{ - struct radeon_cmdbuf *cs = &ctx->gfx_cs; - - /* There are two memory usage counters in the winsys for all buffers - * that have been added (cs_add_buffer) and two counters in the pipe - * driver for those that haven't been added yet. - */ - if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, &ctx->gfx_cs, ctx->vram_kb, ctx->gtt_kb))) { - ctx->gtt_kb = 0; - ctx->vram_kb = 0; - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - return; - } - ctx->gtt_kb = 0; - ctx->vram_kb = 0; - - unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx, num_draws); - if (!ctx->ws->cs_check_space(cs, need_dwords, false)) - si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); -} - void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { struct radeon_cmdbuf *cs = &ctx->gfx_cs; @@ -115,9 +92,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h ctx->gfx_flush_in_progress = true; - if (radeon_emitted(&ctx->prim_discard_compute_cs, 0)) - si_compute_signal_gfx(ctx); - if (ctx->has_graphics) { if (!list_is_empty(&ctx->active_queries)) si_suspend_queries(ctx); @@ -159,28 +133,8 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h si_log_hw_flush(ctx); } - if (si_compute_prim_discard_enabled(ctx)) { - /* The compute IB can start after the previous gfx IB starts. */ - if (radeon_emitted(&ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) { - ctx->ws->cs_add_fence_dependency( - &ctx->gfx_cs, ctx->last_gfx_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE); - } - - /* Remember the last execution barrier. It's in the IB. - * It will signal the start of the next compute IB. 
- */ - if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) { - *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); - ctx->last_pkt3_write_data = NULL; - - si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); - ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; - si_resource_reference(&ctx->barrier_buf, NULL); - - ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); - } - } + if (sscreen->debug_flags & DBG(IB)) + si_print_current_ib(ctx, stderr); if (ctx->is_noop) flags |= RADEON_FLUSH_NOOP; @@ -194,17 +148,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h ctx->num_gfx_cs_flushes++; - if (si_compute_prim_discard_enabled(ctx)) { - /* Remember the last execution barrier, which is the last fence - * in this case. - */ - if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { - ctx->last_pkt3_write_data = NULL; - si_resource_reference(&ctx->last_ib_barrier_buf, NULL); - ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); - } - } - /* Check VM faults if needed. */ if (sscreen->debug_flags & DBG(CHECK_VM)) { /* Use conservative timeout 800ms, after which we won't wait any @@ -239,7 +182,7 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx) pipe_reference_init(&ctx->current_saved_cs->reference, 1); ctx->current_saved_cs->trace_buf = - si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8)); + si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 4)); if (!ctx->current_saved_cs->trace_buf) { free(ctx->current_saved_cs); ctx->current_saved_cs = NULL; @@ -304,8 +247,7 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000; ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0; ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000; - ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000; + ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0x00000000; ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000; ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003; ctx->tracked_regs.reg_value[SI_TRACKED_DB_VRS_OVERRIDE_CNTL] = 0x00000000; @@ -351,24 +293,38 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx) ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */ /* Set all cleared context registers to saved. 
*/ - ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */ + ctx->tracked_regs.reg_saved = BITFIELD64_MASK(SI_TRACKED_GE_PC_ALLOC); ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */ } -void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper) +void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper, + pipe_draw_vertex_state_func vstate_wrapper) { if (wrapper) { if (wrapper != sctx->b.draw_vbo) { - assert (!sctx->real_draw_vbo); + assert(!sctx->real_draw_vbo); + assert(!sctx->real_draw_vertex_state); sctx->real_draw_vbo = sctx->b.draw_vbo; + sctx->real_draw_vertex_state = sctx->b.draw_vertex_state; sctx->b.draw_vbo = wrapper; + sctx->b.draw_vertex_state = vstate_wrapper; } } else if (sctx->real_draw_vbo) { sctx->real_draw_vbo = NULL; + sctx->real_draw_vertex_state = NULL; si_select_draw_vbo(sctx); } } +static void si_tmz_preamble(struct si_context *sctx) +{ + bool secure = si_gfx_resources_check_encrypted(sctx); + if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) { + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW | + RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL); + } +} + static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -377,28 +333,31 @@ static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx, unsigned num_draws) { struct si_context *sctx = (struct si_context *)ctx; - bool secure = si_gfx_resources_check_encrypted(sctx); - if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW | - RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL); - } - + si_tmz_preamble(sctx); sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws); } +static void si_draw_vstate_tmz_preamble(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + si_tmz_preamble(sctx); + sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws); +} + void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) { bool is_secure = false; if (unlikely(radeon_uses_secure_bos(ctx->ws))) { - /* Disable features that don't work with TMZ: - * - primitive discard - */ - ctx->prim_discard_vertex_count_threshold = UINT_MAX; - is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs); - si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble); + si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble, + si_draw_vstate_tmz_preamble); } if (ctx->is_debug) @@ -420,8 +379,10 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS; ctx->pipeline_stats_enabled = -1; - /* We don't know if the last draw call used GS fast launch, so assume it didn't. */ - if (ctx->chip_class == GFX10 && ctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) + /* We don't know if the last draw used NGG because it can be a different process. + * When switching NGG->legacy, we need to flush VGT for certain hw generations. 
+ */ + if (ctx->screen->info.has_vgt_flush_ngg_legacy_bug && !ctx->ngg) ctx->flags |= SI_CONTEXT_VGT_FLUSH; if (ctx->border_color_buffer) { @@ -572,18 +533,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) assert(!ctx->gfx_cs.prev_dw); ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw; - ctx->prim_discard_compute_ib_initialized = false; - - /* Compute-based primitive discard: - * The index ring is divided into 2 halves. Switch between the halves - * in the same fashion as doublebuffering. - */ - if (ctx->index_ring_base) - ctx->index_ring_base = 0; - else - ctx->index_ring_base = ctx->index_ring_size_per_ib; - - ctx->index_ring_offset = 0; /* All buffer references are removed on a flush, so si_check_needs_implicit_sync * cannot determine if si_make_CB_shader_coherent() needs to be called. @@ -601,42 +550,17 @@ void si_trace_emit(struct si_context *sctx) si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id); radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id)); + radeon_emit(PKT3(PKT3_NOP, 0, 0)); + radeon_emit(AC_ENCODE_TRACE_POINT(trace_id)); radeon_end(); if (sctx->log) u_log_flush(sctx->log); } -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) -{ - if (!si_compute_prim_discard_enabled(sctx)) - return; - - if (!sctx->barrier_buf) { - u_suballocator_alloc(&sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset, - (struct pipe_resource **)&sctx->barrier_buf); - } - - /* Emit a placeholder to signal the next compute IB to start. - * See si_compute_prim_discard.c for explanation. - */ - uint32_t signal = 1; - si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME, - &signal); - - sctx->last_pkt3_write_data = &sctx->gfx_cs.current.buf[sctx->gfx_cs.current.cdw - 5]; - - /* Only the last occurrence of WRITE_DATA will be executed. - * The packet will be enabled in si_flush_gfx_cs. - */ - *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); -} - void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl) { - bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs; + bool compute_ib = !sctx->has_graphics; assert(sctx->chip_class <= GFX9); @@ -648,20 +572,20 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns if (sctx->chip_class == GFX9 || compute_ib) { /* Flush caches and wait for the caches to assert idle. */ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 5, 0)); + radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(0); /* CP_COHER_BASE */ + radeon_emit(0); /* CP_COHER_BASE_HI */ + radeon_emit(0x0000000A); /* POLL_INTERVAL */ } else { /* ACQUIRE_MEM is only required on a compute ring. 
*/ - radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + radeon_emit(PKT3(PKT3_SURFACE_SYNC, 3, 0)); + radeon_emit(cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(0); /* CP_COHER_BASE */ + radeon_emit(0x0000000A); /* POLL_INTERVAL */ } radeon_end(); @@ -690,8 +614,8 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) radeon_begin(cs); if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); } if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) @@ -733,13 +657,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); } if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { /* Flush HTILE. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); } /* First flush CB/DB, then L1/L2. */ @@ -758,21 +682,21 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) } else { /* Wait for graphics shaders to go idle if requested. */ if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); /* Only count explicit shader flushes, not implicit ones. */ ctx->num_vs_flushes++; ctx->num_ps_flushes++; } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); ctx->num_vs_flushes++; } } if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); ctx->num_cs_flushes++; ctx->compute_is_busy = false; } @@ -839,27 +763,27 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) * The cache flush is executed in the ME, but the PFP waits * for completion. 
*/ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, dont_sync_pfp); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ + radeon_emit(PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(dont_sync_pfp); /* CP_COHER_CNTL */ + radeon_emit(0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(0); /* CP_COHER_BASE */ + radeon_emit(0); /* CP_COHER_BASE_HI */ + radeon_emit(0x0000000A); /* POLL_INTERVAL */ + radeon_emit(gcr_cntl); /* GCR_CNTL */ } else if (flags & SI_CONTEXT_PFP_SYNC_ME) { /* Synchronize PFP with ME. (this stalls PFP) */ - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); + radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(0); } if (flags & SI_CONTEXT_START_PIPELINE_STATS && ctx->pipeline_stats_enabled != 1) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); ctx->pipeline_stats_enabled = 1; } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && ctx->pipeline_stats_enabled != 0) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); ctx->pipeline_stats_enabled = 0; } radeon_end(); @@ -880,14 +804,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) uint32_t cp_coher_cntl = 0; const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); - const bool is_barrier = - flush_cb_db || - /* INV_ICACHE == beginning of gfx IB. Checking - * INV_ICACHE fixes corruption for DeusExMD with - * compute-based culling, but I don't know why. - */ - flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) || - (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy); assert(sctx->chip_class <= GFX9); @@ -930,13 +846,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); } if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) { /* Flush HTILE. SURFACE_SYNC will wait for idle. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); } /* Wait for shader engines to go idle. 
@@ -945,35 +861,35 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) */ if (!flush_cb_db) { if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); /* Only count explicit shader flushes, not implicit ones * done by SURFACE_SYNC. */ sctx->num_vs_flushes++; sctx->num_ps_flushes++; } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); sctx->num_vs_flushes++; } } if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); sctx->num_cs_flushes++; sctx->compute_is_busy = false; } /* VGT state synchronization. */ if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); } if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); } radeon_end(); @@ -1095,24 +1011,21 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) if (flags & SI_CONTEXT_PFP_SYNC_ME) { radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); + radeon_emit(PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(0); radeon_end(); } - if (is_barrier) - si_prim_discard_signal_next_compute_ib_start(sctx); - if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) { radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); radeon_end(); sctx->pipeline_stats_enabled = 1; } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS && sctx->pipeline_stats_enabled != 0) { radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); radeon_end(); sctx->pipeline_stats_enabled = 0; } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_perfcounter.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_perfcounter.c index b553a36b42..0bee2f7d0f 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -81,7 +81,7 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) } radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, value); radeon_end(); } 
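The change running through all of these hunks is mechanical: every radeon_emit(cs, x) becomes radeon_emit(x), and radeon_set_uconfig_reg and friends drop their cs argument the same way, with each emit sequence already bracketed by radeon_begin(cs)/radeon_end(). Dropping the argument lets the macros capture the command-buffer write pointer and dword index in locals once per sequence, so each emit is a single indexed store instead of a dereference of the radeon_cmdbuf on every packet dword. The macro definitions themselves are not part of these hunks; the sketch below shows one way such begin/emit/end macros can be implemented. It is an illustration under assumptions, not the literal Mesa definitions: the field names current.buf and current.cdw follow usage visible elsewhere in this patch, while max_dw and the reduced struct shape are assumed for the sketch.

/* Sketch only: a plausible implementation of the cs-less emit macros.
 * The struct is reduced to what the sketch needs; max_dw is an assumed
 * field used for the overflow check. */
#include <assert.h>
#include <stdint.h>

struct radeon_cmdbuf {
   struct {
      uint32_t *buf;   /* mapped IB memory */
      unsigned cdw;    /* dwords written so far */
      unsigned max_dw; /* reserved capacity in dwords (assumed) */
   } current;
};

/* Capture the stream once: locals shadow the write pointer and index. */
#define radeon_begin(cs)                  \
   struct radeon_cmdbuf *__cs = (cs);     \
   unsigned __cs_num = __cs->current.cdw; \
   uint32_t *__cs_buf = __cs->current.buf

/* No cs argument: each emit is one indexed store through the cached locals. */
#define radeon_emit(value) ((void)(__cs_buf[__cs_num++] = (value)))

/* Publish the new dword count and check we stayed in the reserved space. */
#define radeon_end()                                     \
   do {                                                  \
      __cs->current.cdw = __cs_num;                      \
      assert(__cs->current.cdw <= __cs->current.max_dw); \
   } while (0)

/* A call site in the shape of the ones converted by this patch: */
static void emit_nop_packet(struct radeon_cmdbuf *cs, uint32_t header)
{
   radeon_begin(cs);
   radeon_emit(header); /* e.g. PKT3(PKT3_NOP, 0, 0) */
   radeon_emit(0);      /* payload dword */
   radeon_end();
}

Usage mirrors the converted call sites: radeon_begin(cs) fixes the target stream for the whole sequence, and radeon_end() publishes the dword count once. This is also why hunks such as the si_pc_emit_* ones above change only the emit lines; the surrounding radeon_begin()/radeon_end() pairs were already in place.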
@@ -90,9 +90,9 @@ static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2, false); - radeon_emit(cs, shaders & 0x7f); - radeon_emit(cs, 0xffffffff); + radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false); + radeon_emit(shaders & 0x7f); + radeon_emit(0xffffffff); radeon_end(); } @@ -112,13 +112,13 @@ static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block radeon_begin(cs); for (idx = 0; idx < count; ++idx) { - radeon_set_uconfig_reg_seq(cs, regs->select0[idx], 1, false); - radeon_emit(cs, selectors[idx] | regs->select_or); + radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false); + radeon_emit(selectors[idx] | regs->select_or); } for (idx = 0; idx < regs->num_spm_counters; idx++) { - radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1, false); - radeon_emit(cs, 0); + radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false); + radeon_emit(0); } radeon_end(); @@ -132,11 +132,11 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer COPY_DATA_IMM, NULL, 1); radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); - radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); + radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING)); radeon_end(); } @@ -152,12 +152,12 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer, si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL); radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); radeon_set_uconfig_reg( - cs, R_036020_CP_PERFMON_CNTL, + R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1)); radeon_end(); } @@ -178,26 +178,26 @@ static void si_pc_emit_read(struct si_context *sctx, struct ac_pc_block *block, if (regs->counters) reg = regs->counters[idx]; - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_COUNT_SEL); /* 64 bits */ - radeon_emit(cs, reg >> 2); - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(reg >> 2); + radeon_emit(0); /* unused */ + radeon_emit(va); + radeon_emit(va >> 32); va += sizeof(uint64_t); reg += reg_delta; } } else { /* Fake counters. 
*/ for (idx = 0; idx < count; ++idx) { - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | - COPY_DATA_COUNT_SEL); - radeon_emit(cs, 0); /* immediate */ - radeon_emit(cs, 0); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | + COPY_DATA_COUNT_SEL); + radeon_emit(0); /* immediate */ + radeon_emit(0); + radeon_emit(va); + radeon_emit(va >> 32); va += sizeof(uint64_t); } } @@ -225,10 +225,10 @@ void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, b radeon_begin(&sctx->gfx_cs); if (sctx->chip_class >= GFX10) { - radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL, + radeon_set_uconfig_reg(R_037390_RLC_PERFMON_CLK_CNTL, S_037390_PERFMON_CLOCK_STATE(inhibit)); } else if (sctx->chip_class >= GFX8) { - radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL, + radeon_set_uconfig_reg(R_0372FC_RLC_PERFMON_CLK_CNTL, S_0372FC_PERFMON_CLOCK_STATE(inhibit)); } radeon_end(); @@ -643,6 +643,7 @@ void si_destroy_perfcounters(struct si_screen *screen) return; ac_destroy_perfcounters(&pc->base); + FREE(pc); screen->perfcounters = NULL; } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.c index 0ca0ce8209..4abbd92881 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.c @@ -81,23 +81,20 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"compute", DBG(COMPUTE), "Print compute info"}, {"vm", DBG(VM), "Print virtual addresses when creating resources"}, {"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."}, + {"ib", DBG(IB), "Print command buffers."}, /* Driver options: */ {"nowc", DBG(NO_WC), "Disable GTT write combining"}, {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."}, {"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."}, {"shadowregs", DBG(SHADOW_REGS), "Enable CP register shadowing."}, + {"nofastdlist", DBG(NO_FAST_DISPLAY_LIST), "Disable fast display lists"}, /* 3D engine options: */ {"nogfx", DBG(NO_GFX), "Disable graphics. 
Only multimedia compute paths can be used."}, {"nongg", DBG(NO_NGG), "Disable NGG and use the legacy pipeline."}, - {"nofastlaunch", DBG(NO_FAST_LAUNCH), "Disable NGG GS fast launch."}, {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."}, - {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."}, {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, - {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."}, - {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."}, - {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."}, {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."}, {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"}, {"nodpbb", DBG(NO_DPBB), "Disable DPBB."}, @@ -109,9 +106,11 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"nodisplaydcc", DBG(NO_DISPLAY_DCC), "Disable display DCC"}, {"nodcc", DBG(NO_DCC), "Disable DCC."}, {"nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear."}, - {"nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer"}, + {"nodccstore", DBG(NO_DCC_STORE), "Disable DCC stores"}, + {"dccstore", DBG(DCC_STORE), "Enable DCC stores"}, {"nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA"}, {"nofmask", DBG(NO_FMASK), "Disable MSAA compression"}, + {"nodma", DBG(NO_DMA), "Disable SDMA-copy for DRI_PRIME"}, {"tmz", DBG(TMZ), "Force allocation of scanout/depth/stencil buffer as encrypted"}, {"sqtt", DBG(SQTT), "Enable SQTT"}, @@ -152,6 +151,20 @@ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compil compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); } +void si_init_aux_async_compute_ctx(struct si_screen *sscreen) +{ + assert(!sscreen->async_compute_context); + sscreen->async_compute_context = si_create_context( + &sscreen->b, + SI_CONTEXT_FLAG_AUX | + (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) | + PIPE_CONTEXT_COMPUTE_ONLY); + + /* Limit the number of waves allocated for this context. 
*/ + if (sscreen->async_compute_context) + ((struct si_context*)sscreen->async_compute_context)->cs_max_waves_per_sh = 2; +} + static void si_destroy_compiler(struct ac_llvm_compiler *compiler) { ac_destroy_llvm_compiler(compiler); @@ -251,8 +264,10 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer); if (sctx->cs_dcc_decompress) sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_decompress); - if (sctx->cs_dcc_retile) - sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile); + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_dcc_retile); i++) { + if (sctx->cs_dcc_retile[i]) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_dcc_retile[i]); + } if (sctx->no_velems_state) sctx->b.delete_vertex_elements_state(&sctx->b, sctx->no_velems_state); @@ -288,6 +303,10 @@ static void si_destroy_context(struct pipe_context *context) sctx->ws->cs_destroy(&sctx->gfx_cs); if (sctx->ctx) sctx->ws->ctx_destroy(sctx->ctx); + if (sctx->sdma_cs) { + sctx->ws->cs_destroy(sctx->sdma_cs); + free(sctx->sdma_cs); + } if (sctx->dirty_implicit_resources) _mesa_hash_table_destroy(sctx->dirty_implicit_resources, @@ -306,12 +325,8 @@ static void si_destroy_context(struct pipe_context *context) u_suballocator_destroy(&sctx->allocator_zeroed_memory); sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL); - si_resource_reference(&sctx->index_ring, NULL); - si_resource_reference(&sctx->barrier_buf, NULL); - si_resource_reference(&sctx->last_ib_barrier_buf, NULL); si_resource_reference(&sctx->shadowed_regs, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL); @@ -537,6 +552,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign } sctx->ngg = sscreen->use_ngg; + si_shader_change_notify(sctx); /* Initialize context functions used by graphics and compute. */ if (sctx->chip_class >= GFX10) @@ -573,6 +589,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_state_functions(sctx); si_init_streamout_functions(sctx); si_init_viewport_functions(sctx); + si_init_spi_map_functions(sctx); sctx->blitter = util_blitter_create(&sctx->b); if (sctx->blitter == NULL) @@ -614,12 +631,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign default: unreachable("unhandled chip class"); } - - si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX, - &sctx->prim_discard_vertex_count_threshold, - &sctx->index_ring_size_per_ib); - } else { - sctx->prim_discard_vertex_count_threshold = UINT_MAX; } sctx->sample_mask = 0xffff; @@ -637,7 +648,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sctx->b.create_video_buffer = vl_video_buffer_create; } - if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) { + if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = si_aligned_buffer_create(screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, @@ -724,6 +735,23 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign si_init_cp_reg_shadowing(sctx); } + /* Set immutable fields of shader keys. 
*/ + if (sctx->chip_class >= GFX9) { + /* The LS output / HS input layout can be communicated + * directly instead of via user SGPRs for merged LS-HS. + * This also enables jumping over the VS prolog for HS-only waves. + * + * When the LS VGPR fix is needed, monolithic shaders can: + * - avoid initializing EXEC in both the LS prolog + * and the LS main part when !vs_needs_prolog + * - remove the fixup for unused input VGPRs + */ + sctx->shader.tcs.key.ge.opt.prefer_mono = 1; + + /* This enables jumping over the VS prolog for GS-only waves. */ + sctx->shader.gs.key.ge.opt.prefer_mono = 1; + } + si_begin_new_gfx_cs(sctx, true); assert(sctx->gfx_cs.current.cdw == sctx->initial_gfx_cs_size); @@ -768,6 +796,13 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sscreen->aux_context->set_log_context(sscreen->aux_context, aux_log); } simple_mtx_unlock(&sscreen->aux_context_lock); + + simple_mtx_lock(&sscreen->async_compute_context_lock); + if (status != PIPE_NO_RESET && sscreen->async_compute_context) { + sscreen->async_compute_context->destroy(sscreen->async_compute_context); + sscreen->async_compute_context = NULL; + } + simple_mtx_unlock(&sscreen->async_compute_context_lock); } sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw; @@ -825,9 +860,12 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v struct pipe_context *tc = threaded_context_create(ctx, &sscreen->pool_transfers, si_replace_buffer_storage, - sscreen->info.is_amdgpu ? si_create_fence : NULL, - si_is_resource_busy, - true, + &(struct threaded_context_options){ + .create_fence = sscreen->info.is_amdgpu ? + si_create_fence : NULL, + .is_resource_busy = si_is_resource_busy, + .driver_calls_flush_notify = true, + }, &((struct si_context *)ctx)->tc); if (tc && tc != ctx) @@ -842,7 +880,7 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v static void si_destroy_screen(struct pipe_screen *pscreen) { struct si_screen *sscreen = (struct si_screen *)pscreen; - struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs, sscreen->gs_prologs, + struct si_shader_part *parts[] = {sscreen->vs_prologs, sscreen->tcs_epilogs, sscreen->ps_prologs, sscreen->ps_epilogs}; unsigned i; @@ -871,6 +909,11 @@ static void si_destroy_screen(struct pipe_screen *pscreen) sscreen->aux_context->destroy(sscreen->aux_context); } + simple_mtx_destroy(&sscreen->async_compute_context_lock); + if (sscreen->async_compute_context) { + sscreen->async_compute_context->destroy(sscreen->async_compute_context); + } + util_queue_destroy(&sscreen->shader_compiler_queue); util_queue_destroy(&sscreen->shader_compiler_queue_low_priority); @@ -906,6 +949,7 @@ static void si_destroy_screen(struct pipe_screen *pscreen) disk_cache_destroy(sscreen->disk_shader_cache); util_live_shader_cache_deinit(&sscreen->live_shader_cache); util_idalloc_mt_fini(&sscreen->buffer_ids); + util_vertex_state_cache_deinit(&sscreen->vertex_state_cache); sscreen->ws->destroy(sscreen->ws); FREE(sscreen); @@ -1029,6 +1073,8 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, { #define OPT_BOOL(name, dflt, description) \ sscreen->options.name = driQueryOptionb(config->options, "radeonsi_" #name); +#define OPT_INT(name, dflt, description) \ + sscreen->options.name = driQueryOptioni(config->options, "radeonsi_" #name); #include "si_debug_options.h" } @@ -1102,6 +1148,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, } 
(void)simple_mtx_init(&sscreen->aux_context_lock, mtx_plain); + (void)simple_mtx_init(&sscreen->async_compute_context_lock, mtx_plain); (void)simple_mtx_init(&sscreen->gpu_load_mutex, mtx_plain); si_init_gs_info(sscreen); @@ -1161,15 +1208,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); - unsigned prim_discard_vertex_count_threshold, tmp; - si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp); - /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ - if (prim_discard_vertex_count_threshold == UINT_MAX) { - /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't - * have to allocate and count references for the upload buffer. - */ - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; - } + sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3; /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && @@ -1255,6 +1294,14 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->allow_dcc_msaa_clear_to_reg_for_bpp[bpp_log2] = true; } + /* DCC stores have 50% performance of uncompressed stores and sometimes + * even less than that. It's risky to enable on dGPUs. + */ + sscreen->always_allow_dcc_stores = !(sscreen->debug_flags & DBG(NO_DCC_STORE)) && + ((sscreen->info.chip_class >= GFX10_3 && + !sscreen->info.has_dedicated_vram) || + sscreen->debug_flags & DBG(DCC_STORE)); + sscreen->dpbb_allowed = !(sscreen->debug_flags & DBG(NO_DPBB)) && (sscreen->info.chip_class >= GFX10 || /* Only enable primitive binning on gfx9 APUs by default. */ diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.h index a8722b20fa..a128983e2c 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pipe.h @@ -31,6 +31,7 @@ #include "util/u_idalloc.h" #include "util/u_suballoc.h" #include "util/u_threaded_context.h" +#include "util/u_vertex_state_cache.h" #include "ac_sqtt.h" #ifdef __cplusplus @@ -44,7 +45,6 @@ extern "C" { #endif #define ATI_VENDOR_ID 0x1002 -#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -61,6 +61,15 @@ extern "C" { /* Alignment for optimal CP DMA performance. */ #define SI_CPDMA_ALIGNMENT 32 +/* We don't want to evict buffers from VRAM by mapping them for CPU access, + * because they might never be moved back again. If a buffer is large enough, + * upload data by copying from a temporary GTT buffer. 8K might not seem much, + * but there can be 100000 buffers. + * + * This tweak improves performance for viewperf creo & snx. 
+ */ +#define SI_MAX_VRAM_MAP_SIZE 8196 + /* Tunables for compute-based clear_buffer and copy_buffer: */ #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 #define SI_COMPUTE_COPY_DW_PER_THREAD 4 @@ -155,11 +164,6 @@ enum si_has_ngg { NGG_ON, }; -enum si_has_prim_discard_cs { - PRIM_DISCARD_CS_OFF, - PRIM_DISCARD_CS_ON, -}; - enum si_clear_code { DCC_CLEAR_COLOR_0000 = 0x00000000, @@ -170,8 +174,8 @@ enum si_clear_code DCC_UNCOMPRESSED = 0xFFFFFFFF, }; -#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8) -#define SI_IMAGE_ACCESS_DCC_WRITE (1 << 9) +#define SI_IMAGE_ACCESS_DCC_OFF (1 << 8) +#define SI_IMAGE_ACCESS_ALLOW_DCC_STORE (1 << 9) /* Debug flags. */ enum @@ -209,23 +213,20 @@ enum DBG_COMPUTE, DBG_VM, DBG_CACHE_STATS, + DBG_IB, /* Driver options: */ DBG_NO_WC, DBG_CHECK_VM, DBG_RESERVE_VMID, DBG_SHADOW_REGS, + DBG_NO_FAST_DISPLAY_LIST, /* 3D engine options: */ DBG_NO_GFX, DBG_NO_NGG, DBG_ALWAYS_NGG_CULLING_ALL, - DBG_ALWAYS_NGG_CULLING_TESS, DBG_NO_NGG_CULLING, - DBG_NO_FAST_LAUNCH, - DBG_ALWAYS_PD, - DBG_PD, - DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, @@ -237,9 +238,11 @@ enum DBG_NO_DISPLAY_DCC, DBG_NO_DCC, DBG_NO_DCC_CLEAR, - DBG_NO_DCC_FB, + DBG_NO_DCC_STORE, + DBG_DCC_STORE, DBG_NO_DCC_MSAA, DBG_NO_FMASK, + DBG_NO_DMA, DBG_TMZ, DBG_SQTT, @@ -278,6 +281,25 @@ enum si_coherency SI_COHERENCY_CP, }; +#define SI_BIND_CONSTANT_BUFFER_SHIFT 0 +#define SI_BIND_SHADER_BUFFER_SHIFT 6 +#define SI_BIND_IMAGE_BUFFER_SHIFT 12 +#define SI_BIND_SAMPLER_BUFFER_SHIFT 18 +#define SI_BIND_OTHER_BUFFER_SHIFT 24 + +/* Bind masks for all 6 shader stages. */ +#define SI_BIND_CONSTANT_BUFFER_ALL (0x3f << SI_BIND_CONSTANT_BUFFER_SHIFT) +#define SI_BIND_SHADER_BUFFER_ALL (0x3f << SI_BIND_SHADER_BUFFER_SHIFT) +#define SI_BIND_IMAGE_BUFFER_ALL (0x3f << SI_BIND_IMAGE_BUFFER_SHIFT) +#define SI_BIND_SAMPLER_BUFFER_ALL (0x3f << SI_BIND_SAMPLER_BUFFER_SHIFT) + +#define SI_BIND_CONSTANT_BUFFER(shader) ((1 << (shader)) << SI_BIND_CONSTANT_BUFFER_SHIFT) +#define SI_BIND_SHADER_BUFFER(shader) ((1 << (shader)) << SI_BIND_SHADER_BUFFER_SHIFT) +#define SI_BIND_IMAGE_BUFFER(shader) ((1 << (shader)) << SI_BIND_IMAGE_BUFFER_SHIFT) +#define SI_BIND_SAMPLER_BUFFER(shader) ((1 << (shader)) << SI_BIND_SAMPLER_BUFFER_SHIFT) +#define SI_BIND_VERTEX_BUFFER (1 << (SI_BIND_OTHER_BUFFER_SHIFT + 0)) +#define SI_BIND_STREAMOUT_BUFFER (1 << (SI_BIND_OTHER_BUFFER_SHIFT + 1)) + struct si_compute; struct si_shader_context; struct hash_table; @@ -292,15 +314,14 @@ struct si_resource { struct pb_buffer *buf; uint64_t gpu_address; /* Memory usage if the buffer placement is optimal. */ - uint32_t vram_usage_kb; - uint32_t gart_usage_kb; + uint32_t memory_usage_kb; /* Resource properties. */ uint64_t bo_size; uint8_t bo_alignment_log2; enum radeon_bo_domain domains:8; enum radeon_bo_flag flags:16; - unsigned bind_history; + unsigned bind_history; /* bitmask of SI_BIND_xxx_BUFFER */ /* The buffer range which is initialized (with a write transfer, * streamout, DMA, or as a random access target). 
The rest of @@ -526,7 +547,7 @@ struct si_screen { unsigned width, unsigned height, unsigned depth, uint32_t *state, uint32_t *fmask_state); - unsigned num_vbos_in_user_sgprs; + unsigned max_memory_usage_kb; unsigned pa_sc_raster_config; unsigned pa_sc_raster_config_1; unsigned se_tile_repeat; @@ -550,9 +571,11 @@ struct si_screen { bool use_ngg_culling; bool use_ngg_streamout; bool allow_dcc_msaa_clear_to_reg_for_bpp[5]; /* indexed by log2(Bpp) */ + bool always_allow_dcc_stores; struct { #define OPT_BOOL(name, dflt, description) bool name : 1; +#define OPT_INT(name, dflt, description) int name; #include "si_debug_options.h" } options; @@ -570,6 +593,10 @@ struct si_screen { struct pipe_context *aux_context; simple_mtx_t aux_context_lock; + /* Async compute context for DRI_PRIME copies. */ + struct pipe_context *async_compute_context; + simple_mtx_t async_compute_context_lock; + /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. */ @@ -623,7 +650,6 @@ struct si_screen { simple_mtx_t shader_parts_mutex; struct si_shader_part *vs_prologs; struct si_shader_part *tcs_epilogs; - struct si_shader_part *gs_prologs; struct si_shader_part *ps_prologs; struct si_shader_part *ps_epilogs; @@ -666,6 +692,7 @@ struct si_screen { unsigned ngg_subgroup_size; struct util_idalloc_mt buffer_ids; + struct util_vertex_state_cache vertex_state_cache; }; struct si_sampler_view { @@ -804,6 +831,8 @@ struct si_streamout { struct si_shader_ctx_state { struct si_shader_selector *cso; struct si_shader *current; + /* The shader variant key representing the current state. */ + union si_shader_key key; }; #define SI_NUM_VGT_PARAM_KEY_BITS 12 @@ -841,35 +870,6 @@ union si_vgt_param_key { uint16_t index; }; -#define SI_NUM_VGT_STAGES_KEY_BITS 6 -#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) - -/* The VGT_SHADER_STAGES key used to index the table of precomputed values. - * Some fields are set by state-change calls, most are set by draw_vbo. 
- */ -union si_vgt_stages_key { - struct { -#if UTIL_ARCH_LITTLE_ENDIAN - uint8_t tess : 1; - uint8_t gs : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg : 1; /* gfx10+ */ - uint8_t streamout : 1; /* only used with NGG */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; -#else /* UTIL_ARCH_BIG_ENDIAN */ - uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; - uint8_t streamout : 1; - uint8_t ngg : 1; - uint8_t ngg_passthrough : 1; - uint8_t ngg_gs_fast_launch : 1; - uint8_t gs : 1; - uint8_t tess : 1; -#endif - } u; - uint8_t index; -}; - struct si_texture_handle { unsigned desc_slot; bool desc_dirty; @@ -892,7 +892,6 @@ struct si_saved_cs { unsigned trace_id; unsigned gfx_last_dw; - unsigned compute_last_dw; bool flushed; int64_t time_flush; }; @@ -902,12 +901,24 @@ struct si_small_prim_cull_info { float small_prim_precision; }; +struct si_vertex_state { + struct pipe_vertex_state b; + struct si_vertex_elements velems; + uint32_t descriptors[4 * SI_MAX_ATTRIBS]; +}; + typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe, const struct pipe_draw_info *info, unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *draws, unsigned num_draws); +typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws); struct si_context { struct pipe_context b; /* base class */ @@ -918,6 +929,7 @@ struct si_context { struct radeon_winsys *ws; struct radeon_winsys_ctx *ctx; struct radeon_cmdbuf gfx_cs; /* compute IB if graphics is disabled */ + struct radeon_cmdbuf *sdma_cs; struct pipe_fence_handle *last_gfx_fence; struct si_resource *eop_bug_scratch; struct si_resource *eop_bug_scratch_tmz; @@ -958,7 +970,7 @@ struct si_context { void *cs_clear_render_target_1d_array; void *cs_clear_12bytes_buffer; void *cs_dcc_decompress; - void *cs_dcc_retile; + void *cs_dcc_retile[32]; void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */ struct si_screen *screen; struct pipe_debug_callback debug; @@ -986,32 +998,11 @@ struct si_context { unsigned last_num_draw_calls; unsigned flags; /* flush flags */ /* Current unaccounted memory usage. */ - uint32_t vram_kb; - uint32_t gtt_kb; + uint32_t memory_usage_kb; /* NGG streamout. */ struct pb_buffer *gds; struct pb_buffer *gds_oa; - /* Compute-based primitive discard. */ - unsigned prim_discard_vertex_count_threshold; - struct radeon_cmdbuf prim_discard_compute_cs; - struct si_shader *compute_ib_last_shader; - uint32_t compute_rewind_va; - unsigned compute_num_prims_in_batch; - /* index_ring is divided into 2 halves for doublebuffering. */ - struct si_resource *index_ring; - unsigned index_ring_base; /* offset of a per-IB portion */ - unsigned index_ring_offset; /* offset within a per-IB portion */ - unsigned index_ring_size_per_ib; /* max available size per IB */ - bool prim_discard_compute_ib_initialized; - /* For tracking the last execution barrier - it can be either - * a WRITE_DATA packet or a fence. */ - uint32_t *last_pkt3_write_data; - struct si_resource *barrier_buf; - unsigned barrier_buf_offset; - struct pipe_fence_handle *last_ib_barrier_fence; - struct si_resource *last_ib_barrier_buf; - unsigned last_ib_barrier_buf_offset; /* Atoms (direct states). 
*/ union si_state_atoms atoms; @@ -1060,28 +1051,27 @@ struct si_context { /* indexed access using pipe_shader_type (not by MESA_SHADER_*) */ struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS]; }; - struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ + uint64_t ps_inputs_read_or_disabled; struct si_vertex_elements *vertex_elements; unsigned num_vertex_elements; - unsigned sprite_coord_enable; unsigned cs_max_waves_per_sh; - bool flatshade; + bool uses_nontrivial_vs_prolog; + bool force_trivial_vs_prolog; bool do_update_shaders; bool compute_shaderbuf_sgprs_dirty; bool compute_image_sgprs_dirty; bool vs_uses_base_instance; bool vs_uses_draw_id; + uint8_t patch_vertices; /* shader descriptors */ struct si_descriptors descriptors[SI_NUM_DESCS]; unsigned descriptors_dirty; unsigned shader_pointers_dirty; unsigned shader_needs_decompress_mask; - unsigned inlinable_uniforms_valid_mask; - uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS]; struct si_buffer_resources internal_bindings; struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; struct si_samplers samplers[SI_NUM_SHADERS]; @@ -1136,11 +1126,7 @@ struct si_context { bool allow_flat_shading : 1; /* Emitted draw state. */ - bool gs_tri_strip_adj_fix : 1; - bool ls_vgpr_fix : 1; - bool prim_discard_cs_instancing : 1; bool ngg : 1; - bool same_patch_vertices : 1; uint8_t ngg_culling; unsigned last_index_size; int last_base_vertex; @@ -1251,9 +1237,6 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned compute_num_verts_accepted; - unsigned compute_num_verts_rejected; - unsigned compute_num_verts_ineligible; /* due to low vertex count */ unsigned context_roll; /* Queries. 
*/ @@ -1284,9 +1267,12 @@ struct si_context { */ struct hash_table *dirty_implicit_resources; - pipe_draw_vbo_func draw_vbo[2][2][2][2]; + pipe_draw_vbo_func draw_vbo[2][2][2]; + pipe_draw_vertex_state_func draw_vertex_state[2][2][2]; /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ pipe_draw_vbo_func real_draw_vbo; + pipe_draw_vertex_state_func real_draw_vertex_state; + void (*emit_spi_map[33])(struct si_context *sctx); /* SQTT */ struct ac_thread_trace_data *thread_trace; @@ -1381,6 +1367,7 @@ void si_init_clear_functions(struct si_context *sctx); #define SI_OP_CS_IMAGE (1 << 5) #define SI_OP_CS_RENDER_COND_ENABLE (1 << 6) #define SI_OP_CPDMA_SKIP_CHECK_CS_SPACE (1 << 7) /* don't call need_cs_space */ +#define SI_OP_SYNC_GE_BEFORE (1 << 8) /* only sync VS, TCS, TES, GS */ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, enum si_cache_policy cache_policy); @@ -1456,6 +1443,7 @@ void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct si_shader_binary *binary); +void si_print_current_ib(struct si_context *sctx, FILE *f); /* si_fence.c */ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event, @@ -1473,14 +1461,14 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, /* si_get.c */ void si_init_screen_get_functions(struct si_screen *sscreen); +bool si_sdma_copy_image(struct si_context *ctx, struct si_texture *dst, struct si_texture *src); + /* si_gfx_cs.c */ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_allocate_gds(struct si_context *ctx); void si_set_tracked_regs_to_clear_state(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs); -void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws); void si_trace_emit(struct si_context *sctx); -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl); void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); @@ -1488,7 +1476,8 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); /* Replace the sctx->b.draw_vbo function with a wrapper. This can be used to implement * optimizations without affecting the normal draw_vbo functions' perf. 
*/ -void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper); +void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper, + pipe_draw_vertex_state_func vstate_wrapper); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); @@ -1499,34 +1488,9 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_init_compute_functions(struct si_context *sctx); -/* si_compute_prim_discard.c */ -enum si_prim_discard_outcome -{ - SI_PRIM_DISCARD_ENABLED, - SI_PRIM_DISCARD_DISABLED, - SI_PRIM_DISCARD_DRAW_SPLIT, - SI_PRIM_DISCARD_MULTI_DRAW_SPLIT, -}; - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned total_count); -void si_compute_signal_gfx(struct si_context *sctx); -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned index_size, - unsigned total_count, uint64_t input_indexbuf_va, - unsigned index_max_size); -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib); - /* si_pipe.c */ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); +void si_init_aux_async_compute_ctx(struct si_screen *sscreen); /* si_perfcounters.c */ void si_init_perfcounters(struct si_screen *screen); @@ -1694,8 +1658,7 @@ static inline void si_context_add_resource_size(struct si_context *sctx, struct { if (r) { /* Add memory usage for need_gfx_cs_space */ - sctx->vram_kb += si_resource(r)->vram_usage_kb; - sctx->gtt_kb += si_resource(r)->gart_usage_kb; + sctx->memory_usage_kb += si_resource(r)->memory_usage_kb; } } @@ -1855,7 +1818,19 @@ static inline bool si_htile_enabled(struct si_texture *tex, unsigned level, unsi if (zs_mask == PIPE_MASK_S && (tex->htile_stencil_disabled || !tex->surface.has_stencil)) return false; - return tex->is_depth && tex->surface.meta_offset && level < tex->surface.num_meta_levels; + if (!tex->is_depth || !tex->surface.meta_offset) + return false; + + struct si_screen *sscreen = (struct si_screen *)tex->buffer.b.b.screen; + if (sscreen->info.chip_class >= GFX8) { + return level < tex->surface.num_meta_levels; + } else { + /* GFX6-7 don't have TC-compatible HTILE, which means they have to run + * a decompression pass for every mipmap level before texturing, so compress + * only one level to reduce the number of decompression passes to a minimum. 
+ */ + return level == 0; + } } static inline bool vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level, @@ -1897,6 +1872,12 @@ static inline unsigned si_get_total_colormask(struct si_context *sctx) ((1 << PIPE_PRIM_LINES) | (1 << PIPE_PRIM_LINE_LOOP) | (1 << PIPE_PRIM_LINE_STRIP) | \ (1 << PIPE_PRIM_LINES_ADJACENCY) | (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY)) +#define UTIL_ALL_PRIM_TRIANGLE_MODES \ + ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | \ + (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | \ + (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | \ + (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)) + static inline bool util_prim_is_lines(unsigned prim) { return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0; @@ -1909,11 +1890,12 @@ static inline bool util_prim_is_points_or_lines(unsigned prim) static inline bool util_rast_prim_is_triangles(unsigned prim) { - return ((1 << prim) & - ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP) | - (1 << PIPE_PRIM_TRIANGLE_FAN) | (1 << PIPE_PRIM_QUADS) | (1 << PIPE_PRIM_QUAD_STRIP) | - (1 << PIPE_PRIM_POLYGON) | (1 << PIPE_PRIM_TRIANGLES_ADJACENCY) | - (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))); + return ((1 << prim) & UTIL_ALL_PRIM_TRIANGLE_MODES) != 0; +} + +static inline bool util_rast_prim_is_lines_or_triangles(unsigned prim) +{ + return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES | UTIL_ALL_PRIM_TRIANGLE_MODES)) != 0; } /** @@ -1924,17 +1906,27 @@ static inline bool util_rast_prim_is_triangles(unsigned prim) * \param gtt GTT memory size not added to the buffer list yet */ static inline bool radeon_cs_memory_below_limit(struct si_screen *screen, struct radeon_cmdbuf *cs, - uint32_t vram_kb, uint32_t gtt_kb) + uint32_t kb) { - vram_kb += cs->used_vram_kb; - gtt_kb += cs->used_gart_kb; + return kb + cs->used_vram_kb + cs->used_gart_kb < screen->max_memory_usage_kb; +} - /* Anything that goes above the VRAM size should go to GTT. */ - if (vram_kb > screen->info.vram_size_kb) - gtt_kb += vram_kb - screen->info.vram_size_kb; +static inline void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws) +{ + struct radeon_cmdbuf *cs = &ctx->gfx_cs; - /* Now we just need to check if we have enough GTT (the limit is 75% of max). */ - return gtt_kb < screen->info.gart_size_kb / 4 * 3; + /* There are two memory usage counters in the winsys for all buffers + * that have been added (cs_add_buffer) and one counter in the pipe + * driver for those that haven't been added yet. 
+ */ + uint32_t kb = ctx->memory_usage_kb; + ctx->memory_usage_kb = 0; + + if (radeon_cs_memory_below_limit(ctx->screen, &ctx->gfx_cs, kb) && + ctx->ws->cs_check_space(cs, si_get_minimum_num_gfx_cs_dwords(ctx, num_draws))) + return; + + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); } /** @@ -1978,30 +1970,20 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc bool check_mem) { if (check_mem && - !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->vram_kb + bo->vram_usage_kb, - sctx->gtt_kb + bo->gart_usage_kb)) + !radeon_cs_memory_below_limit(sctx->screen, &sctx->gfx_cs, sctx->memory_usage_kb + bo->memory_usage_kb)) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority); } -static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) -{ - return sctx->prim_discard_vertex_count_threshold != UINT_MAX; -} - static inline unsigned si_get_wave_size(struct si_screen *sscreen, - gl_shader_stage stage, bool ngg, bool es, - bool gs_fast_launch, bool prim_discard_cs) + gl_shader_stage stage, bool ngg, bool es) { if (stage == MESA_SHADER_COMPUTE) return sscreen->compute_wave_size; else if (stage == MESA_SHADER_FRAGMENT) return sscreen->ps_wave_size; - else if (gs_fast_launch) - return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */ - else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ - (stage == MESA_SHADER_VERTEX && es && !ngg) || + else if ((stage == MESA_SHADER_VERTEX && es && !ngg) || (stage == MESA_SHADER_TESS_EVAL && es && !ngg) || (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ return 64; @@ -2011,33 +1993,36 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen, static inline unsigned si_get_shader_wave_size(struct si_shader *shader) { + if (shader->selector->info.stage <= MESA_SHADER_GEOMETRY) { + return si_get_wave_size(shader->selector->screen, shader->selector->info.stage, + shader->key.ge.as_ngg, + shader->key.ge.as_es); + } + return si_get_wave_size(shader->selector->screen, shader->selector->info.stage, - shader->key.as_ngg, - shader->key.as_es, - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader->key.opt.vs_as_prim_discard_cs); + false, false); } static inline void si_select_draw_vbo(struct si_context *sctx) { - bool has_prim_discard_cs = si_compute_prim_discard_enabled(sctx) && - !sctx->shader.tes.cso && !sctx->shader.gs.cso; pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso] [!!sctx->shader.gs.cso] - [sctx->ngg] - [has_prim_discard_cs]; + [sctx->ngg]; + pipe_draw_vertex_state_func draw_vertex_state = + sctx->draw_vertex_state[!!sctx->shader.tes.cso] + [!!sctx->shader.gs.cso] + [sctx->ngg]; assert(draw_vbo); - if (unlikely(sctx->real_draw_vbo)) - sctx->real_draw_vbo = draw_vbo; - else - sctx->b.draw_vbo = draw_vbo; + assert(draw_vertex_state); - if (!has_prim_discard_cs) { - /* Reset this to false if prim discard CS is disabled because draw_vbo doesn't reset it. 
*/ - if (sctx->prim_discard_cs_instancing) { - sctx->do_update_shaders = true; - sctx->prim_discard_cs_instancing = false; - } + if (unlikely(sctx->real_draw_vbo)) { + assert(sctx->real_draw_vertex_state); + sctx->real_draw_vbo = draw_vbo; + sctx->real_draw_vertex_state = draw_vertex_state; + } else { + assert(!sctx->real_draw_vertex_state); + sctx->b.draw_vbo = draw_vbo; + sctx->b.draw_vertex_state = draw_vertex_state; } } @@ -2055,6 +2040,20 @@ static inline unsigned si_get_num_coverage_samples(struct si_context *sctx) return 1; } +static unsigned ALWAYS_INLINE +si_num_vbos_in_user_sgprs_inline(enum chip_class chip_class) +{ + /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't + * have to allocate and count references for the upload buffer. + */ + return chip_class >= GFX9 ? 5 : 1; +} + +static inline unsigned si_num_vbos_in_user_sgprs(struct si_screen *sscreen) +{ + return si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); +} + #define PRINT_ERR(fmt, args...) \ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.c index 22b6e3ad51..ae4affa1b9 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.c @@ -117,13 +117,13 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - if (state->shader) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, state->shader->bo, + if (state->is_shader) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); } radeon_begin(cs); - radeon_emit_array(cs, state->pm4, state->ndw); + radeon_emit_array(state->pm4, state->ndw); radeon_end(); if (state->atom.emit) @@ -139,7 +139,7 @@ void si_pm4_reset_emitted(struct si_context *sctx, bool first_cs) for (unsigned i = 0; i < SI_NUM_STATES; i++) { struct si_pm4_state *state = sctx->emitted.array[i]; - if (state && state->shader) { + if (state && state->is_shader) { sctx->emitted.array[i] = NULL; sctx->dirty_states |= 1 << i; } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.h index 06909ff1a9..03f79e0ba3 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_pm4.h @@ -54,7 +54,7 @@ struct si_pm4_state { uint32_t pm4[SI_PM4_MAX_DW]; /* For shader states only */ - struct si_shader *shader; + bool is_shader; struct si_atom atom; }; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.c index eba6af47a9..5fd0dd8d90 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.c @@ -260,15 +260,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) case SI_QUERY_DISK_SHADER_CACHE_MISSES: query->begin_result = sctx->screen->num_disk_shader_cache_misses; break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->begin_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->begin_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->begin_result = sctx->compute_num_verts_ineligible; - break; case SI_QUERY_GPIN_ASIC_ID: case SI_QUERY_GPIN_NUM_SIMD: case 
SI_QUERY_GPIN_NUM_RB: @@ -429,15 +420,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) case SI_QUERY_DISK_SHADER_CACHE_MISSES: query->end_result = sctx->screen->num_disk_shader_cache_misses; break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->end_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->end_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->end_result = sctx->compute_num_verts_ineligible; - break; case SI_QUERY_GPIN_ASIC_ID: case SI_QUERY_GPIN_NUM_SIMD: case SI_QUERY_GPIN_NUM_RB: @@ -479,11 +461,6 @@ static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squ result->u64 = (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time); return true; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3; - return true; case SI_QUERY_GPIN_ASIC_ID: result->u32 = 0; return true; @@ -781,10 +758,10 @@ static unsigned event_type_for_stream(unsigned stream) static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream) { radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3)); + radeon_emit(va); + radeon_emit(va >> 32); radeon_end(); } @@ -798,10 +775,10 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(va); + radeon_emit(va >> 32); radeon_end(); break; } @@ -821,10 +798,10 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h break; case PIPE_QUERY_PIPELINE_STATISTICS: { radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(va); + radeon_emit(va >> 32); radeon_end(); break; } @@ -866,10 +843,10 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: { va += 8; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1)); + radeon_emit(va); + radeon_emit(va >> 32); radeon_end(); fence_va = va + sctx->screen->info.max_render_backends * 16 - 8; @@ -900,10 +877,10 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw va += sample_size; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0)); - radeon_emit(cs, 
EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 2, 0)); + radeon_emit(EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2)); + radeon_emit(va); + radeon_emit(va >> 32); radeon_end(); fence_va = va + sample_size; @@ -959,14 +936,14 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, radeon_begin(cs); if (ctx->chip_class >= GFX9) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0)); - radeon_emit(cs, op); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); + radeon_emit(PKT3(PKT3_SET_PREDICATION, 2, 0)); + radeon_emit(op); + radeon_emit(va); + radeon_emit(va >> 32); } else { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, va); - radeon_emit(cs, op | ((va >> 32) & 0xFF)); + radeon_emit(PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(va); + radeon_emit(op | ((va >> 32) & 0xFF)); } radeon_end(); @@ -1758,10 +1735,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = { X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), - - X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE), - X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE), - X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE), }; #undef X diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.h index b1654106b1..b0e1137385 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_query.h @@ -111,9 +111,6 @@ enum SI_QUERY_GPIN_NUM_RB, SI_QUERY_GPIN_NUM_SPI, SI_QUERY_GPIN_NUM_SE, - SI_QUERY_PD_NUM_PRIMS_ACCEPTED, - SI_QUERY_PD_NUM_PRIMS_REJECTED, - SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, SI_QUERY_LIVE_SHADER_CACHE_HITS, SI_QUERY_LIVE_SHADER_CACHE_MISSES, SI_QUERY_MEMORY_SHADER_CACHE_HITS, diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_sdma_copy_image.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_sdma_copy_image.c new file mode 100644 index 0000000000..3120add84e --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_sdma_copy_image.c @@ -0,0 +1,481 @@ +/* + * Copyright 2010 Jerome Glisse + * Copyright 2015-2021 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "si_build_pm4.h" +#include "sid.h" +#include "util/u_memory.h" + + +static +bool si_prepare_for_sdma_copy(struct si_context *sctx, struct si_texture *dst,struct si_texture *src) +{ + if (dst->surface.bpe != src->surface.bpe) + return false; + + /* MSAA: Blits don't exist in the real world. */ + if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1) + return false; + + if (dst->buffer.b.b.last_level != 0 || src->buffer.b.b.last_level != 0) + return false; + + return true; +} + +static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w) +{ + width = u_minify(width, level); + return DIV_ROUND_UP(width, blk_w); +} + +static unsigned encode_legacy_tile_info(struct si_context *sctx, struct si_texture *tex) +{ + struct radeon_info *info = &sctx->screen->info; + unsigned tile_index = tex->surface.u.legacy.tiling_index[0]; + unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index; + unsigned tile_mode = info->si_tile_mode_array[tile_index]; + unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index]; + + return util_logbase2(tex->surface.bpe) | + (G_009910_ARRAY_MODE(tile_mode) << 3) | + (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) | + /* Non-depth modes don't have TILE_SPLIT set. */ + ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) | + (G_009990_BANK_WIDTH(macro_tile_mode) << 15) | + (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) | + (G_009990_NUM_BANKS(macro_tile_mode) << 21) | + (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) | + (G_009910_PIPE_CONFIG(tile_mode) << 26); +} + +static +bool si_translate_format_to_hw(struct si_context *sctx, enum pipe_format format, unsigned *hw_fmt, unsigned *hw_type) +{ + const struct util_format_description *desc = util_format_description(format); + *hw_fmt = si_translate_colorformat(sctx->chip_class, format); + + int firstchan; + for (firstchan = 0; firstchan < 4; firstchan++) { + if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) { + break; + } + } + if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) { + *hw_type = V_028C70_NUMBER_FLOAT; + } else { + *hw_type = V_028C70_NUMBER_UNORM; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + *hw_type = V_028C70_NUMBER_SRGB; + else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) { + if (desc->channel[firstchan].pure_integer) { + *hw_type = V_028C70_NUMBER_SINT; + } else { + assert(desc->channel[firstchan].normalized); + *hw_type = V_028C70_NUMBER_SNORM; + } + } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) { + if (desc->channel[firstchan].pure_integer) { + *hw_type = V_028C70_NUMBER_UINT; + } else { + assert(desc->channel[firstchan].normalized); + *hw_type = V_028C70_NUMBER_UNORM; + } + } else { + return false; + } + } + return true; +} + +static +bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_texture *sdst, struct si_texture *ssrc, bool is_v5) +{ + unsigned bpp = sdst->surface.bpe; + uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset; + uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset; + unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch; + unsigned src_pitch = 
ssrc->surface.u.gfx9.surf_pitch; + unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w); + unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h); + + bool tmz = (ssrc->buffer.flags & RADEON_FLAG_ENCRYPTED); + assert (!tmz || (sdst->buffer.flags & RADEON_FLAG_ENCRYPTED)); + + /* Linear -> linear sub-window copy. */ + if (ssrc->surface.is_linear && sdst->surface.is_linear) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + unsigned bytes = src_pitch * copy_height * bpp; + + if (!(bytes < (1u << 22))) + return false; + + src_address += ssrc->surface.u.gfx9.offset[0]; + dst_address += sdst->surface.u.gfx9.offset[0]; + + radeon_begin(cs); + radeon_emit(CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_LINEAR, + (tmz ? 4 : 0))); + radeon_emit(bytes); + radeon_emit(0); + radeon_emit(src_address); + radeon_emit(src_address >> 32); + radeon_emit(dst_address); + radeon_emit(dst_address >> 32); + radeon_end(); + return true; + } + + /* Linear <-> Tiled sub-window copy */ + if (ssrc->surface.is_linear != sdst->surface.is_linear) { + struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc; + struct si_texture *linear = tiled == ssrc ? sdst : ssrc; + unsigned tiled_width = DIV_ROUND_UP(tiled->buffer.b.b.width0, tiled->surface.blk_w); + unsigned tiled_height = DIV_ROUND_UP(tiled->buffer.b.b.height0, tiled->surface.blk_h); + unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; + unsigned linear_slice_pitch = ((uint64_t)linear->surface.u.gfx9.surf_slice_size) / bpp; + uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; + uint64_t linear_address = linear == ssrc ? src_address : dst_address; + struct radeon_cmdbuf *cs = sctx->sdma_cs; + /* Only SDMA 5 supports DCC with SDMA */ + bool dcc = vi_dcc_enabled(tiled, 0); + assert(!dcc || is_v5); + assert(tiled->buffer.b.b.depth0 == 1); + + linear_address += linear->surface.u.gfx9.offset[0]; + + /* Check if everything fits into the bitfields */ + if (!(tiled_width < (1 << 14) && tiled_height < (1 << 14) && + linear_pitch < (1 << 14) && linear_slice_pitch < (1 << 28) && + copy_width < (1 << 14) && copy_height < (1 << 14))) + return false; + + radeon_begin(cs); + radeon_emit( + CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, + (tmz ? 4 : 0)) | + dcc << 19 | + (is_v5 ? 0 : tiled->buffer.b.b.last_level) << 20 | + (linear == sdst ? 1u : 0) << 31); + radeon_emit((uint32_t)tiled_address | (tiled->surface.tile_swizzle << 8)); + radeon_emit((uint32_t)(tiled_address >> 32)); + radeon_emit(0); + radeon_emit(((tiled_width - 1) << 16)); + radeon_emit((tiled_height - 1)); + radeon_emit(util_logbase2(bpp) | + tiled->surface.u.gfx9.swizzle_mode << 3 | + tiled->surface.u.gfx9.resource_type << 9 | + (is_v5 ? 
tiled->buffer.b.b.last_level : tiled->surface.u.gfx9.epitch) << 16); + radeon_emit((uint32_t)linear_address); + radeon_emit((uint32_t)(linear_address >> 32)); + radeon_emit(0); + radeon_emit(((linear_pitch - 1) << 16)); + radeon_emit(linear_slice_pitch - 1); + radeon_emit((copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(0); + + if (dcc) { + unsigned hw_fmt, hw_type; + uint64_t md_address = tiled_address + tiled->surface.meta_offset; + + si_translate_format_to_hw(sctx, tiled->buffer.b.b.format, &hw_fmt, &hw_type); + + /* Add metadata */ + radeon_emit((uint32_t)md_address); + radeon_emit((uint32_t)(md_address >> 32)); + radeon_emit(hw_fmt | + vi_alpha_is_on_msb(sctx->screen, tiled->buffer.b.b.format) << 8 | + hw_type << 9 | + tiled->surface.u.gfx9.color.dcc.max_compressed_block_size << 24 | + V_028C78_MAX_BLOCK_SIZE_256B << 26 | + tmz << 29 | + tiled->surface.u.gfx9.color.dcc.pipe_aligned << 31); + } + radeon_end(); + return true; + } + + return false; +} + +static +bool cik_sdma_copy_texture(struct si_context *sctx, struct si_texture *sdst, struct si_texture *ssrc) +{ + struct radeon_info *info = &sctx->screen->info; + unsigned bpp = sdst->surface.bpe; + uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[0].offset_256B * 256; + uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[0].offset_256B * 256; + unsigned dst_mode = sdst->surface.u.legacy.level[0].mode; + unsigned src_mode = ssrc->surface.u.legacy.level[0].mode; + unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[0]; + unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[0]; + unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index]; + unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index]; + unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode); + unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode); + unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0; + unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0; + unsigned dst_pitch = sdst->surface.u.legacy.level[0].nblk_x; + unsigned src_pitch = ssrc->surface.u.legacy.level[0].nblk_x; + uint64_t dst_slice_pitch = + ((uint64_t)sdst->surface.u.legacy.level[0].slice_size_dw * 4) / bpp; + uint64_t src_slice_pitch = + ((uint64_t)ssrc->surface.u.legacy.level[0].slice_size_dw * 4) / bpp; + unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, 0, sdst->surface.blk_w); + unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, 0, ssrc->surface.blk_w); + unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w); + unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h); + + dst_address |= dst_tile_swizzle << 8; + src_address |= src_tile_swizzle << 8; + + /* Linear -> linear sub-window copy. 
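+ * All pitch/extent fields of the packet are narrow bitfields, so the
+ * copy is only attempted when every dimension passes the range checks
+ * below (e.g. copy_width <= 1 << 14, and strictly less than 1 << 14
+ * on GFX7).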
*/ + if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && + /* check if everything fits into the bitfields */ + src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) && + dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) && + /* HW limitation - GFX7: */ + (sctx->chip_class != GFX7 || + (copy_width < (1 << 14) && copy_height < (1 << 14))) && + /* HW limitation - some GFX7 parts: */ + ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) || + (copy_width != (1 << 14) && copy_height != (1 << 14)))) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + + radeon_begin(cs); + radeon_emit(CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) | + (util_logbase2(bpp) << 29)); + radeon_emit(src_address); + radeon_emit(src_address >> 32); + radeon_emit(0); + radeon_emit((src_pitch - 1) << 16); + radeon_emit(src_slice_pitch - 1); + radeon_emit(dst_address); + radeon_emit(dst_address >> 32); + radeon_emit(0); + radeon_emit((dst_pitch - 1) << 16); + radeon_emit(dst_slice_pitch - 1); + if (sctx->chip_class == GFX7) { + radeon_emit(copy_width | (copy_height << 16)); + radeon_emit(0); + } else { + radeon_emit((copy_width - 1) | ((copy_height - 1) << 16)); + radeon_emit(0); + } + radeon_end(); + return true; + } + + /* Tiled <-> linear sub-window copy. */ + if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) { + struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst; + struct si_texture *linear = tiled == ssrc ? sdst : ssrc; + unsigned tiled_width = tiled == ssrc ? src_width : dst_width; + unsigned linear_width = linear == ssrc ? src_width : dst_width; + unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch; + unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch; + unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch; + unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch; + uint64_t tiled_address = tiled == ssrc ? src_address : dst_address; + uint64_t linear_address = linear == ssrc ? src_address : dst_address; + unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode; + + assert(tiled_pitch % 8 == 0); + assert(tiled_slice_pitch % 64 == 0); + unsigned pitch_tile_max = tiled_pitch / 8 - 1; + unsigned slice_tile_max = tiled_slice_pitch / 64 - 1; + unsigned xalign = MAX2(1, 4 / bpp); + unsigned copy_width_aligned = copy_width; + + /* If the region ends at the last pixel and is unaligned, we + * can copy the remainder of the line that is not visible to + * make it aligned. + */ + if (copy_width % xalign != 0 && 0 + copy_width == linear_width && + copy_width == tiled_width && + align(copy_width, xalign) <= linear_pitch && + align(copy_width, xalign) <= tiled_pitch) + copy_width_aligned = align(copy_width, xalign); + + /* HW limitations. */ + if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) && + linear_pitch - 1 == 0x3fff && bpp == 16) + return false; + + if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI || + sctx->family == CHIP_KABINI) && + (copy_width == (1 << 14) || copy_height == (1 << 14))) + return false; + + /* The hw can read outside of the given linear buffer bounds, + * or access those pages but not touch the memory in case + * of writes. (it still causes a VM fault) + * + * Out-of-bounds memory access or page directory access must + * be prevented. 
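+ * The checks below compute the first and last linear byte the engine
+ * may touch, rounded out to the micro-tile read granularity, and give
+ * up if either falls outside the linear surface. E.g. for DISPLAY
+ * micro-tiling with bpp == 4 the read granularity is
+ * 128 / (8 * 4) = 4 elements.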
+ */ + int64_t start_linear_address, end_linear_address; + unsigned granularity; + + /* Deduce the size of reads from the linear surface. */ + switch (tiled_micro_mode) { + case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING: + granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp); + break; + case V_009910_ADDR_SURF_THIN_MICRO_TILING: + case V_009910_ADDR_SURF_DEPTH_MICRO_TILING: + if (0 /* TODO: THICK microtiling */) + granularity = + bpp == 1 ? 32 / (8 * bpp) + : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp); + else + granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp); + break; + default: + return false; + } + + /* The linear reads start at tiled_x & ~(granularity - 1). + * If linear_x == 0 && tiled_x % granularity != 0, the hw + * starts reading from an address preceding linear_address!!! + */ + start_linear_address = + linear->surface.u.legacy.level[0].offset_256B * 256; + + end_linear_address = + linear->surface.u.legacy.level[0].offset_256B * 256 + + bpp * ((copy_height - 1) * linear_pitch + copy_width); + + if ((0 + copy_width) % granularity) + end_linear_address += granularity - (0 + copy_width) % granularity; + + if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size) + return false; + + /* Check requirements. */ + if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 && + copy_width_aligned % xalign == 0 && + tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING && + /* check if everything fits into the bitfields */ + tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) && + slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) && + linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) && + copy_height <= (1 << 14)) { + struct radeon_cmdbuf *cs = sctx->sdma_cs; + uint32_t direction = linear == sdst ? 1u << 31 : 0; + + radeon_begin(cs); + radeon_emit(CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, + CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) | + direction); + radeon_emit(tiled_address); + radeon_emit(tiled_address >> 32); + radeon_emit(0); + radeon_emit(pitch_tile_max << 16); + radeon_emit(slice_tile_max); + radeon_emit(encode_legacy_tile_info(sctx, tiled)); + radeon_emit(linear_address); + radeon_emit(linear_address >> 32); + radeon_emit(0); + radeon_emit(((linear_pitch - 1) << 16)); + radeon_emit(linear_slice_pitch - 1); + if (sctx->chip_class == GFX7) { + radeon_emit(copy_width_aligned | (copy_height << 16)); + radeon_emit(1); + } else { + radeon_emit((copy_width_aligned - 1) | ((copy_height - 1) << 16)); + radeon_emit(0); + } + radeon_end(); + return true; + } + } + + return false; +} + +bool si_sdma_copy_image(struct si_context *sctx, struct si_texture *dst, struct si_texture *src) +{ + struct radeon_winsys *ws = sctx->ws; + + if (!sctx->sdma_cs) { + if (sctx->screen->debug_flags & DBG(NO_DMA) || sctx->chip_class < GFX7) + return false; + + sctx->sdma_cs = CALLOC_STRUCT(radeon_cmdbuf); + if (ws->cs_create(sctx->sdma_cs, sctx->ctx, RING_DMA, + NULL, NULL, true)) + return false; + } + + if (!si_prepare_for_sdma_copy(sctx, dst, src)) + return false; + + /* Decompress DCC on older chips */ + if (vi_dcc_enabled(src, 0) && sctx->chip_class < GFX10) + si_decompress_dcc(sctx, src); + /* TODO: DCC compression is possible on GFX10+. See si_set_mutable_tex_desc_fields for + * additional constraints. + * For now, the only use-case of SDMA is DRI_PRIME tiled->linear copy, so this is not + * implemented. 
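+ * Keeping dst DCC compressed would presumably also require
+ * programming the metadata words of the copy packet, much like the
+ * tiled<->linear path in si_sdma_v4_v5_copy_texture() does when
+ * vi_dcc_enabled() is set on the tiled side.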
*/ + if (vi_dcc_enabled(dst, 0)) + return false; + + /* Always flush the gfx queue to get the winsys to handle the dependencies for us. */ + si_flush_gfx_cs(sctx, 0, NULL); + + switch (sctx->chip_class) { + case GFX7: + case GFX8: + if (!cik_sdma_copy_texture(sctx, dst, src)) + return false; + break; + case GFX9: + case GFX10: + case GFX10_3: + if (!si_sdma_v4_v5_copy_texture(sctx, dst, src, sctx->chip_class >= GFX10)) + return false; + break; + default: + return false; + } + + radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &src->buffer, RADEON_USAGE_READ, + RADEON_PRIO_SAMPLER_TEXTURE); + radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &dst->buffer, RADEON_USAGE_WRITE, + RADEON_PRIO_SAMPLER_TEXTURE); + + unsigned flags = RADEON_FLUSH_START_NEXT_GFX_IB_NOW; + if (unlikely(radeon_uses_secure_bos(sctx->ws))) { + if ((bool) (src->buffer.flags & RADEON_FLAG_ENCRYPTED) != + sctx->ws->cs_is_secure(sctx->sdma_cs)) { + flags = RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION; + } + } + + return ws->cs_flush(sctx->sdma_cs, flags, NULL) == 0; +} diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.c index 0cdd3a9f53..c8892a31b5 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.c @@ -42,10 +42,11 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f); /** Whether the shader runs as a combination of multiple API shaders */ bool si_is_multi_part_shader(struct si_shader *shader) { - if (shader->selector->screen->info.chip_class <= GFX8) + if (shader->selector->screen->info.chip_class <= GFX8 || + shader->selector->info.stage > MESA_SHADER_GEOMETRY) return false; - return shader->key.as_ls || shader->key.as_es || + return shader->key.ge.as_ls || shader->key.ge.as_es || shader->selector->info.stage == MESA_SHADER_TESS_CTRL || shader->selector->info.stage == MESA_SHADER_GEOMETRY; } @@ -53,7 +54,10 @@ bool si_is_multi_part_shader(struct si_shader *shader) /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */ bool si_is_merged_shader(struct si_shader *shader) { - return shader->key.as_ngg || si_is_multi_part_shader(shader); + if (shader->selector->info.stage > MESA_SHADER_GEOMETRY) + return false; + + return shader->key.ge.as_ngg || si_is_multi_part_shader(shader); } /** @@ -200,7 +204,7 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) switch (shader->selector->info.stage) { case MESA_SHADER_VERTEX: case MESA_SHADER_TESS_EVAL: - return shader->key.as_ngg ? 128 : 0; + return shader->key.ge.as_ngg ? 128 : 0; case MESA_SHADER_TESS_CTRL: /* Return this so that LLVM doesn't remove s_barrier @@ -300,7 +304,7 @@ static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_ struct si_shader *shader = ctx->shader; ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); - if (shader->key.as_ls) { + if (shader->key.ge.as_ls) { ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id); if (ctx->screen->info.chip_class >= GFX10) { ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ @@ -384,10 +388,10 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) memset(&ctx->args, 0, sizeof(ctx->args)); /* Set MERGED shaders. 
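 * On GFX9+ the HW merges LS+HS and ES+GS into single stages, so the
 * argument layout is declared for the merged stage rather than the
 * API stage: VS-as-LS and TCS map to SI_SHADER_MERGED_VERTEX_TESSCTRL,
 * and VS/TES-as-ES, NGG and GS map to
 * SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY.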
*/ - if (ctx->screen->info.chip_class >= GFX9) { - if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL) + if (ctx->screen->info.chip_class >= GFX9 && stage <= MESA_SHADER_GEOMETRY) { + if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL) stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ - else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY) + else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY) stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; } @@ -408,9 +412,9 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) if (!shader->is_gs_copy_shader) declare_vb_descriptor_input_sgprs(ctx); - if (shader->key.as_es) { + if (shader->key.ge.as_es) { ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); - } else if (shader->key.as_ls) { + } else if (shader->key.ge.as_ls) { /* no extra parameters */ } else { /* The locations of the other parameters are assigned dynamically. */ @@ -419,12 +423,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) /* VGPRs */ declare_vs_input_vgprs(ctx, &num_prolog_vgprs); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); - } break; case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ @@ -485,14 +483,14 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) ac_add_return(&ctx->args, AC_ARG_VGPR); /* VS outputs passed via VGPRs to TCS. */ - if (shader->key.opt.same_patch_vertices) { + if (shader->key.ge.opt.same_patch_vertices) { unsigned num_outputs = util_last_bit64(shader->selector->outputs_written); for (i = 0; i < num_outputs * 4; i++) ac_add_return(&ctx->args, AC_ARG_VGPR); } } else { /* TCS inputs are passed via VGPRs from VS. 
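 * With same_patch_vertices every output of the previous stage occupies
 * four VGPR arguments (one per component), mirroring the VGPR returns
 * declared on the VS side above; e.g. a VS writing 5 outputs
 * contributes 5 * 4 = 20 VGPRs.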
*/ - if (shader->key.opt.same_patch_vertices) { + if (shader->key.ge.opt.same_patch_vertices) { unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written); for (i = 0; i < num_inputs * 4; i++) ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); @@ -516,7 +514,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY); - if (ctx->shader->key.as_ngg) + if (ctx->shader->key.ge.as_ngg) ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info); else ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset); @@ -553,11 +551,11 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) declare_vb_descriptor_input_sgprs(ctx); /* VGPRs (first GS, then VS/TES) */ - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx01_offset); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx23_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset); + ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); if (ctx->stage == MESA_SHADER_VERTEX) { declare_vs_input_vgprs(ctx, &num_prolog_vgprs); @@ -565,7 +563,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) declare_tes_input_vgprs(ctx); } - if ((ctx->shader->key.as_es || ngg_cull_shader) && + if ((ctx->shader->key.ge.as_es || ngg_cull_shader) && (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) { unsigned num_user_sgprs, num_vgprs; @@ -608,7 +606,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); - if (shader->key.as_es) { + if (shader->key.ge.as_es) { ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); @@ -658,7 +656,7 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) SI_PARAM_LINEAR_CENTER); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, SI_PARAM_LINEAR_CENTROID); - si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); + si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], SI_PARAM_POS_X_FLOAT); si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], @@ -792,14 +790,15 @@ static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *sh unsigned num_lds_symbols = 0; if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && - (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) { + (sel->info.stage == MESA_SHADER_GEOMETRY || + (sel->info.stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) { struct ac_rtld_symbol *sym = 
&lds_symbols[num_lds_symbols++]; sym->name = "esgs_ring"; sym->size = shader->gs_info.esgs_ring_size * 4; sym->align = 64 * 1024; } - if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) { + if (sel->info.stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) { struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; sym->name = "ngg_emit"; sym->size = shader->ngg.ngg_emit_size * 4; @@ -832,7 +831,9 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh { struct ac_rtld_binary rtld; si_shader_binary_open(screen, shader, &rtld); - return rtld.exec_size; + uint64_t size = rtld.exec_size; + ac_rtld_close(&rtld); + return size; } static bool si_get_external_symbol(void *data, const char *name, uint64_t *value) @@ -862,8 +863,8 @@ bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader si_resource_reference(&shader->bo, NULL); shader->bo = si_aligned_buffer_create( &sscreen->b, - (sscreen->info.cpdma_prefetch_writes_memory ? - 0 : SI_RESOURCE_FLAG_READ_ONLY) | SI_RESOURCE_FLAG_DRIVER_INTERNAL, + (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) | + SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); if (!shader->bo) return false; @@ -1064,22 +1065,20 @@ const char *si_get_shader_name(const struct si_shader *shader) { switch (shader->selector->info.stage) { case MESA_SHADER_VERTEX: - if (shader->key.as_es) + if (shader->key.ge.as_es) return "Vertex Shader as ES"; - else if (shader->key.as_ls) + else if (shader->key.ge.as_ls) return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; - else if (shader->key.as_ngg) + else if (shader->key.ge.as_ngg) return "Vertex Shader as ESGS"; else return "Vertex Shader as VS"; case MESA_SHADER_TESS_CTRL: return "Tessellation Control Shader"; case MESA_SHADER_TESS_EVAL: - if (shader->key.as_es) + if (shader->key.ge.as_es) return "Tessellation Evaluation Shader as ES"; - else if (shader->key.as_ngg) + else if (shader->key.ge.as_ngg) return "Tessellation Evaluation Shader as ESGS"; else return "Tessellation Evaluation Shader as VS"; @@ -1143,21 +1142,19 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, si_shader_dump_stats(sscreen, shader, file, check_debug_option); } -static void si_dump_shader_key_vs(const struct si_shader_key *key, +static void si_dump_shader_key_vs(const union si_shader_key *key, const struct si_vs_prolog_bits *prolog, const char *prefix, FILE *f) { fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one); fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix, prolog->instance_divisor_is_fetched); - fprintf(f, " %s.unpack_instance_id_from_vertex_id = %u\n", prefix, - prolog->unpack_instance_id_from_vertex_id); fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); - fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); + fprintf(f, " mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode); fprintf(f, " mono.vs.fix_fetch = {"); for (int i = 0; i < SI_MAX_ATTRIBS; i++) { - union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; + union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i]; if (i) fprintf(f, ", "); if (!fix.bits) @@ -1171,42 +1168,35 @@ static void si_dump_shader_key_vs(const struct si_shader_key *key, static void si_dump_shader_key(const struct si_shader *shader, FILE *f) { - const 
struct si_shader_key *key = &shader->key; + const union si_shader_key *key = &shader->key; gl_shader_stage stage = shader->selector->info.stage; fprintf(f, "SHADER KEY\n"); switch (stage) { case MESA_SHADER_VERTEX: - si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f); - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ls = %u\n", key->as_ls); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed); - fprintf(f, " opt.cs_instancing = %u\n", key->opt.cs_instancing); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back); + si_dump_shader_key_vs(key, &key->ge.part.vs.prolog, "part.vs.prolog", f); + fprintf(f, " as_es = %u\n", key->ge.as_es); + fprintf(f, " as_ls = %u\n", key->ge.as_ls); + fprintf(f, " as_ngg = %u\n", key->ge.as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id); break; case MESA_SHADER_TESS_CTRL: if (shader->selector->screen->info.chip_class >= GFX9) { - si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f); + si_dump_shader_key_vs(key, &key->ge.part.tcs.ls_prolog, "part.tcs.ls_prolog", f); } - fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); + fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->ge.part.tcs.epilog.prim_mode); fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n", - key->mono.u.ff_tcs_inputs_to_copy); - fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono); - fprintf(f, " opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices); + key->ge.mono.u.ff_tcs_inputs_to_copy); + fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono); + fprintf(f, " opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices); break; case MESA_SHADER_TESS_EVAL: - fprintf(f, " as_es = %u\n", key->as_es); - fprintf(f, " as_ngg = %u\n", key->as_ngg); - fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + fprintf(f, " as_es = %u\n", key->ge.as_es); + fprintf(f, " as_ngg = %u\n", key->ge.as_ngg); + fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id); break; case MESA_SHADER_GEOMETRY: @@ -1214,50 +1204,49 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) break; if (shader->selector->screen->info.chip_class >= GFX9 && - key->part.gs.es->info.stage == MESA_SHADER_VERTEX) { - si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f); + key->ge.part.gs.es->info.stage == MESA_SHADER_VERTEX) { + si_dump_shader_key_vs(key, &key->ge.part.gs.vs_prolog, "part.gs.vs_prolog", f); } - fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", - key->part.gs.prolog.tri_strip_adj_fix); - fprintf(f, " as_ngg = %u\n", key->as_ngg); + fprintf(f, " mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix); + fprintf(f, " as_ngg = %u\n", key->ge.as_ngg); break; case MESA_SHADER_COMPUTE: break; case MESA_SHADER_FRAGMENT: - fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); - fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", 
key->part.ps.prolog.flatshade_colors); - fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); - fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", - key->part.ps.prolog.force_persp_sample_interp); - fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", - key->part.ps.prolog.force_linear_sample_interp); - fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", - key->part.ps.prolog.force_persp_center_interp); - fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", - key->part.ps.prolog.force_linear_center_interp); - fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", - key->part.ps.prolog.bc_optimize_for_persp); - fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", - key->part.ps.prolog.bc_optimize_for_linear); - fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", - key->part.ps.prolog.samplemask_log_ps_iter); - fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", - key->part.ps.epilog.spi_shader_col_format); - fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); - fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); - fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); - fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); - fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); - fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", - key->part.ps.epilog.poly_line_smoothing); - fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); - fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", - key->mono.u.ps.interpolate_at_sample_force_center); - fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); - fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); - fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); + fprintf(f, " prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side); + fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors); + fprintf(f, " prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple); + fprintf(f, " prolog.force_persp_sample_interp = %u\n", + key->ps.part.prolog.force_persp_sample_interp); + fprintf(f, " prolog.force_linear_sample_interp = %u\n", + key->ps.part.prolog.force_linear_sample_interp); + fprintf(f, " prolog.force_persp_center_interp = %u\n", + key->ps.part.prolog.force_persp_center_interp); + fprintf(f, " prolog.force_linear_center_interp = %u\n", + key->ps.part.prolog.force_linear_center_interp); + fprintf(f, " prolog.bc_optimize_for_persp = %u\n", + key->ps.part.prolog.bc_optimize_for_persp); + fprintf(f, " prolog.bc_optimize_for_linear = %u\n", + key->ps.part.prolog.bc_optimize_for_linear); + fprintf(f, " prolog.samplemask_log_ps_iter = %u\n", + key->ps.part.prolog.samplemask_log_ps_iter); + fprintf(f, " epilog.spi_shader_col_format = 0x%x\n", + key->ps.part.epilog.spi_shader_col_format); + fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8); + fprintf(f, " epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10); + fprintf(f, " epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf); + fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func); + fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one); + fprintf(f, " epilog.poly_line_smoothing = %u\n", + 
key->ps.part.epilog.poly_line_smoothing); + fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color); + fprintf(f, " mono.interpolate_at_sample_force_center = %u\n", + key->ps.mono.interpolate_at_sample_force_center); + fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa); + fprintf(f, " mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D); + fprintf(f, " mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered); break; default: @@ -1266,35 +1255,44 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_VERTEX) && - !key->as_es && !key->as_ls) { - fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs); - fprintf(f, " opt.kill_pointsize = 0x%x\n", key->opt.kill_pointsize); - fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances); + !key->ge.as_es && !key->ge.as_ls) { + fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs); + fprintf(f, " opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize); + fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); if (stage != MESA_SHADER_GEOMETRY) - fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); + fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); } - fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono); - fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n", - key->opt.inline_uniforms, - key->opt.inlined_uniform_values[0], - key->opt.inlined_uniform_values[1], - key->opt.inlined_uniform_values[2], - key->opt.inlined_uniform_values[3]); + if (stage <= MESA_SHADER_GEOMETRY) { + fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono); + fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n", + key->ge.opt.inline_uniforms, + key->ge.opt.inlined_uniform_values[0], + key->ge.opt.inlined_uniform_values[1], + key->ge.opt.inlined_uniform_values[2], + key->ge.opt.inlined_uniform_values[3]); + } else { + fprintf(f, " opt.prefer_mono = %u\n", key->ps.opt.prefer_mono); + fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n", + key->ps.opt.inline_uniforms, + key->ps.opt.inlined_uniform_values[0], + key->ps.opt.inlined_uniform_values[1], + key->ps.opt.inlined_uniform_values[2], + key->ps.opt.inlined_uniform_values[3]); + } } bool si_vs_needs_prolog(const struct si_shader_selector *sel, const struct si_vs_prolog_bits *prolog_key, - const struct si_shader_key *key, bool ngg_cull_shader) + const union si_shader_key *key, bool ngg_cull_shader) { + assert(sel->info.stage == MESA_SHADER_VERTEX); + /* VGPR initialization fixup for Vega10 and Raven is always done in the * VS prolog. */ return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || - prolog_key->unpack_instance_id_from_vertex_id || /* The 2nd VS prolog loads input VGPRs from LDS */ - (key->opt.ngg_culling && !ngg_cull_shader) || - /* The 1st VS prolog generates input VGPRs for fast launch. 
*/ - (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); + (key->ge.opt.ngg_culling && !ngg_cull_shader); } /** @@ -1316,21 +1314,12 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ key->vs_prolog.states = *prolog_key; key->vs_prolog.num_input_sgprs = num_input_sgprs; key->vs_prolog.num_inputs = info->num_inputs; - key->vs_prolog.as_ls = shader_out->key.as_ls; - key->vs_prolog.as_es = shader_out->key.as_es; - key->vs_prolog.as_ngg = shader_out->key.as_ngg; - key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; + key->vs_prolog.as_ls = shader_out->key.ge.as_ls; + key->vs_prolog.as_es = shader_out->key.ge.as_es; + key->vs_prolog.as_ngg = shader_out->key.ge.as_ngg; - if (ngg_cull_shader) { - key->vs_prolog.gs_fast_launch_tri_list = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST); - key->vs_prolog.gs_fast_launch_tri_strip = - !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP); - key->vs_prolog.gs_fast_launch_index_size_packed = - SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling); - } else if (shader_out->key.opt.ngg_culling) { + if (!ngg_cull_shader && shader_out->key.ge.opt.ngg_culling) key->vs_prolog.load_vgprs_after_culling = 1; - } if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { key->vs_prolog.as_ls = 1; @@ -1338,14 +1327,13 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) { key->vs_prolog.as_es = 1; key->vs_prolog.num_merged_next_stage_vgprs = 5; - } else if (shader_out->key.as_ngg) { + } else if (shader_out->key.ge.as_ngg) { key->vs_prolog.num_merged_next_stage_vgprs = 5; } /* Only one of these combinations can be set. as_ngg can be set with as_es. */ assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <= - 1); + (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1); /* Enable loading the InstanceID VGPR. */ uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); @@ -1357,7 +1345,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ } struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, - const struct si_shader_key *key, + const union si_shader_key *key, bool *free_nir) { nir_shader *nir; @@ -1378,7 +1366,12 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, return NULL; } - if (key && key->opt.inline_uniforms) { + bool inline_uniforms = false; + uint32_t *inlined_uniform_values; + si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type, + &inline_uniforms, &inlined_uniform_values); + + if (inline_uniforms) { assert(*free_nir); /* Most places use shader information from the default variant, not @@ -1422,7 +1415,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, */ NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms, - key->opt.inlined_uniform_values, + inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets); si_nir_opts(sel->screen, nir, true); @@ -1451,8 +1444,10 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi si_dump_streamout(&sel->so); } - memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED, - sizeof(shader->info.vs_output_param_offset)); + /* Initialize vs_output_ps_input_cntl to default. 
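+ * Every slot starts out as SI_PS_INPUT_CNTL_UNUSED; COL0 gets the
+ * separate SI_PS_INPUT_CNTL_UNUSED_COLOR0 value so a PS reading an
+ * unwritten COL0 still sees a defined default. Slots the shader
+ * actually exports are overwritten after compilation, e.g. an output
+ * placed at param offset 3 becomes S_028644_OFFSET(3).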
*/ + for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++) + shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED; + shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0; shader->info.uses_instanceid = sel->info.uses_instanceid; @@ -1463,6 +1458,43 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) return false; + /* Compute vs_output_ps_input_cntl. */ + if ((sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) && + !shader->key.ge.as_ls && !shader->key.ge.as_es) { + ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; + + if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) + vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; + + /* VS and TES should also set primitive ID output if it's used. */ + unsigned num_outputs_with_prim_id = sel->info.num_outputs + + shader->key.ge.mono.u.vs_export_prim_id; + + for (unsigned i = 0; i < num_outputs_with_prim_id; i++) { + unsigned semantic = sel->info.output_semantic[i]; + unsigned offset = vs_output_param_offset[i]; + unsigned ps_input_cntl; + + if (offset <= AC_EXP_PARAM_OFFSET_31) { + /* The input is loaded from parameter memory. */ + ps_input_cntl = S_028644_OFFSET(offset); + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && + offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); + offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; + + /* OFFSET=0x20 means that DEFAULT_VAL is used. */ + ps_input_cntl = S_028644_OFFSET(0x20) | + S_028644_DEFAULT_VAL(offset); + } + + shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl; + } + } + /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */ if (sel->info.stage == MESA_SHADER_COMPUTE) { unsigned wave_size = sscreen->compute_wave_size; @@ -1552,39 +1584,34 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, switch (stage) { case MESA_SHADER_VERTEX: - shader.key.as_ls = key->vs_prolog.as_ls; - shader.key.as_es = key->vs_prolog.as_es; - shader.key.as_ngg = key->vs_prolog.as_ngg; - shader.key.opt.ngg_culling = - (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | - (key->vs_prolog.gs_fast_launch_tri_strip ? 
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | - SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); - shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; + shader.key.ge.as_ls = key->vs_prolog.as_ls; + shader.key.ge.as_es = key->vs_prolog.as_es; + shader.key.ge.as_ngg = key->vs_prolog.as_ngg; break; case MESA_SHADER_TESS_CTRL: assert(!prolog); - shader.key.part.tcs.epilog = key->tcs_epilog.states; - break; - case MESA_SHADER_GEOMETRY: - assert(prolog); - shader.key.as_ngg = key->gs_prolog.as_ngg; + shader.key.ge.part.tcs.epilog = key->tcs_epilog.states; break; case MESA_SHADER_FRAGMENT: if (prolog) - shader.key.part.ps.prolog = key->ps_prolog.states; + shader.key.ps.part.prolog = key->ps_prolog.states; else - shader.key.part.ps.epilog = key->ps_epilog.states; + shader.key.ps.part.epilog = key->ps_epilog.states; break; default: unreachable("bad shader part"); } + unsigned wave_size; + if (stage <= MESA_SHADER_GEOMETRY) { + wave_size = si_get_wave_size(sscreen, stage, shader.key.ge.as_ngg, shader.key.ge.as_es); + } else { + wave_size = si_get_wave_size(sscreen, stage, false, false); + } + struct si_shader_context ctx; - si_llvm_context_init(&ctx, sscreen, compiler, - si_get_wave_size(sscreen, stage, - shader.key.as_ngg, shader.key.as_es, - shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader.key.opt.vs_as_prim_discard_cs)); + si_llvm_context_init(&ctx, sscreen, compiler, wave_size); + ctx.shader = &shader; ctx.stage = stage; @@ -1635,7 +1662,7 @@ static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, struct pipe_debug_callback *debug) { - return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog); + return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.ge.part.vs.prolog); } /** @@ -1645,10 +1672,10 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm struct si_shader *shader, struct pipe_debug_callback *debug) { if (sscreen->info.chip_class >= GFX9) { - struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls; + struct si_shader *ls_main_part = shader->key.ge.part.tcs.ls->main_shader_part_ls; if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, - &shader->key.part.tcs.ls_prolog)) + &shader->key.ge.part.tcs.ls_prolog)) return false; shader->previous_stage = ls_main_part; @@ -1657,7 +1684,7 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm /* Get the epilog. 
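 * The epilog is fetched through the per-screen sscreen->tcs_epilogs
 * cache keyed on the epilog state bits, so TCS variants with identical
 * epilog state can share a single compiled part.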
*/ union si_shader_part_key epilog_key; memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + epilog_key.tcs_epilog.states = shader->key.ge.part.tcs.epilog; shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false, &epilog_key, compiler, debug, si_llvm_build_tcs_epilog, @@ -1674,31 +1701,20 @@ static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_ if (sscreen->info.chip_class >= GFX9) { struct si_shader *es_main_part; - if (shader->key.as_ngg) - es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es; + if (shader->key.ge.as_ngg) + es_main_part = shader->key.ge.part.gs.es->main_shader_part_ngg_es; else - es_main_part = shader->key.part.gs.es->main_shader_part_es; + es_main_part = shader->key.ge.part.gs.es->main_shader_part_es; - if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX && + if (shader->key.ge.part.gs.es->info.stage == MESA_SHADER_VERTEX && !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, - &shader->key.part.gs.vs_prolog)) + &shader->key.ge.part.gs.vs_prolog)) return false; shader->previous_stage = es_main_part; } - if (!shader->key.part.gs.prolog.tri_strip_adj_fix) - return true; - - union si_shader_part_key prolog_key; - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - - shader->prolog2 = - si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key, - compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog"); - return shader->prolog2 != NULL; + return true; } /** @@ -1711,7 +1727,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke struct si_shader_info *info = &shader->selector->info; memset(key, 0, sizeof(*key)); - key->ps_prolog.states = shader->key.part.ps.prolog; + key->ps_prolog.states = shader->key.ps.part.prolog; key->ps_prolog.colors_read = info->colors_read; key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; @@ -1727,7 +1743,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke if (info->colors_read) { ubyte *color = shader->selector->color_attr_index; - if (shader->key.part.ps.prolog.color_two_side) { + if (shader->key.ps.part.prolog.color_two_side) { /* BCOLORs are stored after the last input. */ key->ps_prolog.num_interp_inputs = info->num_inputs; key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; @@ -1744,7 +1760,7 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke key->ps_prolog.color_attr_index[i] = color[i]; - if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR) + if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR) interp = INTERP_MODE_FLAT; switch (interp) { @@ -1754,9 +1770,9 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke case INTERP_MODE_SMOOTH: case INTERP_MODE_COLOR: /* Force the interpolation location for colors here. 
*/ - if (shader->key.part.ps.prolog.force_persp_sample_interp) + if (shader->key.ps.part.prolog.force_persp_sample_interp) location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_persp_center_interp) + if (shader->key.ps.part.prolog.force_persp_center_interp) location = TGSI_INTERPOLATE_LOC_CENTER; switch (location) { @@ -1784,9 +1800,9 @@ void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *ke break; case INTERP_MODE_NOPERSPECTIVE: /* Force the interpolation location for colors here. */ - if (shader->key.part.ps.prolog.force_linear_sample_interp) + if (shader->key.ps.part.prolog.force_linear_sample_interp) location = TGSI_INTERPOLATE_LOC_SAMPLE; - if (shader->key.part.ps.prolog.force_linear_center_interp) + if (shader->key.ps.part.prolog.force_linear_center_interp) location = TGSI_INTERPOLATE_LOC_CENTER; /* The VGPR assignment for non-monolithic shaders @@ -1850,7 +1866,7 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke key->ps_epilog.writes_z = info->writes_z; key->ps_epilog.writes_stencil = info->writes_stencil; key->ps_epilog.writes_samplemask = info->writes_samplemask; - key->ps_epilog.states = shader->key.part.ps.epilog; + key->ps_epilog.states = shader->key.ps.part.epilog; } /** @@ -1884,34 +1900,34 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_ return false; /* Enable POS_FIXED_PT if polygon stippling is enabled. */ - if (shader->key.part.ps.prolog.poly_stipple) { + if (shader->key.ps.part.prolog.poly_stipple) { shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); } /* Set up the enable bits for per-sample shading if needed. */ - if (shader->key.part.ps.prolog.force_persp_sample_interp && + if (shader->key.ps.part.prolog.force_persp_sample_interp && (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); } - if (shader->key.part.ps.prolog.force_linear_sample_interp && + if (shader->key.ps.part.prolog.force_linear_sample_interp && (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); } - if (shader->key.part.ps.prolog.force_persp_center_interp && + if (shader->key.ps.part.prolog.force_persp_center_interp && (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); } - if (shader->key.part.ps.prolog.force_linear_center_interp && + if (shader->key.ps.part.prolog.force_linear_center_interp && (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; @@ -1933,7 +1949,7 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_ } /* Samplemask fixup 
requires the sample ID. */ - if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { + if (shader->key.ps.part.prolog.samplemask_log_ps_iter) { shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); } @@ -1941,7 +1957,7 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_ /* The sample mask input is always enabled, because the API shader always * passes it through to the epilog. Disable it here if it's unused. */ - if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) + if (!shader->key.ps.part.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; return true; @@ -2022,8 +2038,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler shader->info.num_input_vgprs = mainp->info.num_input_vgprs; shader->info.face_vgpr_index = mainp->info.face_vgpr_index; shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; - memcpy(shader->info.vs_output_param_offset, mainp->info.vs_output_param_offset, - sizeof(mainp->info.vs_output_param_offset)); + memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, + sizeof(mainp->info.vs_output_ps_input_cntl)); shader->info.uses_instanceid = mainp->info.uses_instanceid; shader->info.nr_pos_exports = mainp->info.nr_pos_exports; shader->info.nr_param_exports = mainp->info.nr_param_exports; @@ -2094,8 +2110,8 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler si_calculate_max_simd_waves(shader); } - if (shader->key.as_ngg) { - assert(!shader->key.as_es && !shader->key.as_ls); + if (sel->info.stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) { + assert(!shader->key.ge.as_es && !shader->key.ge.as_ls); if (!gfx10_ngg_calculate_subgroup_info(shader)) { fprintf(stderr, "Failed to compute subgroup info\n"); return false; @@ -2111,9 +2127,7 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) || (sel->info.stage == MESA_SHADER_VERTEX && /* Used to export PrimitiveID from the correct vertex. */ - (shader->key.mono.u.vs_export_prim_id || - /* Used to generate triangle strip vertex IDs for all threads. */ - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP))); + shader->key.ge.mono.u.vs_export_prim_id)); shader->uses_vs_state_outprim = sscreen->use_ngg && /* Only used by streamout in vertex shaders. 
*/ @@ -2122,18 +2136,18 @@ bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler if (sel->info.stage == MESA_SHADER_VERTEX) { shader->uses_base_instance = sel->info.uses_base_instance || - shader->key.part.vs.prolog.instance_divisor_is_one || - shader->key.part.vs.prolog.instance_divisor_is_fetched; + shader->key.ge.part.vs.prolog.instance_divisor_is_one || + shader->key.ge.part.vs.prolog.instance_divisor_is_fetched; } else if (sel->info.stage == MESA_SHADER_TESS_CTRL) { shader->uses_base_instance = shader->previous_stage_sel && (shader->previous_stage_sel->info.uses_base_instance || - shader->key.part.tcs.ls_prolog.instance_divisor_is_one || - shader->key.part.tcs.ls_prolog.instance_divisor_is_fetched); + shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_one || + shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_fetched); } else if (sel->info.stage == MESA_SHADER_GEOMETRY) { shader->uses_base_instance = shader->previous_stage_sel && (shader->previous_stage_sel->info.uses_base_instance || - shader->key.part.gs.vs_prolog.instance_divisor_is_one || - shader->key.part.gs.vs_prolog.instance_divisor_is_fetched); + shader->key.ge.part.gs.vs_prolog.instance_divisor_is_one || + shader->key.ge.part.gs.vs_prolog.instance_divisor_is_fetched); } si_fix_resource_usage(sscreen, shader); diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.h index 04a831665a..118c37ee5a 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader.h @@ -138,6 +138,7 @@ #include "util/u_inlines.h" #include "util/u_live_shader_cache.h" #include "util/u_queue.h" +#include "si_pm4.h" #include <stdio.h> @@ -158,6 +159,12 @@ struct si_context; #define SI_NGG_PRIM_EDGE_FLAG_BITS ((1 << 9) | (1 << 19) | (1 << 29)) +#define SI_PS_INPUT_CNTL_0000 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(0)) +#define SI_PS_INPUT_CNTL_0001 (S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(3)) +#define SI_PS_INPUT_CNTL_UNUSED SI_PS_INPUT_CNTL_0000 +/* D3D9 behaviour for COLOR0 requires 0001. GL is undefined. */ +#define SI_PS_INPUT_CNTL_UNUSED_COLOR0 SI_PS_INPUT_CNTL_0001 + /* SGPR user data indices */ enum { @@ -272,14 +279,10 @@ enum SI_VS_BLIT_SGPRS_POS_TEXCOORD = 9, }; -#define SI_NGG_CULL_VIEW_SMALLPRIMS (1 << 0) /* view.xy + small prims */ +#define SI_NGG_CULL_ENABLED (1 << 0) /* this implies W, view.xy, and small prim culling */ #define SI_NGG_CULL_BACK_FACE (1 << 1) /* back faces */ #define SI_NGG_CULL_FRONT_FACE (1 << 2) /* front faces */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST (1 << 3) /* GS fast launch: triangles */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP (1 << 4) /* GS fast launch: triangle strip */ -#define SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) & 0x3) << 5) /* 0->0, 1->1, 2->2, 3->4 */ -#define SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(x) (((x) >> 5) & 0x3) -#define SI_NGG_CULL_GS_FAST_LAUNCH_ALL (0xf << 3) /* GS fast launch (both prim types) */ +#define SI_NGG_CULL_LINES (1 << 3) /* the primitive type is lines */ /** * For VS shader keys, describe any fixups required for vertex fetch.
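
With the GS fast launch bits removed, the ngg_culling mask shrinks to the four SI_NGG_CULL_* flags defined just above. As a rough sketch of how state code might assemble the mask, using those flags (the helper and its rasterizer-state parameters are illustrative, not from this patch):

#include <stdbool.h>

/* Illustrative only: compose an SI_NGG_CULL_* mask from rasterizer state. */
static unsigned si_get_ngg_cull_mask(bool is_lines, bool cull_front, bool cull_back)
{
   unsigned mask = SI_NGG_CULL_ENABLED; /* implies W, view.xy, small-prim culling */

   if (is_lines) {
      mask |= SI_NGG_CULL_LINES;        /* face culling doesn't apply to lines */
   } else {
      if (cull_front)
         mask |= SI_NGG_CULL_FRONT_FACE;
      if (cull_back)
         mask |= SI_NGG_CULL_BACK_FACE;
   }
   return mask;
}
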
@@ -323,6 +326,16 @@ enum si_color_output_type { SI_TYPE_UINT16, }; +union si_input_info { + struct { + ubyte semantic; + ubyte interpolate; + ubyte fp16_lo_hi_valid; + ubyte usage_mask; + }; + uint32_t _unused; /* this just forces 4-byte alignment */ +}; + struct si_shader_info { shader_info base; @@ -330,12 +343,8 @@ struct si_shader_info { ubyte num_inputs; ubyte num_outputs; - ubyte input_semantic[PIPE_MAX_SHADER_INPUTS]; - ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; - ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; - ubyte input_fp16_lo_hi_valid[PIPE_MAX_SHADER_INPUTS]; + union si_input_info input[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic[PIPE_MAX_SHADER_OUTPUTS]; - char output_semantic_to_slot[VARYING_SLOT_VAR15_16BIT + 1]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; @@ -446,7 +455,6 @@ struct si_shader_selector { ubyte const_and_shader_buf_descriptors_index; ubyte sampler_and_images_descriptors_index; bool vs_needs_prolog; - bool prim_discard_cs_allowed; ubyte cs_shaderbufs_sgpr_index; ubyte cs_num_shaderbufs_in_user_sgprs; ubyte cs_images_sgpr_index; @@ -454,7 +462,6 @@ struct si_shader_selector { ubyte cs_num_images_in_user_sgprs; ubyte num_vs_inputs; ubyte num_vbos_in_user_sgprs; - unsigned pa_cl_vs_out_cntl; unsigned ngg_cull_vert_threshold; /* UINT32_MAX = disabled */ ubyte clipdist_mask; ubyte culldist_mask; @@ -528,7 +535,6 @@ struct si_vs_prolog_bits { uint16_t instance_divisor_is_one; /* bitmask of inputs */ uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ unsigned ls_vgpr_fix : 1; - unsigned unpack_instance_id_from_vertex_id : 1; }; /* Common TCS bits between the shader key and the epilog key. */ @@ -538,10 +544,6 @@ struct si_tcs_epilog_bits { unsigned tes_reads_tess_factors : 1; }; -struct si_gs_prolog_bits { - unsigned tri_strip_adj_fix : 1; -}; - /* Common PS bits between the shader key and the prolog key. */ struct si_ps_prolog_bits { unsigned color_two_side : 1; @@ -578,10 +580,6 @@ union si_shader_part_key { unsigned as_ls : 1; unsigned as_es : 1; unsigned as_ngg : 1; - unsigned as_prim_discard_cs : 1; - unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ - unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ - unsigned gs_fast_launch_index_size_packed : 2; unsigned load_vgprs_after_culling : 1; /* Prologs for monolithic shaders shouldn't set EXEC. */ unsigned is_monolithic : 1; @@ -589,10 +587,6 @@ union si_shader_part_key { struct { struct si_tcs_epilog_bits states; } tcs_epilog; - struct { - struct si_gs_prolog_bits states; - unsigned as_ngg : 1; - } gs_prolog; struct { struct si_ps_prolog_bits states; unsigned num_input_sgprs : 6; @@ -616,7 +610,8 @@ union si_shader_part_key { } ps_epilog; }; -struct si_shader_key { +/* The shader key for geometry stages (VS, TCS, TES, GS) */ +struct si_shader_key_ge { /* Prolog and epilog flags. */ union { struct { @@ -630,20 +625,16 @@ struct si_shader_key { struct { struct si_vs_prolog_bits vs_prolog; /* for merged ES-GS */ struct si_shader_selector *es; /* for merged ES-GS */ - struct si_gs_prolog_bits prolog; } gs; - struct { - struct si_ps_prolog_bits prolog; - struct si_ps_epilog_bits epilog; - } ps; } part; /* These three are initially set according to the NEXT_SHADER property, * or guessed if the property doesn't seem correct. 
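
The vs_output_ps_input_cntl loop added to si_compile_shader earlier in this patch, together with the SI_PS_INPUT_CNTL_* defaults above, is the heart of the change: PS input configuration is now precomputed per VS output. A standalone sketch of the same encoding follows; the S_028644_* field layout (OFFSET in bits [5:0], DEFAULT_VAL in bits [9:8]) and the AC_EXP_PARAM_* values are assumptions standing in for the real register and ac_exp_param.h definitions:

#include <assert.h>
#include <stdint.h>

/* Assumed SPI_PS_INPUT_CNTL_n field layout for this sketch. */
#define S_028644_OFFSET(x)      (((uint32_t)(x) & 0x3f) << 0)
#define S_028644_DEFAULT_VAL(x) (((uint32_t)(x) & 0x3) << 8)

/* Assumed ac_exp_param encodings: offsets 0..31 address parameter memory;
 * four consecutive values past that select a hardware constant. */
enum {
   AC_EXP_PARAM_OFFSET_31 = 31,
   AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
   AC_EXP_PARAM_DEFAULT_VAL_1111 = 67,
};

static uint32_t ps_input_cntl_from_offset(unsigned offset)
{
   if (offset <= AC_EXP_PARAM_OFFSET_31)
      return S_028644_OFFSET(offset); /* loaded from parameter memory */

   /* OFFSET=0x20 makes the SPI substitute DEFAULT_VAL instead. */
   assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
          offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
   return S_028644_OFFSET(0x20) |
          S_028644_DEFAULT_VAL(offset - AC_EXP_PARAM_DEFAULT_VAL_0000);
}
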
*/ - unsigned as_es : 1; /* export shader, which precedes GS */ - unsigned as_ls : 1; /* local shader, which precedes TCS */ - unsigned as_ngg : 1; /* VS, TES, or GS compiled as NGG primitive shader */ + unsigned as_es : 1; /* whether it's a shader before GS */ + unsigned as_ls : 1; /* whether it's VS before TCS */ + unsigned as_ngg : 1; /* whether it's the last GE stage and NGG is enabled, + also set for the stage right before GS */ /* Flags for monolithic compilation only. */ struct { @@ -654,15 +645,10 @@ struct si_shader_key { union si_vs_fix_fetch vs_fix_fetch[SI_MAX_ATTRIBS]; union { - uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ + uint64_t ff_tcs_inputs_to_copy; /* fixed-func TCS only */ /* When PS needs PrimID and GS is disabled. */ - unsigned vs_export_prim_id : 1; - struct { - unsigned interpolate_at_sample_force_center : 1; - unsigned fbfetch_msaa : 1; - unsigned fbfetch_is_1D : 1; - unsigned fbfetch_layered : 1; - } ps; + unsigned vs_export_prim_id : 1; /* VS and TES only */ + unsigned gs_tri_strip_adj_fix : 1; /* GS only */ } u; } mono; @@ -674,7 +660,7 @@ struct si_shader_key { unsigned kill_pointsize : 1; /* For NGG VS and TES. */ - unsigned ngg_culling : 7; /* SI_NGG_CULL_* */ + unsigned ngg_culling : 4; /* SI_NGG_CULL_* */ /* For shaders where monolithic variants have better code. * @@ -684,15 +670,6 @@ struct si_shader_key { */ unsigned prefer_mono : 1; - /* Primitive discard compute shader. */ - unsigned vs_as_prim_discard_cs : 1; - unsigned cs_prim_type : 4; - unsigned cs_indexed : 1; - unsigned cs_instancing : 1; - unsigned cs_provoking_vertex_first : 1; - unsigned cs_cull_front : 1; - unsigned cs_cull_back : 1; - /* VS and TCS have the same number of patch vertices. */ unsigned same_patch_vertices:1; @@ -705,12 +682,51 @@ struct si_shader_key { } opt; }; +struct si_shader_key_ps { + struct { + /* Prolog and epilog flags. */ + struct si_ps_prolog_bits prolog; + struct si_ps_epilog_bits epilog; + } part; + + /* Flags for monolithic compilation only. */ + struct { + unsigned interpolate_at_sample_force_center : 1; + unsigned fbfetch_msaa : 1; + unsigned fbfetch_is_1D : 1; + unsigned fbfetch_layered : 1; + } mono; + + /* Optimization flags for asynchronous compilation only. */ + struct { + /* For shaders where monolithic variants have better code. + * + * This is a flag that has no effect on code generation, + * but forces monolithic shaders to be used as soon as + * possible, because it's in the "opt" group. + */ + unsigned prefer_mono : 1; + unsigned inline_uniforms:1; + + /* This must be kept last to limit the number of variants + * depending only on the uniform values. + */ + uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS]; + } opt; +}; + +union si_shader_key { + struct si_shader_key_ge ge; /* geometry engine shaders */ + struct si_shader_key_ps ps; +}; + /* Restore the pack alignment to default. */ #pragma pack(pop) /* GCN-specific shader info. */ struct si_shader_binary_info { ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; + uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS]; ubyte num_input_sgprs; ubyte num_input_vgprs; signed char face_vgpr_index; @@ -740,7 +756,35 @@ struct gfx9_gs_info { unsigned esgs_ring_size; /* in bytes */ }; +#define SI_NUM_VGT_STAGES_KEY_BITS 5 +#define SI_NUM_VGT_STAGES_STATES (1 << SI_NUM_VGT_STAGES_KEY_BITS) + +/* The VGT_SHADER_STAGES key used to index the table of precomputed values. + * Some fields are set by state-change calls, most are set by draw_vbo. 
+ */ +union si_vgt_stages_key { + struct { +#if UTIL_ARCH_LITTLE_ENDIAN + uint8_t tess : 1; + uint8_t gs : 1; + uint8_t ngg_passthrough : 1; + uint8_t ngg : 1; /* gfx10+ */ + uint8_t streamout : 1; /* only used with NGG */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; +#else /* UTIL_ARCH_BIG_ENDIAN */ + uint8_t _pad : 8 - SI_NUM_VGT_STAGES_KEY_BITS; + uint8_t streamout : 1; + uint8_t ngg : 1; + uint8_t ngg_passthrough : 1; + uint8_t gs : 1; + uint8_t tess : 1; +#endif + } u; + uint8_t index; +}; + struct si_shader { + struct si_pm4_state pm4; /* base class */ struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; @@ -752,10 +796,9 @@ struct si_shader { struct si_shader_part *prolog2; struct si_shader_part *epilog; - struct si_pm4_state *pm4; struct si_resource *bo; struct si_resource *scratch_bo; - struct si_shader_key key; + union si_shader_key key; struct util_queue_fence ready; bool compilation_failed; bool is_monolithic; @@ -807,6 +850,8 @@ struct si_shader { unsigned vgt_gs_onchip_cntl; unsigned vgt_gs_max_prims_per_subgroup; unsigned vgt_esgs_ring_itemsize; + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; } gs; struct { @@ -823,6 +868,9 @@ struct si_shader { unsigned pa_cl_ngg_cntl; unsigned vgt_gs_max_vert_out; /* for API GS */ unsigned ge_pc_alloc; /* uconfig register */ + unsigned spi_shader_pgm_rsrc3_gs; + unsigned spi_shader_pgm_rsrc4_gs; + union si_vgt_stages_key vgt_stages; } ngg; struct { @@ -843,6 +891,7 @@ struct si_shader { unsigned spi_shader_z_format; unsigned spi_shader_col_format; unsigned cb_shader_mask; + unsigned num_interp; } ps; } ctx_reg; @@ -888,38 +937,32 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info); void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first); void si_nir_late_opts(nir_shader *nir); -void si_finalize_nir(struct pipe_screen *screen, void *nirptr); +char *si_finalize_nir(struct pipe_screen *screen, void *nirptr); -/* si_state_shaders.c */ +/* si_state_shaders.cpp */ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, struct gfx9_gs_info *out); +bool gfx10_is_ngg_passthrough(struct si_shader *shader); /* Inline helpers. */ /* Return the pointer to the main shader part's pointer. 
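
The index overlay on union si_vgt_stages_key above exists so the five stage bits can address a table of precomputed register values directly. A sketch of the intended lookup pattern (the table and simplified little-endian-only union are hypothetical; the real union above also handles big-endian layouts):

#include <stdbool.h>
#include <stdint.h>

/* Stand-in for si_vgt_stages_key: the bitfield view and the byte view
 * alias, so the filled-in stage bits double as a table index. */
union vgt_stages_key {
   struct { uint8_t tess : 1, gs : 1, ngg_passthrough : 1, ngg : 1, streamout : 1; } u;
   uint8_t index;
};

#define NUM_VGT_STAGES_STATES (1 << 5)
static const uint32_t vgt_shader_stages_en[NUM_VGT_STAGES_STATES]; /* precomputed placeholder */

static uint32_t lookup_vgt_shader_stages(bool tess, bool gs, bool ngg)
{
   union vgt_stages_key key = {{0}};
   key.u.tess = tess;
   key.u.gs = gs;
   key.u.ngg = ngg;
   return vgt_shader_stages_en[key.index]; /* one of 32 possible states */
}
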
*/ static inline struct si_shader **si_get_main_shader_part(struct si_shader_selector *sel, - struct si_shader_key *key) + const union si_shader_key *key) { - if (key->as_ls) - return &sel->main_shader_part_ls; - if (key->as_es && key->as_ngg) - return &sel->main_shader_part_ngg_es; - if (key->as_es) - return &sel->main_shader_part_es; - if (key->as_ngg) - return &sel->main_shader_part_ngg; + if (sel->info.stage <= MESA_SHADER_GEOMETRY) { + if (key->ge.as_ls) + return &sel->main_shader_part_ls; + if (key->ge.as_es && key->ge.as_ngg) + return &sel->main_shader_part_ngg_es; + if (key->ge.as_es) + return &sel->main_shader_part_es; + if (key->ge.as_ngg) + return &sel->main_shader_part_ngg; + } return &sel->main_shader_part; } -static inline bool gfx10_is_ngg_passthrough(struct si_shader *shader) -{ - struct si_shader_selector *sel = shader->selector; - - return sel->info.stage != MESA_SHADER_GEOMETRY && !sel->so.num_outputs && !sel->info.writes_edgeflag && - !shader->key.opt.ngg_culling && - (sel->info.stage != MESA_SHADER_VERTEX || !shader->key.mono.u.vs_export_prim_id); -} - static inline bool si_shader_uses_bindless_samplers(struct si_shader_selector *selector) { return selector ? selector->info.uses_bindless_samplers : false; @@ -930,6 +973,22 @@ static inline bool si_shader_uses_bindless_images(struct si_shader_selector *sel return selector ? selector->info.uses_bindless_images : false; } +static inline bool gfx10_edgeflags_have_effect(struct si_shader *shader) +{ + if (shader->selector->info.stage == MESA_SHADER_VERTEX && + !shader->selector->info.base.vs.blit_sgprs_amd && + !(shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES)) + return true; + + return false; +} + +static inline bool gfx10_ngg_writes_user_edgeflags(struct si_shader *shader) +{ + return gfx10_edgeflags_have_effect(shader) && + shader->selector->info.writes_edgeflag; +} + #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_internal.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_internal.h index 46d8e69b98..b99ded02a0 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -30,8 +30,6 @@ struct pipe_debug_callback; -#define RADEON_LLVM_MAX_INPUTS 32 * 4 - /* Ideally pass the sample mask input to the PS epilog as v14, which * is its usual location, so that the shader doesn't have to add v_mov. */ @@ -60,8 +58,6 @@ struct si_shader_context { struct ac_shader_args args; struct ac_shader_abi abi; - LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; - LLVMBasicBlockRef merged_wrap_if_entry_block; int merged_wrap_if_label; @@ -134,10 +130,6 @@ struct si_shader_context { /* API TES */ struct ac_arg tes_offchip_addr; - /* API GS */ - struct ac_arg gs_vtx01_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx23_offset; /* in dwords (GFX9) */ - struct ac_arg gs_vtx45_offset; /* in dwords (GFX9) */ /* PS */ struct ac_arg pos_fixed_pt; /* CS */ @@ -148,6 +140,9 @@ struct si_shader_context { struct ac_llvm_compiler *compiler; + /* GS vertex offsets unpacked with the gfx6-9 tristrip_adj bug workaround. */ + LLVMValueRef gs_vtx_offset[6]; + /* Preloaded descriptors. 
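
si_get_main_shader_part above shows the access rule that the key split imposes everywhere in this patch: because union si_shader_key overlays si_shader_key_ge and si_shader_key_ps, the stage must be checked before any ge field is read. A reduced model of the discipline (stand-in types, not the driver's):

#include <stdbool.h>

/* Reduced model of the split key; the real structs carry many more bits. */
enum stage { STAGE_VERTEX, STAGE_TESS_CTRL, STAGE_TESS_EVAL, STAGE_GEOMETRY, STAGE_FRAGMENT };

union shader_key {
   struct { unsigned as_ls : 1, as_es : 1, as_ngg : 1; } ge; /* VS/TCS/TES/GS */
   struct { unsigned prolog_bits, epilog_bits; } ps;         /* FS only */
};

static bool key_is_ngg(enum stage s, const union shader_key *key)
{
   /* key->ge and key->ps alias, so the stage check must come first;
    * reading ge.as_ngg for a fragment shader would read PS bits. */
   return s <= STAGE_GEOMETRY && key->ge.as_ngg;
}
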
*/ LLVMValueRef esgs_ring; LLVMValueRef gsvs_ring[4]; @@ -176,12 +171,12 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader); unsigned si_get_max_workgroup_size(const struct si_shader *shader); bool si_vs_needs_prolog(const struct si_shader_selector *sel, const struct si_vs_prolog_bits *prolog_key, - const struct si_shader_key *key, bool ngg_cull_shader); + const union si_shader_key *key, bool ngg_cull_shader); void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs, bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key, struct si_shader *shader_out, union si_shader_part_key *key); struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, - const struct si_shader_key *key, + const union si_shader_key *key, bool *free_nir); bool si_need_ps_prolog(const union si_shader_part_key *key); void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, @@ -194,9 +189,8 @@ bool gfx10_ngg_export_prim_early(struct si_shader *shader); void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx); void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3], LLVMValueRef prim_passthrough); -void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs); -void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi); +void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi); void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs); void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx); void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx); @@ -242,15 +236,14 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * /* si_shader_llvm_gs.c */ LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx); -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi); void si_preload_esgs_ring(struct si_shader_context *ctx); void si_preload_gs_rings(struct si_shader_context *ctx); -void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_gs_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_tess.c */ void si_llvm_preload_tes_rings(struct si_shader_context *ctx); -void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi); void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx); void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); @@ -266,7 +259,6 @@ void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); /* si_shader_llvm_vs.c */ -void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir); void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, LLVMValueRef const *so_write_offsets, struct pipe_stream_output *stream_out, @@ -275,7 +267,7 @@ void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_outp unsigned noutput, unsigned 
stream); void si_llvm_build_vs_exports(struct si_shader_context *ctx, struct si_shader_output_values *outputs, unsigned noutput); -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs); +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi); void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key); void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader); diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm.c index 2d643c58cf..dd944e7f8b 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -22,6 +22,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_exp_param.h" #include "ac_nir_to_llvm.h" #include "ac_rtld.h" #include "si_pipe.h" @@ -149,10 +150,10 @@ void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTy gl_shader_stage real_stage = ctx->stage; /* LS is merged into HS (TCS), and ES is merged into GS. */ - if (ctx->screen->info.chip_class >= GFX9) { - if (ctx->shader->key.as_ls) + if (ctx->screen->info.chip_class >= GFX9 && ctx->stage <= MESA_SHADER_GEOMETRY) { + if (ctx->shader->key.ge.as_ls) real_stage = MESA_SHADER_TESS_CTRL; - else if (ctx->shader->key.as_es || ctx->shader->key.as_ngg) + else if (ctx->shader->key.ge.as_es || ctx->shader->key.ge.as_ngg) real_stage = MESA_SHADER_GEOMETRY; } @@ -218,7 +219,8 @@ void si_llvm_create_main_func(struct si_shader_context *ctx, bool ngg_cull_shade } - if (shader->key.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL) { + if (ctx->stage <= MESA_SHADER_GEOMETRY && + (shader->key.ge.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL)) { if (USE_LDS_SYMBOLS) { /* The LSHS size is not known until draw time, so we append it * at the end of whatever LDS use there may be in the rest of @@ -441,8 +443,31 @@ static void si_llvm_declare_compute_memory(struct si_shader_context *ctx) static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) { - if (nir->info.stage == MESA_SHADER_VERTEX) { - si_llvm_load_vs_inputs(ctx, nir); + if (nir->info.stage == MESA_SHADER_GEOMETRY) { + /* Unpack GS vertex offsets. */ + for (unsigned i = 0; i < 6; i++) { + if (ctx->screen->info.chip_class >= GFX9) { + ctx->gs_vtx_offset[i] = si_unpack_param(ctx, ctx->args.gs_vtx_offset[i / 2], (i & 1) * 16, 16); + } else { + ctx->gs_vtx_offset[i] = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[i]); + } + } + + /* Apply the hw bug workaround for triangle strips with adjacency. */ + if (ctx->screen->info.chip_class <= GFX9 && + ctx->shader->key.ge.mono.u.gs_tri_strip_adj_fix) { + LLVMValueRef prim_id = ac_get_arg(&ctx->ac, ctx->args.gs_prim_id); + /* Remap GS vertex offsets for every other primitive. 
*/ + LLVMValueRef rotate = LLVMBuildTrunc(ctx->ac.builder, prim_id, ctx->ac.i1, ""); + LLVMValueRef fixed[6]; + + for (unsigned i = 0; i < 6; i++) { + fixed[i] = LLVMBuildSelect(ctx->ac.builder, rotate, + ctx->gs_vtx_offset[(i + 4) % 6], + ctx->gs_vtx_offset[i], ""); + } + memcpy(ctx->gs_vtx_offset, fixed, sizeof(fixed)); + } } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { unsigned colors_read = ctx->shader->selector->info.colors_read; LLVMValueRef main_fn = ctx->main_fn; @@ -471,7 +496,7 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader * } ctx->abi.interp_at_sample_force_center = - ctx->shader->key.mono.u.ps.interpolate_at_sample_force_center; + ctx->shader->key.ps.mono.interpolate_at_sample_force_center; ctx->abi.kill_ps_if_inf_interp = ctx->screen->options.no_infinite_interp && @@ -490,7 +515,6 @@ static bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader * si_llvm_declare_compute_memory(ctx); } - ctx->abi.inputs = &ctx->inputs[0]; ctx->abi.clamp_shadow_reference = true; ctx->abi.robust_buffer_access = true; ctx->abi.convert_undef_to_zero = true; @@ -807,9 +831,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part !same_thread_count && si_is_multi_part_shader(ctx->shader)) ac_build_endif(&ctx->ac, 6507); - /* Return the value from the last part. It's non-void only for the prim - * discard compute shader. - */ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) LLVMBuildRetVoid(builder); else @@ -859,7 +880,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad si_llvm_create_main_func(ctx, ngg_cull_shader); - if (ctx->shader->key.as_es || ctx->stage == MESA_SHADER_GEOMETRY) + if (ctx->stage <= MESA_SHADER_GEOMETRY && + (ctx->shader->key.ge.as_es || ctx->stage == MESA_SHADER_GEOMETRY)) si_preload_esgs_ring(ctx); if (ctx->stage == MESA_SHADER_GEOMETRY) @@ -877,7 +899,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad for (unsigned i = 0; i < 4; i++) { ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); } - if (shader->key.as_ngg) { + if (shader->key.ge.as_ngg) { for (unsigned i = 0; i < 4; ++i) { ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); @@ -897,21 +919,18 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad } } - if (ctx->stage != MESA_SHADER_GEOMETRY && (shader->key.as_ngg && !shader->key.as_es)) { + if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) && + shader->key.ge.as_ngg && !shader->key.ge.as_es) { /* Unconditionally declare scratch space base for streamout and * vertex compaction. Whether space is actually allocated is * determined during linking / PM4 creation. - * - * Add an extra dword per vertex to ensure an odd stride, which - * avoids bank conflicts for SoA accesses. */ - if (!gfx10_is_ngg_passthrough(shader)) - si_llvm_declare_esgs_ring(ctx); + si_llvm_declare_esgs_ring(ctx); /* This is really only needed when streamout and / or vertex * compaction is enabled. 
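
The gs_vtx_offset remap above replaces the separate GS prolog part that this patch deletes further down: for every odd primitive of a triangle strip with adjacency, the six vertex offsets are rotated by four. Stripped of the LLVM IR builder calls, the remap is just:

#include <stdint.h>
#include <string.h>

/* Plain-C model of the gfx6-9 workaround: odd primitives of a triangle
 * strip with adjacency read their vertex offsets rotated by four. */
static void remap_gs_vtx_offsets(uint32_t offsets[6], uint32_t prim_id)
{
   if (prim_id & 1) { /* LLVMBuildTrunc to i1 tests the low bit */
      uint32_t fixed[6];
      for (unsigned i = 0; i < 6; i++)
         fixed[i] = offsets[(i + 4) % 6];
      memcpy(offsets, fixed, sizeof(fixed));
   }
}
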
*/ - if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.opt.ngg_culling)) { + if (!ctx->gs_ngg_scratch && (sel->so.num_outputs || shader->key.ge.opt.ngg_culling)) { LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); ctx->gs_ngg_scratch = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", AC_ADDR_SPACE_LDS); @@ -927,8 +946,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad /* TES is special because it has only 1 shader part if NGG shader culling is disabled, * and therefore it doesn't use the wrapper function. */ - bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es && - !shader->key.opt.ngg_culling; + bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.ge.as_es && + !shader->key.ge.opt.ngg_culling; /* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there * instead. For monolithic shaders, the wrapper function does this. @@ -936,14 +955,14 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad if ((!shader->is_monolithic || no_wrapper_func) && (ctx->stage == MESA_SHADER_TESS_EVAL || (ctx->stage == MESA_SHADER_VERTEX && - !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) + !si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, ngg_cull_shader)))) ac_init_exec_full_mask(&ctx->ac); /* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease * register usage. */ if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) && - shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) { + shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling) { /* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */ if (ctx->screen->info.chip_class == GFX10) ac_build_s_barrier(&ctx->ac); @@ -958,7 +977,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad } /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */ - if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg) + if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) gfx10_ngg_gs_emit_prologue(ctx); if (ctx->stage == MESA_SHADER_GEOMETRY || @@ -968,8 +987,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad * not here. */ thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */ - } else if (((shader->key.as_ls || shader->key.as_es) && !shader->is_monolithic) || - (shader->key.as_ngg && !shader->key.as_es)) { + } else if (((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) || + (shader->key.ge.as_ngg && !shader->key.ge.as_es)) { /* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS. * For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS), * the if statement is inserted by the wrapper function. @@ -1002,11 +1021,11 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad */ if (ctx->stage == MESA_SHADER_TESS_CTRL) { /* We need the barrier only if TCS inputs are read from LDS. 
*/ - if (!shader->key.opt.same_patch_vertices || + if (!shader->key.ge.opt.same_patch_vertices || shader->selector->info.base.inputs_read & ~shader->selector->tcs_vgpr_only_inputs) ac_build_s_barrier(&ctx->ac); - } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) { + } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */ ac_build_s_barrier(&ctx->ac); } @@ -1045,7 +1064,7 @@ static void si_optimize_vs_outputs(struct si_shader_context *ctx) unsigned skip_vs_optim_mask = 0; if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) || - shader->key.as_ls || shader->key.as_es) + shader->key.ge.as_ls || shader->key.ge.as_es) return; /* Optimizing these outputs is not possible, since they might be overriden @@ -1073,7 +1092,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * si_llvm_context_init(&ctx, sscreen, compiler, si_get_shader_wave_size(shader)); LLVMValueRef ngg_cull_main_fn = NULL; - if (shader->key.opt.ngg_culling) { + if (ctx.stage <= MESA_SHADER_GEOMETRY && shader->key.ge.opt.ngg_culling) { if (!si_llvm_translate_nir(&ctx, shader, nir, false, true)) { si_llvm_dispose(&ctx); return false; @@ -1094,10 +1113,10 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * LLVMValueRef main_fn = ctx.main_fn; if (ngg_cull_main_fn) { - if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, true)) { + if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, true)) { union si_shader_part_key prolog_key; si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, true, - &shader->key.part.vs.prolog, shader, &prolog_key); + &shader->key.ge.part.vs.prolog, shader, &prolog_key); prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; @@ -1106,10 +1125,10 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * parts[num_parts++] = ngg_cull_main_fn; } - if (si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, false)) { + if (si_vs_needs_prolog(sel, &shader->key.ge.part.vs.prolog, &shader->key, false)) { union si_shader_part_key prolog_key; si_get_vs_prolog_key(&sel->info, shader->info.num_input_sgprs, false, - &shader->key.part.vs.prolog, shader, &prolog_key); + &shader->key.ge.part.vs.prolog, shader, &prolog_key); prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &prolog_key); parts[num_parts++] = ctx.main_fn; @@ -1119,9 +1138,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * parts[num_parts++] = main_fn; si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 
1 : 0, 0, false); - - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) { LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn; @@ -1143,10 +1159,10 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * si_build_wrapper_function(&ctx, parts, 3, 0, 0, false); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) { if (sscreen->info.chip_class >= GFX9) { - struct si_shader_selector *ls = shader->key.part.tcs.ls; + struct si_shader_selector *ls = shader->key.ge.part.tcs.ls; LLVMValueRef parts[4]; bool vs_needs_prolog = - si_vs_needs_prolog(ls, &shader->key.part.tcs.ls_prolog, &shader->key, false); + si_vs_needs_prolog(ls, &shader->key.ge.part.tcs.ls_prolog, &shader->key, false); /* TCS main part */ parts[2] = ctx.main_fn; @@ -1154,20 +1170,24 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * /* TCS epilog */ union si_shader_part_key tcs_epilog_key; memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key)); - tcs_epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + tcs_epilog_key.tcs_epilog.states = shader->key.ge.part.tcs.epilog; si_llvm_build_tcs_epilog(&ctx, &tcs_epilog_key); parts[3] = ctx.main_fn; /* VS as LS main part */ ctx.next_shader_sel = ctx.shader->selector; - nir = si_get_nir_shader(ls, NULL, &free_nir); + struct si_shader shader_ls = {}; shader_ls.selector = ls; - shader_ls.key.as_ls = 1; - shader_ls.key.mono = shader->key.mono; - shader_ls.key.opt = shader->key.opt; + shader_ls.key.ge.part.vs.prolog = shader->key.ge.part.tcs.ls_prolog; + shader_ls.key.ge.as_ls = 1; + shader_ls.key.ge.mono = shader->key.ge.mono; + shader_ls.key.ge.opt = shader->key.ge.opt; + shader_ls.key.ge.opt.inline_uniforms = false; /* only TCS can inline uniforms */ shader_ls.is_monolithic = true; + nir = si_get_nir_shader(ls, &shader_ls.key, &free_nir); + if (!si_llvm_translate_nir(&ctx, &shader_ls, nir, free_nir, false)) { si_llvm_dispose(&ctx); return false; @@ -1179,7 +1199,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * if (vs_needs_prolog) { union si_shader_part_key vs_prolog_key; si_get_vs_prolog_key(&ls->info, shader_ls.info.num_input_sgprs, false, - &shader->key.part.tcs.ls_prolog, shader, &vs_prolog_key); + &shader->key.ge.part.tcs.ls_prolog, shader, &vs_prolog_key); vs_prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); parts[0] = ctx.main_fn; @@ -1191,7 +1211,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog, vs_needs_prolog, vs_needs_prolog ? 
2 : 1, - shader->key.opt.same_patch_vertices); + shader->key.ge.opt.same_patch_vertices); } else { LLVMValueRef parts[2]; union si_shader_part_key epilog_key; @@ -1199,7 +1219,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * parts[0] = ctx.main_fn; memset(&epilog_key, 0, sizeof(epilog_key)); - epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; + epilog_key.tcs_epilog.states = shader->key.ge.part.tcs.epilog; si_llvm_build_tcs_epilog(&ctx, &epilog_key); parts[1] = ctx.main_fn; @@ -1207,30 +1227,24 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * } } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) { if (ctx.screen->info.chip_class >= GFX9) { - struct si_shader_selector *es = shader->key.part.gs.es; + struct si_shader_selector *es = shader->key.ge.part.gs.es; LLVMValueRef es_prolog = NULL; LLVMValueRef es_main = NULL; - LLVMValueRef gs_prolog = NULL; LLVMValueRef gs_main = ctx.main_fn; - /* GS prolog */ - union si_shader_part_key gs_prolog_key; - memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); - gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; - si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); - gs_prolog = ctx.main_fn; - /* ES main part */ - nir = si_get_nir_shader(es, NULL, &free_nir); struct si_shader shader_es = {}; shader_es.selector = es; - shader_es.key.as_es = 1; - shader_es.key.as_ngg = shader->key.as_ngg; - shader_es.key.mono = shader->key.mono; - shader_es.key.opt = shader->key.opt; + shader_es.key.ge.part.vs.prolog = shader->key.ge.part.gs.vs_prolog; + shader_es.key.ge.as_es = 1; + shader_es.key.ge.as_ngg = shader->key.ge.as_ngg; + shader_es.key.ge.mono = shader->key.ge.mono; + shader_es.key.ge.opt = shader->key.ge.opt; + shader_es.key.ge.opt.inline_uniforms = false; /* only GS can inline uniforms */ shader_es.is_monolithic = true; + nir = si_get_nir_shader(es, &shader_es.key, &free_nir); + if (!si_llvm_translate_nir(&ctx, &shader_es, nir, free_nir, false)) { si_llvm_dispose(&ctx); return false; @@ -1240,10 +1254,10 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * /* ES prolog */ if (es->info.stage == MESA_SHADER_VERTEX && - si_vs_needs_prolog(es, &shader->key.part.gs.vs_prolog, &shader->key, false)) { + si_vs_needs_prolog(es, &shader->key.ge.part.gs.vs_prolog, &shader->key, false)) { union si_shader_part_key vs_prolog_key; si_get_vs_prolog_key(&es->info, shader_es.info.num_input_sgprs, false, - &shader->key.part.gs.vs_prolog, shader, &vs_prolog_key); + &shader->key.ge.part.gs.vs_prolog, shader, &vs_prolog_key); vs_prolog_key.vs_prolog.is_monolithic = true; si_llvm_build_vs_prolog(&ctx, &vs_prolog_key); es_prolog = ctx.main_fn; @@ -1255,28 +1269,17 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * /* Prepare the array of shader parts. 
*/ LLVMValueRef parts[4]; - unsigned num_parts = 0, main_part, next_first_part; + unsigned num_parts = 0, main_part; if (es_prolog) parts[num_parts++] = es_prolog; parts[main_part = num_parts++] = es_main; - parts[next_first_part = num_parts++] = gs_prolog; parts[num_parts++] = gs_main; - si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false); + si_build_wrapper_function(&ctx, parts, num_parts, main_part, main_part + 1, false); } else { - LLVMValueRef parts[2]; - union si_shader_part_key prolog_key; - - parts[1] = ctx.main_fn; - - memset(&prolog_key, 0, sizeof(prolog_key)); - prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - si_llvm_build_gs_prolog(&ctx, &prolog_key); - parts[0] = ctx.main_fn; - - si_build_wrapper_function(&ctx, parts, 2, 1, 0, false); + /* Nothing to do for gfx6-8. The shader has only 1 part and it's ctx.main_fn. */ } } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) { si_llvm_build_monolithic_ps(&ctx, shader); @@ -1292,8 +1295,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * } /* Make sure the input is a pointer and not integer followed by inttoptr. */ - if (!shader->key.opt.vs_as_prim_discard_cs) - assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); /* Compile to bytecode. */ if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug, diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 67d7150d69..0a9f503ddb 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -52,30 +52,14 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in unsigned param; LLVMValueRef value; - param = si_shader_io_get_unique_index(info->input_semantic[input_index], false); + param = si_shader_io_get_unique_index(info->input[input_index].semantic, false); /* GFX9 has the ESGS ring in LDS. */ if (ctx->screen->info.chip_class >= GFX9) { - unsigned index = vtx_offset_param; - - switch (index / 2) { - case 0: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx01_offset, index % 2 ? 16 : 0, 16); - break; - case 1: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx23_offset, index % 2 ? 16 : 0, 16); - break; - case 2: - vtx_offset = si_unpack_param(ctx, ctx->gs_vtx45_offset, index % 2 ? 16 : 0, 16); - break; - default: - assert(0); - return NULL; - } - unsigned offset = param * 4 + swizzle; - vtx_offset = - LLVMBuildAdd(ctx->ac.builder, vtx_offset, LLVMConstInt(ctx->ac.i32, offset, false), ""); + + vtx_offset = LLVMBuildAdd(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param], + LLVMConstInt(ctx->ac.i32, offset, false), ""); LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); @@ -84,9 +68,8 @@ static LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, unsigned in /* GFX6: input load from the ESGS ring in memory. */ /* Get the vertex offset parameter on GFX6. 
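
The simplified GFX9 path of si_llvm_load_input_gs nearby reduces a GS input load to plain dword addressing off the per-vertex offset, since the ESGS ring lives in LDS there. A scalar model of the address math (the LDS array stands in for the real esgs_ring global):

#include <stdint.h>

/* Scalar model: each ES vertex writes its outputs to LDS as packed vec4
 * slots; param selects the slot, swizzle the component. */
static uint32_t load_gs_input_gfx9(const uint32_t *esgs_lds,
                                   uint32_t vtx_offset, /* in dwords */
                                   unsigned param, unsigned swizzle)
{
   return esgs_lds[vtx_offset + param * 4 + swizzle];
}
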
*/ - LLVMValueRef gs_vtx_offset = ac_get_arg(&ctx->ac, ctx->args.gs_vtx_offset[vtx_offset_param]); - - vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset, LLVMConstInt(ctx->ac.i32, 4, 0), ""); + vtx_offset = LLVMBuildMul(ctx->ac.builder, ctx->gs_vtx_offset[vtx_offset_param], + LLVMConstInt(ctx->ac.i32, 4, 0), ""); soffset = LLVMConstInt(ctx->ac.i32, (param * 4 + swizzle) * 256, 0); @@ -121,7 +104,7 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); ret = si_insert_input_ptr(ctx, ret, ctx->other_samplers_and_images, 1); - if (ctx->shader->key.as_ngg) + if (ctx->shader->key.ge.as_ngg) ret = si_insert_input_ptr(ctx, ret, ctx->args.gs_tg_info, 2); else ret = si_insert_input_ret(ctx, ret, ctx->args.gs2vs_offset, 2); @@ -137,19 +120,20 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) unsigned vgpr = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS; - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx01_offset, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx23_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[0], vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[1], vgpr++); ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_prim_id, vgpr++); ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++); - ret = si_insert_input_ret_float(ctx, ret, ctx->gs_vtx45_offset, vgpr++); + ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_vtx_offset[2], vgpr++); ctx->return_value = ret; } -void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) +void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader *es = ctx->shader; struct si_shader_info *info = &es->selector->info; + LLVMValueRef *addrs = abi->outputs; LLVMValueRef lds_base = NULL; unsigned chan; int i; @@ -211,7 +195,7 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) static void emit_gs_epilogue(struct si_shader_context *ctx) { - if (ctx->shader->key.as_ngg) { + if (ctx->shader->key.ge.as_ngg) { gfx10_ngg_gs_emit_epilogue(ctx); return; } @@ -225,13 +209,12 @@ static void emit_gs_epilogue(struct si_shader_context *ctx) ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); } -static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_gs_epilogue(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader_info UNUSED *info = &ctx->shader->selector->info; - assert(info->num_outputs <= max_outputs); + assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS); emit_gs_epilogue(ctx); } @@ -241,7 +224,7 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVM { struct si_shader_context *ctx = si_shader_context_from_abi(abi); - if (ctx->shader->key.as_ngg) { + if (ctx->shader->key.ge.as_ngg) { gfx10_ngg_gs_emit_vertex(ctx, stream, addrs); return; } @@ -316,7 +299,7 @@ static void si_llvm_emit_primitive(struct ac_shader_abi *abi, unsigned stream) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); - if (ctx->shader->key.as_ngg) { + if (ctx->shader->key.ge.as_ngg) { LLVMBuildStore(ctx->ac.builder, ctx->ac.i32_0, ctx->gs_curprim_verts[stream]); return; } @@ -444,7 +427,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen 
*sscreen, si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, MESA_SHADER_VERTEX, - false, false, false, false)); + false, false)); ctx.shader = shader; ctx.stage = MESA_SHADER_VERTEX; @@ -558,113 +541,6 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, return shader; } -/** - * Build the GS prolog function. Rotate the input vertices for triangle strips - * with adjacency. - */ -void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key) -{ - unsigned num_sgprs, num_vgprs; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMTypeRef returns[AC_MAX_ARGS]; - LLVMValueRef func, ret; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - if (ctx->screen->info.chip_class >= GFX9) { - /* Other user SGPRs are not needed by GS. */ - num_sgprs = 8 + SI_NUM_VS_STATE_RESOURCE_SGPRS; - num_vgprs = 5; /* ES inputs are not needed by GS */ - } else { - num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - num_vgprs = 8; - } - - for (unsigned i = 0; i < num_sgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); - returns[i] = ctx->ac.i32; - } - - for (unsigned i = 0; i < num_vgprs; ++i) { - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); - returns[num_sgprs + i] = ctx->ac.f32; - } - - /* Create the function. */ - si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); - func = ctx->main_fn; - - /* Copy inputs to outputs. This should be no-op, as the registers match, - * but it will prevent the compiler from overwriting them unintentionally. - */ - ret = ctx->return_value; - for (unsigned i = 0; i < num_sgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, i); - ret = LLVMBuildInsertValue(builder, ret, p, i, ""); - } - for (unsigned i = 0; i < num_vgprs; i++) { - LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); - p = ac_to_float(&ctx->ac, p); - ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); - } - - if (key->gs_prolog.states.tri_strip_adj_fix) { - /* Remap the input vertices for every other primitive. 
*/ - const struct ac_arg gfx6_vtx_params[6] = { - {.used = true, .arg_index = num_sgprs}, {.used = true, .arg_index = num_sgprs + 1}, - {.used = true, .arg_index = num_sgprs + 3}, {.used = true, .arg_index = num_sgprs + 4}, - {.used = true, .arg_index = num_sgprs + 5}, {.used = true, .arg_index = num_sgprs + 6}, - }; - const struct ac_arg gfx9_vtx_params[3] = { - {.used = true, .arg_index = num_sgprs}, - {.used = true, .arg_index = num_sgprs + 1}, - {.used = true, .arg_index = num_sgprs + 4}, - }; - LLVMValueRef vtx_in[6], vtx_out[6]; - LLVMValueRef prim_id, rotate; - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - vtx_in[i * 2] = si_unpack_param(ctx, gfx9_vtx_params[i], 0, 16); - vtx_in[i * 2 + 1] = si_unpack_param(ctx, gfx9_vtx_params[i], 16, 16); - } - } else { - for (unsigned i = 0; i < 6; i++) - vtx_in[i] = ac_get_arg(&ctx->ac, gfx6_vtx_params[i]); - } - - prim_id = LLVMGetParam(func, num_sgprs + 2); - rotate = LLVMBuildTrunc(builder, prim_id, ctx->ac.i1, ""); - - for (unsigned i = 0; i < 6; ++i) { - LLVMValueRef base, rotated; - base = vtx_in[i]; - rotated = vtx_in[(i + 4) % 6]; - vtx_out[i] = LLVMBuildSelect(builder, rotate, rotated, base, ""); - } - - if (ctx->screen->info.chip_class >= GFX9) { - for (unsigned i = 0; i < 3; i++) { - LLVMValueRef hi, out; - - hi = LLVMBuildShl(builder, vtx_out[i * 2 + 1], LLVMConstInt(ctx->ac.i32, 16, 0), ""); - out = LLVMBuildOr(builder, vtx_out[i * 2], hi, ""); - out = ac_to_float(&ctx->ac, out); - ret = LLVMBuildInsertValue(builder, ret, out, gfx9_vtx_params[i].arg_index, ""); - } - } else { - for (unsigned i = 0; i < 6; i++) { - LLVMValueRef out; - - out = ac_to_float(&ctx->ac, vtx_out[i]); - ret = LLVMBuildInsertValue(builder, ret, out, gfx6_vtx_params[i].arg_index, ""); - } - } - } - - LLVMBuildRet(builder, ret); -} - void si_llvm_init_gs_callbacks(struct si_shader_context *ctx) { ctx->abi.load_inputs = si_nir_load_input_gs; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c index c5b58b9a59..758ea36b43 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c @@ -79,22 +79,22 @@ static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi) args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16); - if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D) + if (!ctx->shader->key.ps.mono.fbfetch_is_1D) args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16); /* Get the current render target layer index. 
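
The chain of key.ps.mono.fbfetch_* checks in the fbfetch code below collapses to a small truth table over (msaa, 1D, layered). As a sketch, with the ac_image_* dimension names modeled as a plain enum rather than the real amd/common definitions:

#include <stdbool.h>

enum ac_image_dim { /* assumed subset for this sketch */
   ac_image_1d, ac_image_1darray, ac_image_2d, ac_image_2darray,
   ac_image_2dmsaa, ac_image_2darraymsaa
};

/* Pick the image dimension for a framebuffer-fetch load. */
static enum ac_image_dim fbfetch_image_dim(bool msaa, bool is_1D, bool layered)
{
   if (msaa)
      return layered ? ac_image_2darraymsaa : ac_image_2dmsaa;
   if (is_1D)
      return layered ? ac_image_1darray : ac_image_1d;
   return layered ? ac_image_2darray : ac_image_2d;
}
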
*/ - if (ctx->shader->key.mono.u.ps.fbfetch_layered) + if (ctx->shader->key.ps.mono.fbfetch_layered) args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11); - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + if (ctx->shader->key.ps.mono.fbfetch_msaa) args.coords[chan++] = si_get_sample_id(ctx); - if (ctx->shader->key.mono.u.ps.fbfetch_msaa && !(ctx->screen->debug_flags & DBG(NO_FMASK))) { + if (ctx->shader->key.ps.mono.fbfetch_msaa && !(ctx->screen->debug_flags & DBG(NO_FMASK))) { fmask = ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0)); ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, - ctx->shader->key.mono.u.ps.fbfetch_layered); + ctx->shader->key.ps.mono.fbfetch_layered); } args.opcode = ac_image_load; @@ -102,13 +102,13 @@ static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi) args.dmask = 0xf; args.attributes = AC_FUNC_ATTR_READNONE; - if (ctx->shader->key.mono.u.ps.fbfetch_msaa) + if (ctx->shader->key.ps.mono.fbfetch_msaa) args.dim = - ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darraymsaa : ac_image_2dmsaa; - else if (ctx->shader->key.mono.u.ps.fbfetch_is_1D) - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_1darray : ac_image_1d; + ctx->shader->key.ps.mono.fbfetch_layered ? ac_image_2darraymsaa : ac_image_2dmsaa; + else if (ctx->shader->key.ps.mono.fbfetch_is_1D) + args.dim = ctx->shader->key.ps.mono.fbfetch_layered ? ac_image_1darray : ac_image_1d; else - args.dim = ctx->shader->key.mono.u.ps.fbfetch_layered ? ac_image_2darray : ac_image_2d; + args.dim = ctx->shader->key.ps.mono.fbfetch_layered ? ac_image_2darray : ac_image_2d; return ac_build_image_opcode(&ctx->ac, &args); } @@ -170,7 +170,7 @@ static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index, j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, ""); } - if (ctx->shader->key.part.ps.prolog.color_two_side) { + if (ctx->shader->key.ps.part.prolog.color_two_side) { LLVMValueRef is_face_positive; /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1", @@ -199,13 +199,13 @@ static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index, static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha) { - if (ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_NEVER) { + if (ctx->shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_NEVER) { static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = { [PIPE_FUNC_LESS] = LLVMRealOLT, [PIPE_FUNC_EQUAL] = LLVMRealOEQ, [PIPE_FUNC_LEQUAL] = LLVMRealOLE, [PIPE_FUNC_GREATER] = LLVMRealOGT, [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE, }; - LLVMRealPredicate cond = cond_map[ctx->shader->key.part.ps.epilog.alpha_func]; + LLVMRealPredicate cond = cond_map[ctx->shader->key.ps.part.epilog.alpha_func]; assert(cond); LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF); @@ -274,8 +274,8 @@ static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue unsigned cbuf, unsigned compacted_mrt_index, unsigned color_type, struct ac_export_args *args) { - const struct si_shader_key *key = &ctx->shader->key; - unsigned col_formats = key->part.ps.epilog.spi_shader_col_format; + const union si_shader_key *key = &ctx->shader->key; + unsigned col_formats = key->ps.part.epilog.spi_shader_col_format; LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32); unsigned spi_shader_col_format; unsigned chan; @@ -284,8 +284,8 @@ static void 
si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue assert(cbuf < 8); spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf; - is_int8 = (key->part.ps.epilog.color_is_int8 >> cbuf) & 0x1; - is_int10 = (key->part.ps.epilog.color_is_int10 >> cbuf) & 0x1; + is_int8 = (key->ps.part.epilog.color_is_int8 >> cbuf) & 0x1; + is_int10 = (key->ps.part.epilog.color_is_int10 >> cbuf) & 0x1; /* Default is 0xf. Adjusted below depending on the format. */ args->enabled_channels = 0xf; /* writemask */ @@ -411,31 +411,31 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col int i; /* Clamp color */ - if (ctx->shader->key.part.ps.epilog.clamp_color) + if (ctx->shader->key.ps.part.epilog.clamp_color) for (i = 0; i < 4; i++) color[i] = ac_build_clamp(&ctx->ac, color[i]); /* Alpha to one */ - if (ctx->shader->key.part.ps.epilog.alpha_to_one) + if (ctx->shader->key.ps.part.epilog.alpha_to_one) color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1); /* Alpha test */ - if (index == 0 && ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS) + if (index == 0 && ctx->shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS) si_alpha_test(ctx, color[3]); /* Line & polygon smoothing */ - if (ctx->shader->key.part.ps.epilog.poly_line_smoothing) + if (ctx->shader->key.ps.part.epilog.poly_line_smoothing) color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], samplemask_param); /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */ - if (ctx->shader->key.part.ps.epilog.last_cbuf > 0) { + if (ctx->shader->key.ps.part.epilog.last_cbuf > 0) { struct ac_export_args args[8]; int c, last = -1; assert(compacted_mrt_index == 0); /* Get the export arguments, also find out what the last one is. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.ps.part.epilog.last_cbuf; c++) { si_llvm_init_ps_export_args(ctx, color, c, compacted_mrt_index, color_type, &args[c]); if (args[c].enabled_channels) { @@ -447,7 +447,7 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col return false; /* Emit all exports. */ - for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) { + for (c = 0; c <= ctx->shader->key.ps.part.epilog.last_cbuf; c++) { if (is_last && last == c) { args[c].valid_mask = 1; /* whether the EXEC mask is valid */ args[c].done = 1; /* DONE bit */ @@ -486,14 +486,14 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col * * The alpha-ref SGPR is returned via its original location. 
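/* Restating the per-MRT key unpacking used above (sketch only, hypothetical
 * helper): each color buffer owns a 4-bit export format in
 * spi_shader_col_format plus one bit per cbuf in the color_is_int8 and
 * color_is_int10 masks. */
static void si_unpack_mrt_key_sketch(unsigned col_formats, unsigned int8_mask,
                                     unsigned int10_mask, unsigned cbuf,
                                     unsigned *spi_format, bool *is_int8,
                                     bool *is_int10)
{
   *spi_format = (col_formats >> (cbuf * 4)) & 0xf;
   *is_int8 = (int8_mask >> cbuf) & 0x1;
   *is_int10 = (int10_mask >> cbuf) & 0x1;
}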
*/ -static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader *shader = ctx->shader; struct si_shader_info *info = &shader->selector->info; LLVMBuilderRef builder = ctx->ac.builder; unsigned i, j, first_vgpr, vgpr; + LLVMValueRef *addrs = abi->outputs; LLVMValueRef color[8][4] = {}; LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c index af4f389546..3def734a68 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c @@ -150,6 +150,30 @@ static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rs } } +static LLVMValueRef force_write_compress_off(struct si_shader_context *ctx, LLVMValueRef rsrc) +{ + LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); + LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, 0); + LLVMValueRef tmp; + + tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); + tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); + return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); +} + +static LLVMValueRef fixup_image_desc(struct si_shader_context *ctx, LLVMValueRef rsrc, + bool uses_store) +{ + if (uses_store && ctx->ac.chip_class <= GFX9) + rsrc = force_dcc_off(ctx, rsrc); + + if (!uses_store && ctx->screen->info.has_image_load_dcc_bug && + ctx->screen->always_allow_dcc_stores) + rsrc = force_write_compress_off(ctx, rsrc); + + return rsrc; +} + /* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should * adjust "index" to point to FMASK. */ static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list, @@ -171,8 +195,9 @@ static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueR else rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); - if (desc_type == AC_DESC_IMAGE && uses_store && ctx->ac.chip_class <= GFX9) - rsrc = force_dcc_off(ctx, rsrc); + if (desc_type == AC_DESC_IMAGE) + rsrc = fixup_image_desc(ctx, rsrc, uses_store); + return rsrc; } @@ -281,8 +306,13 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned /* Fast path if the image is in user SGPRs. */ if (!dynamic_index && const_index < ctx->shader->selector->cs_num_images_in_user_sgprs && - (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) - return ac_get_arg(&ctx->ac, ctx->cs_image[const_index]); + (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) { + LLVMValueRef rsrc = ac_get_arg(&ctx->ac, ctx->cs_image[const_index]); + + if (desc_type == AC_DESC_IMAGE) + rsrc = fixup_image_desc(ctx, rsrc, write); + return rsrc; + } /* FMASKs are separate from images. 
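/* CPU-side restatement of force_write_compress_off() above (sketch,
 * assuming the usual 8-dword image descriptor): dword 6 holds the
 * WRITE_COMPRESS_ENABLE field, and C_00A018_WRITE_COMPRESS_ENABLE is the
 * generated clear mask for it. */
static void si_clear_write_compress_sketch(uint32_t desc[8])
{
   desc[6] &= C_00A018_WRITE_COMPRESS_ENABLE;
}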
*/ if (desc_type == AC_DESC_FMASK) { diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 145df00efb..b0aa0a0165 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -71,8 +71,8 @@ static unsigned get_tcs_out_vertex_dw_stride_constant(struct si_shader_context * { assert(ctx->stage == MESA_SHADER_TESS_CTRL); - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) - return util_last_bit64(ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) * 4; + if (ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy) + return util_last_bit64(ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy) * 4; return util_last_bit64(ctx->shader->selector->outputs_written) * 4; } @@ -86,7 +86,7 @@ static LLVMValueRef get_tcs_out_vertex_dw_stride(struct si_shader_context *ctx) static LLVMValueRef get_tcs_out_patch_stride(struct si_shader_context *ctx) { - if (ctx->shader->key.mono.u.ff_tcs_inputs_to_copy) + if (ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy) return si_unpack_param(ctx, ctx->tcs_out_lds_layout, 0, 13); const struct si_shader_info *info = &ctx->shader->selector->info; @@ -160,7 +160,7 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) case MESA_SHADER_TESS_CTRL: if (ctx->screen->info.chip_class >= GFX9 && ctx->shader->is_monolithic) { - stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4; + stride = ctx->shader->key.ge.part.tcs.ls->lshs_vertex_stride / 4; return LLVMConstInt(ctx->ac.i32, stride, 0); } return si_unpack_param(ctx, ctx->vs_state_bits, 24, 8); @@ -390,13 +390,13 @@ static LLVMValueRef si_nir_load_tcs_varyings(struct ac_shader_abi *abi, LLVMType ubyte semantic; if (load_input) { - semantic = info->input_semantic[driver_location]; + semantic = info->input[driver_location].semantic; } else { semantic = info->output_semantic[driver_location]; } /* Load the TCS input from a VGPR if possible. */ - if (ctx->shader->key.opt.same_patch_vertices && + if (ctx->shader->key.ge.opt.same_patch_vertices && load_input && vertex_index_is_invoc_id && !param_index) { unsigned func_param = ctx->args.tcs_rel_ids.arg_index + 1 + si_shader_io_get_unique_index(semantic, false) * 4; @@ -448,7 +448,7 @@ static LLVMValueRef si_nir_load_input_tes(struct ac_shader_abi *abi, LLVMTypeRef struct si_shader_info *info = &ctx->shader->selector->info; LLVMValueRef base, addr; - ubyte semantic = info->input_semantic[driver_location]; + ubyte semantic = info->input[driver_location].semantic; assert((semantic >= VARYING_SLOT_PATCH0 || semantic == VARYING_SLOT_TESS_LEVEL_INNER || @@ -559,21 +559,6 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, } } -static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMValueRef coord[4] = {ac_get_arg(&ctx->ac, ctx->args.tes_u), - ac_get_arg(&ctx->ac, ctx->args.tes_v), - ctx->ac.f32_0, ctx->ac.f32_0}; - - /* For triangles, the vector should be (u, v, 1-u-v). 
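/* Plain-C form of the computation removed below (the abi.load_tess_coord
 * hook goes away here, presumably in favor of shared lowering; hypothetical
 * helper): TessCoord is (u, v, 0) except for triangles, where the third
 * barycentric is 1 - u - v. */
static void si_tess_coord_sketch(float u, float v, bool triangles, float coord[4])
{
   coord[0] = u;
   coord[1] = v;
   coord[2] = triangles ? 1.0f - u - v : 0.0f;
   coord[3] = 0.0f;
}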
*/ - if (ctx->shader->selector->info.base.tess.primitive_mode == GL_TRIANGLES) { - coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1, - LLVMBuildFAdd(ctx->ac.builder, coord[0], coord[1], ""), ""); - } - return ac_build_gather_values(&ctx->ac, coord, 4); -} - static LLVMValueRef load_tess_level(struct si_shader_context *ctx, unsigned semantic) { LLVMValueRef base, addr; @@ -665,7 +650,7 @@ static void si_copy_tcs_inputs(struct si_shader_context *ctx) lds_base = get_tcs_in_current_patch_offset(ctx); lds_base = ac_build_imad(&ctx->ac, invocation_id, lds_vertex_stride, lds_base); - inputs = ctx->shader->key.mono.u.ff_tcs_inputs_to_copy; + inputs = ctx->shader->key.ge.mono.u.ff_tcs_inputs_to_copy; while (inputs) { unsigned i = u_bit_scan64(&inputs); @@ -694,7 +679,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re unsigned stride, outer_comps, inner_comps, i, offset; /* Add a barrier before loading tess factors from LDS. */ - if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) + if (!shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) si_llvm_emit_barrier(ctx); /* Do this only for invocation 0, because the tess levels are per-patch, @@ -707,7 +692,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, invocation_id, ctx->ac.i32_0, ""), 6503); /* Determine the layout of one tess factor element in the buffer. */ - switch (shader->key.part.tcs.epilog.prim_mode) { + switch (shader->key.ge.part.tcs.epilog.prim_mode) { case GL_LINES: stride = 2; /* 2 dwords, 1 vec2 store */ outer_comps = 2; @@ -733,7 +718,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re outer[i] = LLVMGetUndef(ctx->ac.i32); } - if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { + if (shader->key.ge.part.tcs.epilog.invoc0_tess_factors_are_def) { /* Tess factors are in VGPRs. */ for (i = 0; i < outer_comps; i++) outer[i] = out[i] = invoc0_tf_outer[i]; @@ -760,7 +745,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re } } - if (shader->key.part.tcs.epilog.prim_mode == GL_LINES) { + if (shader->key.ge.part.tcs.epilog.prim_mode == GL_LINES) { /* For isolines, the hardware expects tess factors in the * reverse order from what NIR specifies. */ @@ -804,7 +789,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re ac_glc); /* Store the tess factors into the offchip buffer if TES reads them. */ - if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { + if (shader->key.ge.part.tcs.epilog.tes_reads_tess_factors) { LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; LLVMValueRef tf_inner_offset; unsigned param_outer, param_inner; @@ -839,8 +824,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re } /* This only writes the tessellation factor levels. 
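/* The tess-factor buffer layout chosen above, per primitive mode, restated
 * as a sketch (hypothetical helper; only the isolines case is visible in
 * this hunk, the other values follow the driver's switch): */
static void si_tess_factor_layout_sketch(unsigned prim_mode, unsigned *stride,
                                         unsigned *outer_comps, unsigned *inner_comps)
{
   switch (prim_mode) {
   case GL_LINES:     /* isolines: 2 dwords, 1 vec2 store */
      *stride = 2; *outer_comps = 2; *inner_comps = 0; break;
   case GL_TRIANGLES: /* 4 dwords, 1 vec4 store */
      *stride = 4; *outer_comps = 3; *inner_comps = 1; break;
   default:           /* quads: 6 dwords */
      *stride = 6; *outer_comps = 4; *inner_comps = 2; break;
   }
}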
*/ -static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) +static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); LLVMBuilderRef builder = ctx->ac.builder; @@ -954,7 +938,7 @@ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) ctx->return_value = ret; } -void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) +void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader *shader = ctx->shader; @@ -963,6 +947,7 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L LLVMValueRef vertex_id = ac_get_arg(&ctx->ac, ctx->args.vs_rel_patch_id); LLVMValueRef vertex_dw_stride = get_tcs_in_vertex_dw_stride(ctx); LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->ac.builder, vertex_id, vertex_dw_stride, ""); + LLVMValueRef *addrs = abi->outputs; unsigned ret_offset = 8 + GFX9_TCS_NUM_USER_SGPR + 2; /* Write outputs to LDS. The next shader (TCS aka HS) will read @@ -998,11 +983,11 @@ void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - if (!shader->key.opt.same_patch_vertices || + if (!shader->key.ge.opt.same_patch_vertices || !(ctx->next_shader_sel->tcs_vgpr_only_inputs & (1ull << semantic))) lshs_lds_store(ctx, chan, dw_addr, value); - if (shader->key.opt.same_patch_vertices) { + if (shader->key.ge.opt.same_patch_vertices) { ctx->return_value = LLVMBuildInsertValue(ctx->ac.builder, ctx->return_value, value, ret_offset + param * 4 + chan, ""); } @@ -1096,15 +1081,14 @@ void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx) void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader) { ctx->abi.load_tess_varyings = si_nir_load_input_tes; - ctx->abi.load_tess_coord = si_load_tess_coord; ctx->abi.load_tess_level = si_load_tess_level; ctx->abi.load_patch_vertices_in = si_load_patch_vertices_in; - if (ctx->shader->key.as_es) + if (ctx->shader->key.ge.as_es) ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; else if (ngg_cull_shader) ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue; - else if (ctx->shader->key.as_ngg) + else if (ctx->shader->key.ge.as_ngg) ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; else ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 0d07aeb7a7..53cf986412 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -26,6 +26,7 @@ #include "si_shader_internal.h" #include "sid.h" #include "util/u_memory.h" +#include "ac_exp_param.h" static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i32, unsigned index) { @@ -107,7 +108,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L * ... which is what we must prevent at all cost. */ const bool can_speculate = false; - unsigned bit_size = info->input_fp16_lo_hi_valid[input_index] & 0x1 ? 16 : 32; + unsigned bit_size = info->input[input_index].fp16_lo_hi_valid & 0x1 ? 16 : 32; LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32; LLVMTypeRef float_type = bit_size == 16 ? 
ctx->ac.f16 : ctx->ac.f32; unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; @@ -130,8 +131,8 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L * of dword-sized data that needs fixups. We need to insert conversion * code anyway, and the amd/common code does it for us. */ - bool opencode = ctx->shader->key.mono.vs_fetch_opencode & (1 << input_index); - fix_fetch.bits = ctx->shader->key.mono.vs_fix_fetch[input_index].bits; + bool opencode = ctx->shader->key.ge.mono.vs_fetch_opencode & (1 << input_index); + fix_fetch.bits = ctx->shader->key.ge.mono.vs_fix_fetch[input_index].bits; if (opencode || (fix_fetch.u.log_size == 3 && fix_fetch.u.format == AC_FETCH_FORMAT_FLOAT) || (fix_fetch.u.log_size == 2)) { tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size, @@ -157,7 +158,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L return; } - unsigned required_channels = util_last_bit(info->input_usage_mask[input_index]); + unsigned required_channels = util_last_bit(info->input[input_index].usage_mask); if (required_channels == 0) { for (unsigned i = 0; i < 4; ++i) out[i] = LLVMGetUndef(ctx->ac.f32); @@ -252,18 +253,19 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L out[i] = ac_to_float(&ctx->ac, fetches[i]); } -void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir) +static LLVMValueRef si_load_vs_input(struct ac_shader_abi *abi, unsigned driver_location, + unsigned component, unsigned num_components, + unsigned vertex_index, LLVMTypeRef type) { - const struct si_shader_info *info = &ctx->shader->selector->info; + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMValueRef values[4]; - for (unsigned i = 0; i < info->num_inputs; i++) { - LLVMValueRef values[4]; + load_input_vs(ctx, driver_location, values); - load_input_vs(ctx, i, values); + for (unsigned i = 0; i < 4; i++) + values[i] = LLVMBuildBitCast(ctx->ac.builder, values[i], type, ""); - for (unsigned chan = 0; chan < 4; chan++) - ctx->inputs[i * 4 + chan] = ac_to_integer(&ctx->ac, values[chan]); - } + return ac_build_varying_gather_values(&ctx->ac, values, num_components, component); } void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers, @@ -398,7 +400,7 @@ static void si_llvm_emit_clipvertex(struct si_shader_context *ctx, struct ac_exp LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0); LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index); unsigned clipdist_mask = ctx->shader->selector->clipdist_mask & - ~ctx->shader->key.opt.kill_clip_distances; + ~ctx->shader->key.ge.opt.kill_clip_distances; for (reg_index = 0; reg_index < 2; reg_index++) { struct ac_export_args *args = &pos[2 + reg_index]; @@ -451,6 +453,9 @@ static void si_prepare_param_exports(struct si_shader_context *ctx, struct si_shader *shader = ctx->shader; unsigned param_count = 0; + memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000, + sizeof(shader->info.vs_output_param_offset)); + for (unsigned i = 0; i < noutput; i++) { unsigned semantic = outputs[i].semantic; @@ -479,7 +484,7 @@ static void si_prepare_param_exports(struct si_shader_context *ctx, } if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && - shader->key.opt.kill_outputs & + shader->key.ge.opt.kill_outputs & (1ull << si_shader_io_get_unique_index(semantic, true))) 
continue; @@ -570,7 +575,7 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, viewport_index_value = NULL; unsigned pos_idx, index; unsigned clipdist_mask = (shader->selector->clipdist_mask & - ~shader->key.opt.kill_clip_distances) | + ~shader->key.ge.opt.kill_clip_distances) | shader->selector->culldist_mask; int i; @@ -624,8 +629,8 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, pos_args[0].out[3] = ctx->ac.f32_1; /* W */ } - bool writes_psize = shader->selector->info.writes_psize && !shader->key.opt.kill_pointsize; - bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.as_ngg; + bool writes_psize = shader->selector->info.writes_psize && !shader->key.ge.opt.kill_pointsize; + bool pos_writes_edgeflag = shader->selector->info.writes_edgeflag && !shader->key.ge.as_ngg; bool writes_vrs = ctx->screen->options.vrs2x2; /* Write the misc vector (point size, edgeflag, layer, viewport). */ @@ -752,15 +757,16 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, ac_build_export(&ctx->ac, ¶m_exports[i]); } -void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs) +void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); struct si_shader_info *info = &ctx->shader->selector->info; struct si_shader_output_values *outputs = NULL; + LLVMValueRef *addrs = abi->outputs; int i, j; assert(!ctx->shader->is_gs_copy_shader); - assert(info->num_outputs <= max_outputs); + assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS); outputs = MALLOC((info->num_outputs + 1) * sizeof(outputs[0])); @@ -777,7 +783,7 @@ void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L si_llvm_emit_streamout(ctx, outputs, i, 0); /* Export PrimitiveID. */ - if (ctx->shader->key.mono.u.vs_export_prim_id) { + if (ctx->shader->key.ge.mono.u.vs_export_prim_id) { outputs[i].semantic = VARYING_SLOT_PRIMITIVE_ID; outputs[i].values[0] = ac_to_float(&ctx->ac, si_get_primitive_id(ctx, 0)); for (j = 1; j < 4; j++) @@ -791,32 +797,6 @@ void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, L FREE(outputs); } -static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, - LLVMValueRef *addrs) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef pos[4] = {}; - - assert(info->num_outputs <= max_outputs); - - for (unsigned i = 0; i < info->num_outputs; i++) { - if (info->output_semantic[i] != VARYING_SLOT_POS) - continue; - - for (unsigned chan = 0; chan < 4; chan++) - pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - break; - } - assert(pos[0] != NULL); - - /* Return the position output. */ - LLVMValueRef ret = ctx->return_value; - for (unsigned chan = 0; chan < 4; chan++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); - ctx->return_value = ret; -} - /** * Build the vertex shader prolog function. 
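/* Mask algebra used throughout this file, restated as a sketch (hypothetical
 * helpers): outputs and clip distances the linked consumer provably never
 * reads are dropped with per-bit kill masks from the now-per-stage key. */
static uint64_t si_live_outputs_sketch(uint64_t outputs_written, uint64_t kill_outputs)
{
   return outputs_written & ~kill_outputs;
}

static unsigned si_live_clipdists_sketch(unsigned clipdist_mask,
                                         unsigned kill_clip_distances,
                                         unsigned culldist_mask)
{
   return (clipdist_mask & ~kill_clip_distances) | culldist_mask;
}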
* @@ -859,8 +839,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part returns[num_returns++] = ctx->ac.i32; } - struct ac_arg merged_wave_info = input_sgpr_param[3]; - /* Preloaded VGPRs (outputs must be floats) */ for (i = 0; i < num_input_vgprs; i++) { ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &input_vgpr_param[i]); @@ -912,109 +890,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part } } - if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) { - LLVMValueRef wave_id, thread_id_in_tg; - - wave_id = si_unpack_param(ctx, input_sgpr_param[3], 24, 4); - thread_id_in_tg = - ac_build_imad(&ctx->ac, wave_id, LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), - ac_get_thread_id(&ctx->ac)); - - /* The GS fast launch initializes all VGPRs to the value of - * the first thread, so we have to add the thread ID. - * - * Only these are initialized by the hw: - * VGPR2: Base Primitive ID - * VGPR5: Base Vertex ID - * VGPR6: Instance ID - */ - - /* Put the vertex thread IDs into VGPRs as-is instead of packing them. - * The NGG cull shader will read them from there. - */ - if (key->vs_prolog.gs_fast_launch_tri_list) { - input_vgprs[0] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx01_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 0 */ - LLVMConstInt(ctx->ac.i32, 0, 0)); - input_vgprs[1] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx23_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 1 */ - LLVMConstInt(ctx->ac.i32, 1, 0)); - input_vgprs[4] = ac_build_imad(&ctx->ac, thread_id_in_tg, /* gs_vtx45_offset */ - LLVMConstInt(ctx->ac.i32, 3, 0), /* Vertex 2 */ - LLVMConstInt(ctx->ac.i32, 2, 0)); - } else { - assert(key->vs_prolog.gs_fast_launch_tri_strip); - LLVMBuilderRef builder = ctx->ac.builder; - /* Triangle indices: */ - LLVMValueRef index[3] = { - thread_id_in_tg, - LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 1, 0), ""), - LLVMBuildAdd(builder, thread_id_in_tg, LLVMConstInt(ctx->ac.i32, 2, 0), ""), - }; - LLVMValueRef is_odd = LLVMBuildTrunc(ctx->ac.builder, thread_id_in_tg, ctx->ac.i1, ""); - LLVMValueRef flatshade_first = LLVMBuildICmp( - builder, LLVMIntEQ, - si_unpack_param(ctx, input_sgpr_param[8 + SI_SGPR_VS_STATE_BITS], 4, 2), - ctx->ac.i32_0, ""); - - ac_build_triangle_strip_indices_to_triangle(&ctx->ac, is_odd, flatshade_first, index); - input_vgprs[0] = index[0]; - input_vgprs[1] = index[1]; - input_vgprs[4] = index[2]; - } - - /* Triangles always have all edge flags set initially. 
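/* A plain-C model of what the removed ac_build_triangle_strip_indices_to_triangle()
 * call computed (my reading of that helper; treat as a sketch, not a
 * definitive restatement): odd strip triangles have reversed winding, so
 * two indices are swapped while the provoking vertex stays in place. */
static void si_strip_to_tri_sketch(unsigned thread_id, bool flatshade_first,
                                   unsigned index[3])
{
   index[0] = thread_id;
   index[1] = thread_id + 1;
   index[2] = thread_id + 2;

   if (thread_id & 1) { /* is_odd */
      if (flatshade_first) {
         /* Provoking vertex is first: swap indices 1 and 2. */
         unsigned tmp = index[1];
         index[1] = index[2];
         index[2] = tmp;
      } else {
         /* Provoking vertex is last: swap indices 0 and 1. */
         unsigned tmp = index[0];
         index[0] = index[1];
         index[1] = tmp;
      }
   }
}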
*/ - input_vgprs[3] = LLVMConstInt(ctx->ac.i32, 0x7 << 8, 0); - - input_vgprs[2] = - LLVMBuildAdd(ctx->ac.builder, input_vgprs[2], thread_id_in_tg, ""); /* PrimID */ - input_vgprs[5] = - LLVMBuildAdd(ctx->ac.builder, input_vgprs[5], thread_id_in_tg, ""); /* VertexID */ - input_vgprs[8] = input_vgprs[6]; /* InstanceID */ - - if (key->vs_prolog.gs_fast_launch_index_size_packed) { - LLVMTypeRef index_type = ctx->ac.voidt; - - switch (key->vs_prolog.gs_fast_launch_index_size_packed) { - case 1: - index_type = ctx->ac.i8; - break; - case 2: - index_type = ctx->ac.i16; - break; - case 3: - index_type = ctx->ac.i32; - break; - default: - unreachable("invalid gs_fast_launch_index_size_packed"); - } - - LLVMValueRef sgprs[2] = { - ac_get_arg(&ctx->ac, input_sgpr_param[0]), - ac_get_arg(&ctx->ac, input_sgpr_param[1]), - }; - LLVMValueRef indices = ac_build_gather_values(&ctx->ac, sgprs, 2); - indices = LLVMBuildBitCast(ctx->ac.builder, indices, ctx->ac.i64, ""); - indices = LLVMBuildIntToPtr(ctx->ac.builder, indices, - LLVMPointerType(index_type, AC_ADDR_SPACE_CONST), ""); - - LLVMValueRef vertex_id = ac_build_alloca_init(&ctx->ac, input_vgprs[5], ""); - - /* if (is ES thread...) */ - ac_build_ifcc(&ctx->ac, - LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), - si_unpack_param(ctx, merged_wave_info, 0, 8), ""), 0); - /* VertexID = indexBufferLoad(VertexID); */ - LLVMValueRef index = LLVMBuildGEP(ctx->ac.builder, indices, &input_vgprs[5], 1, ""); - index = LLVMBuildLoad(ctx->ac.builder, index, ""); - index = LLVMBuildZExt(ctx->ac.builder, index, ctx->ac.i32, ""); - LLVMBuildStore(ctx->ac.builder, index, vertex_id); - ac_build_endif(&ctx->ac, 0); - - input_vgprs[5] = LLVMBuildLoad(ctx->ac.builder, vertex_id, ""); - } - } - unsigned vertex_id_vgpr = first_vs_vgpr; unsigned instance_id_vgpr = ctx->screen->info.chip_class >= GFX10 ? first_vs_vgpr + 3 @@ -1023,16 +898,6 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr]; ctx->abi.instance_id = input_vgprs[instance_id_vgpr]; - /* InstanceID = VertexID >> 16; - * VertexID = VertexID & 0xffff; - */ - if (key->vs_prolog.states.unpack_instance_id_from_vertex_id) { - ctx->abi.instance_id = - LLVMBuildLShr(ctx->ac.builder, ctx->abi.vertex_id, LLVMConstInt(ctx->ac.i32, 16, 0), ""); - ctx->abi.vertex_id = LLVMBuildAnd(ctx->ac.builder, ctx->abi.vertex_id, - LLVMConstInt(ctx->ac.i32, 0xffff, 0), ""); - } - /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. 
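/* The removed packing scheme, kept here for reference as a sketch
 * (hypothetical helper): with unpack_instance_id_from_vertex_id set, one
 * input VGPR carried both IDs. */
static void si_unpack_ids_sketch(uint32_t packed, uint32_t *vertex_id,
                                 uint32_t *instance_id)
{
   *instance_id = packed >> 16;
   *vertex_id = packed & 0xffff;
}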
*/ @@ -1125,18 +990,17 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad { struct si_shader *shader = ctx->shader; - if (shader->key.as_ls) + if (shader->key.ge.as_ls) ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; - else if (shader->key.as_es) + else if (shader->key.ge.as_es) ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.opt.vs_as_prim_discard_cs) - ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; else if (ngg_cull_shader) ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue; - else if (shader->key.as_ngg) + else if (shader->key.ge.as_ngg) ctx->abi.emit_outputs = gfx10_emit_ngg_epilogue; else ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; ctx->abi.load_base_vertex = get_base_vertex; + ctx->abi.load_inputs = si_load_vs_input; } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_nir.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_nir.c index 6a40ec7b85..ed07fa7e0a 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -108,21 +108,25 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr unsigned num_slots = indirect ? nir_intrinsic_io_semantics(intr).num_slots : 1; if (is_input) { - assert(driver_location + num_slots <= ARRAY_SIZE(info->input_usage_mask)); + assert(driver_location + num_slots <= ARRAY_SIZE(info->input)); for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; - info->input_semantic[loc] = semantic + i; - info->input_interpolate[loc] = interp; + info->input[loc].semantic = semantic + i; + + if (semantic == SYSTEM_VALUE_PRIMITIVE_ID) + info->input[loc].interpolate = INTERP_MODE_FLAT; + else + info->input[loc].interpolate = interp; if (mask) { - info->input_usage_mask[loc] |= mask; + info->input[loc].usage_mask |= mask; if (bit_size == 16) { if (nir_intrinsic_io_semantics(intr).high_16bits) - info->input_fp16_lo_hi_valid[loc] |= 0x2; + info->input[loc].fp16_lo_hi_valid |= 0x2; else - info->input_fp16_lo_hi_valid[loc] |= 0x1; + info->input[loc].fp16_lo_hi_valid |= 0x1; } info->num_inputs = MAX2(info->num_inputs, loc + 1); } @@ -130,13 +134,11 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } else { /* Outputs. */ assert(driver_location + num_slots <= ARRAY_SIZE(info->output_usagemask)); - assert(semantic + num_slots < ARRAY_SIZE(info->output_semantic_to_slot)); for (unsigned i = 0; i < num_slots; i++) { unsigned loc = driver_location + i; info->output_semantic[loc] = semantic + i; - info->output_semantic_to_slot[semantic + i] = loc; if (is_output_load) { /* Output loads have only a few things that we need to track. */ @@ -475,14 +477,22 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf info->writes_position = nir->info.outputs_written & VARYING_BIT_POS; } - memset(info->output_semantic_to_slot, -1, sizeof(info->output_semantic_to_slot)); - func = (struct nir_function *)exec_list_get_head_const(&nir->functions); nir_foreach_block (block, func->impl) { nir_foreach_instr (instr, block) scan_instruction(nir, info, instr); } + if (info->stage == MESA_SHADER_VERTEX || info->stage == MESA_SHADER_TESS_EVAL) { + /* Add the PrimitiveID output, but don't increment num_outputs. + * The driver inserts PrimitiveID only when it's used by the pixel shader, + * and si_emit_spi_map uses this unconditionally when such a pixel shader is used. 
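/* The fp16_lo_hi_valid bits recorded by scan_io_usage() above, restated as
 * a sketch (hypothetical helper): bit 0 marks a valid 16-bit value in the
 * low half of the 32-bit slot, bit 1 in the high half, driven by the
 * io_semantics high_16bits flag. */
static void si_mark_fp16_sketch(uint8_t *fp16_lo_hi_valid, bool high_16bits)
{
   *fp16_lo_hi_valid |= high_16bits ? 0x2 : 0x1;
}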
+ */ + info->output_semantic[info->num_outputs] = VARYING_SLOT_PRIMITIVE_ID; + info->output_type[info->num_outputs] = nir_type_uint32; + info->output_usagemask[info->num_outputs] = 0x1; + } + if (nir->info.stage == MESA_SHADER_FRAGMENT) { info->allow_flat_shading = !(info->uses_persp_center || info->uses_persp_centroid || info->uses_persp_sample || info->uses_linear_center || @@ -496,16 +506,25 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN) || BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_HELPER_INVOCATION)); - } - /* Add color inputs to the list of inputs. */ - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - for (unsigned i = 0; i < 2; i++) { - if ((info->colors_read >> (i * 4)) & 0xf) { - info->input_semantic[info->num_inputs] = VARYING_SLOT_COL0 + i; - info->input_interpolate[info->num_inputs] = info->color_interpolate[i]; - info->input_usage_mask[info->num_inputs] = info->colors_read >> (i * 4); - info->num_inputs++; + /* Add both front and back color inputs. */ + unsigned num_inputs_with_colors = info->num_inputs; + for (unsigned back = 0; back < 2; back++) { + for (unsigned i = 0; i < 2; i++) { + if ((info->colors_read >> (i * 4)) & 0xf) { + unsigned index = num_inputs_with_colors; + + info->input[index].semantic = (back ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + i; + info->input[index].interpolate = info->color_interpolate[i]; + info->input[index].usage_mask = info->colors_read >> (i * 4); + num_inputs_with_colors++; + + /* Back-face colors don't increment num_inputs. si_emit_spi_map will use + * back-face colors conditionally only when they are needed. + */ + if (!back) + info->num_inputs = num_inputs_with_colors; + } } } } @@ -832,7 +851,6 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) .lower_subgroup_masks = true, .lower_vote_trivial = false, .lower_vote_eq = true, - .lower_elect = true, }; NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options); @@ -903,7 +921,7 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL); } -void si_finalize_nir(struct pipe_screen *screen, void *nirptr) +char *si_finalize_nir(struct pipe_screen *screen, void *nirptr) { struct si_screen *sscreen = (struct si_screen *)screen; struct nir_shader *nir = (struct nir_shader *)nirptr; @@ -914,4 +932,6 @@ void si_finalize_nir(struct pipe_screen *screen, void *nirptr) if (sscreen->options.inline_uniforms) nir_find_inlinable_uniforms(nir); + + return NULL; } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_sqtt.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_sqtt.c index 76a24b58a4..f261c68cdd 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_sqtt.c @@ -66,6 +66,14 @@ si_thread_trace_init_bo(struct si_context *sctx) return true; } +static bool +si_se_is_disabled(struct si_context* sctx, unsigned se) +{ + /* No active CU on the SE means it is disabled.
*/ + return sctx->screen->info.cu_mask[se][0] == 0; +} + + static void si_emit_thread_trace_start(struct si_context* sctx, struct radeon_cmdbuf *cs, @@ -82,8 +90,11 @@ si_emit_thread_trace_start(struct si_context* sctx, uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se); uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT; + if (si_se_is_disabled(sctx, se)) + continue; + /* Target SEx and SH0. */ - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1)); @@ -93,20 +104,20 @@ si_emit_thread_trace_start(struct si_context* sctx, if (sctx->chip_class >= GFX10) { /* Order seems important for the following 2 registers. */ - radeon_set_privileged_config_reg(cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE, + radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE, S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32)); - radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va); + radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va); int wgp = first_active_cu / 2; - radeon_set_privileged_config_reg(cs, R_008D14_SQ_THREAD_TRACE_MASK, + radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK, S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */ S_008D14_SA_SEL(0) | S_008D14_WGP_SEL(wgp) | S_008D14_SIMD_SEL(0)); - radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, + radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | V_008D18_REG_INCLUDE_GFXUDEC | @@ -116,7 +127,7 @@ si_emit_thread_trace_start(struct si_context* sctx, S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF)); /* Should be emitted last (it enables thread traces). */ - radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, + radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(1) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) | @@ -130,15 +141,15 @@ si_emit_thread_trace_start(struct si_context* sctx, sctx->chip_class >= GFX10_3 ? 4 : 0)); } else { /* Order seems important for the following 4 registers. */ - radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2, + radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2, S_030CDC_ADDR_HI(shifted_va >> 32)); - radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va); + radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va); - radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE, + radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size)); - radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, + radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1)); uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) | @@ -149,28 +160,28 @@ si_emit_thread_trace_start(struct si_context* sctx, S_030CC8_SPI_STALL_EN(1) | S_030CC8_SQ_STALL_EN(1); - radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, + radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK, thread_trace_mask); /* Trace all tokens and registers. */ - radeon_set_uconfig_reg(cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK, + radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK, S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0)); /* Enable SQTT perf counters for all CUs. 
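/* How the GFX10 path above splits the trace-buffer address, as a sketch
 * (hypothetical helper; assumes SQTT_BUFFER_ALIGN_SHIFT == 12, i.e. 4 KiB
 * alignment): the pre-shifted VA's low 32 bits land in BUF0_BASE while the
 * high bits share BUF0_SIZE with the size field. */
static void si_pack_sqtt_buf0_sketch(uint64_t data_va, uint32_t shifted_size,
                                     uint32_t *buf0_size, uint32_t *buf0_base)
{
   uint64_t shifted_va = data_va >> 12; /* SQTT_BUFFER_ALIGN_SHIFT */

   *buf0_base = (uint32_t)shifted_va;
   *buf0_size = S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32);
}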
*/ - radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK, + radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK, S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff)); - radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff); + radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff); - radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER, + radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4)); if (sctx->chip_class == GFX9) { /* Reset thread trace status errors. */ - radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS, + radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0)); } @@ -191,24 +202,24 @@ si_emit_thread_trace_start(struct si_context* sctx, thread_trace_mode |= S_030CD8_TC_PERF_EN(1); } - radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, + radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, thread_trace_mode); } } /* Restore global broadcasting. */ - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1)); /* Start the thread trace with a different event based on the queue. */ if (queue_family_index == RING_COMPUTE) { - radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, + radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1)); } else { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0)); } radeon_end(); } @@ -254,14 +265,14 @@ si_copy_thread_trace_info_regs(struct si_context* sctx, /* Copy back the info struct one DWORD at a time. */ for (unsigned i = 0; i < 3; i++) { - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | - COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | + COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM); - radeon_emit(cs, thread_trace_info_regs[i] >> 2); - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, (info_va + i * 4)); - radeon_emit(cs, (info_va + i * 4) >> 32); + radeon_emit(thread_trace_info_regs[i] >> 2); + radeon_emit(0); /* unused */ + radeon_emit((info_va + i * 4)); + radeon_emit((info_va + i * 4) >> 32); } radeon_end(); } @@ -279,61 +290,64 @@ si_emit_thread_trace_stop(struct si_context *sctx, /* Stop the thread trace with a different event based on the queue. */ if (queue_family_index == RING_COMPUTE) { - radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, + radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0)); } else { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0)); } - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0)); radeon_end(); for (unsigned se = 0; se < max_se; se++) { + if (si_se_is_disabled(sctx, se)) + continue; + radeon_begin(cs); /* Target SEi and SH0. 
*/ - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1)); if (sctx->chip_class >= GFX10) { /* Make sure to wait for the trace buffer. */ - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */ - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* reference value */ - radeon_emit(cs, S_008D20_FINISH_DONE(1)); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */ + radeon_emit(0); + radeon_emit(0); /* reference value */ + radeon_emit(S_008D20_FINISH_DONE(1)); /* mask */ + radeon_emit(4); /* poll interval */ /* Disable the thread trace mode. */ - radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, + radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0)); /* Wait for thread trace completion. */ - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */ - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* reference value */ - radeon_emit(cs, S_008D20_BUSY(1)); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */ + radeon_emit(0); + radeon_emit(0); /* reference value */ + radeon_emit(S_008D20_BUSY(1)); /* mask */ + radeon_emit(4); /* poll interval */ } else { /* Disable the thread trace mode. */ - radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, + radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0)); /* Wait for thread trace completion. */ - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */ - radeon_emit(cs, 0); - radeon_emit(cs, 0); /* reference value */ - radeon_emit(cs, S_030CE8_BUSY(1)); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */ + radeon_emit(0); + radeon_emit(0); /* reference value */ + radeon_emit(S_030CE8_BUSY(1)); /* mask */ + radeon_emit(4); /* poll interval */ } radeon_end(); @@ -342,7 +356,7 @@ si_emit_thread_trace_stop(struct si_context *sctx, /* Restore global broadcasting. 
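/* Shape of the WAIT_REG_MEM packets emitted above, pulled into a standalone
 * helper for readability (sketch with the GFX9 values; uses the new
 * cs-capturing radeon_emit convention introduced by this patch): */
static void si_sqtt_wait_idle_sketch(struct radeon_cmdbuf *cs)
{
   radeon_begin(cs);
   radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(WAIT_REG_MEM_EQUAL);                   /* compare op: reg == ref */
   radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register dword address */
   radeon_emit(0);                                    /* address hi (unused here) */
   radeon_emit(0);                                    /* reference value */
   radeon_emit(S_030CE8_BUSY(1));                     /* mask applied before compare */
   radeon_emit(4);                                    /* poll interval */
   radeon_end();
}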
*/ radeon_begin_again(cs); - radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1)); @@ -358,13 +372,13 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf switch (family) { case RING_GFX: - radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1)); - radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1)); + radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + radeon_emit(CC0_UPDATE_LOAD_ENABLES(1)); + radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1)); break; case RING_COMPUTE: - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, 0); + radeon_emit(PKT3(PKT3_NOP, 0, 0)); + radeon_emit(0); break; } radeon_end(); @@ -401,13 +415,13 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf * switch (family) { case RING_GFX: - radeon_emit(sctx->thread_trace->stop_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(sctx->thread_trace->stop_cs[family], CC0_UPDATE_LOAD_ENABLES(1)); - radeon_emit(sctx->thread_trace->stop_cs[family], CC1_UPDATE_SHADOW_ENABLES(1)); + radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + radeon_emit(CC0_UPDATE_LOAD_ENABLES(1)); + radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1)); break; case RING_COMPUTE: - radeon_emit(sctx->thread_trace->stop_cs[family], PKT3(PKT3_NOP, 0, 0)); - radeon_emit(sctx->thread_trace->stop_cs[family], 0); + radeon_emit(PKT3(PKT3_NOP, 0, 0)); + radeon_emit(0); break; } radeon_end(); @@ -721,9 +735,9 @@ si_emit_thread_trace_userdata(struct si_context* sctx, /* Without the perfctr bit the CP might not always pass the * write on correctly. */ - radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10); + radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10); - radeon_emit_array(cs, dwords, count); + radeon_emit_array(dwords, count); dwords += count; num_dwords -= count; @@ -746,10 +760,10 @@ si_emit_spi_config_cntl(struct si_context* sctx, if (sctx->chip_class >= GFX10) spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3); - radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl); + radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl); } else { /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. 
*/ - radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL, + radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL, S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable)); } @@ -913,24 +927,24 @@ si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data, static enum rgp_hardware_stages -si_sqtt_pipe_to_rgp_shader_stage(struct si_shader_key* key, enum pipe_shader_type stage) +si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type stage) { switch (stage) { case PIPE_SHADER_VERTEX: - if (key->as_ls) + if (key->ge.as_ls) return RGP_HW_STAGE_LS; - else if (key->as_es) + else if (key->ge.as_es) return RGP_HW_STAGE_ES; - else if (key->as_ngg) + else if (key->ge.as_ngg) return RGP_HW_STAGE_GS; else return RGP_HW_STAGE_VS; case PIPE_SHADER_TESS_CTRL: return RGP_HW_STAGE_HS; case PIPE_SHADER_TESS_EVAL: - if (key->as_es) + if (key->ge.as_es) return RGP_HW_STAGE_ES; - else if (key->as_ngg) + else if (key->ge.as_ngg) return RGP_HW_STAGE_GS; else return RGP_HW_STAGE_VS; @@ -1000,6 +1014,8 @@ si_sqtt_add_code_object(struct si_context* sctx, record->shader_data[i].elf_symbol_offset = 0; record->shader_data[i].hw_stage = hw_stage; record->shader_data[i].is_combined = false; + record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave; + record->shader_data[i].wavefront_size = si_get_shader_wave_size(shader); record->shader_stages_mask |= (1 << i); record->num_shaders_combined++; diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.c index db7252e9fa..951e949a96 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.c @@ -30,6 +30,7 @@ #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" #include "util/u_dual_blend.h" +#include "util/u_helpers.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" @@ -93,8 +94,8 @@ static void si_emit_cb_render_state(struct si_context *sctx) sctx->last_cb_target_mask = cb_target_mask; radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); radeon_end(); } @@ -122,7 +123,7 @@ static void si_emit_cb_render_state(struct si_context *sctx) /* RB+ register settings. */ if (sctx->screen->info.rbplus_allowed) { unsigned spi_shader_col_format = - sctx->shader.ps.cso ? sctx->shader.ps.current->key.part.ps.epilog.spi_shader_col_format + sctx->shader.ps.cso ? 
sctx->shader.ps.current->key.ps.part.epilog.spi_shader_col_format : 0; unsigned sx_ps_downconvert = 0; unsigned sx_blend_opt_epsilon = 0; @@ -636,19 +637,13 @@ static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_b return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL); } -static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, - const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws) { - struct si_context *sctx = (struct si_context *)ctx; - +static bool si_check_blend_dst_sampler_noop(struct si_context *sctx) +{ if (sctx->framebuffer.state.nr_cbufs == 1) { struct si_shader_selector *sel = sctx->shader.ps.cso; bool free_nir; if (unlikely(sel->info.writes_1_if_tex_is_1 == 0xff)) { - struct nir_shader *nir = si_get_nir_shader(sel, NULL, &free_nir); + struct nir_shader *nir = si_get_nir_shader(sel, &sctx->shader.ps.key, &free_nir); /* Determine if this fragment shader always writes vec4(1) if a specific texture * is all 1s. @@ -677,16 +672,44 @@ static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, if (tex->is_depth && tex->depth_cleared_level_mask & BITFIELD_BIT(samp->views[unit]->u.tex.first_level) && tex->depth_clear_value[0] == 1) { - return; + return false; } /* TODO: handle color textures */ } } } + return true; +} + +static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws); } +static void si_draw_vstate_blend_dst_sampler_noop(struct pipe_context *ctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) { + struct si_context *sctx = (struct si_context *)ctx; + + if (!si_check_blend_dst_sampler_noop(sctx)) + return; + + sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws); +} + static void si_bind_blend_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -709,8 +732,12 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) old_blend->alpha_to_one != blend->alpha_to_one || old_blend->dual_src_blend != blend->dual_src_blend || old_blend->blend_enable_4bit != blend->blend_enable_4bit || - old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) + old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit) { + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && (old_blend->alpha_to_coverage != blend->alpha_to_coverage || @@ -727,9 +754,10 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state) if (likely(!radeon_uses_secure_bos(sctx->ws))) { if (unlikely(blend->allows_noop_optimization)) { - si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop); + si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop, + si_draw_vstate_blend_dst_sampler_noop); } else { - si_install_draw_wrapper(sctx, NULL); + 
si_install_draw_wrapper(sctx, NULL, NULL); } } } @@ -759,8 +787,8 @@ static void si_emit_blend_color(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); - radeon_emit_array(cs, (uint32_t *)sctx->blend_color.color, 4); + radeon_set_context_reg_seq(R_028414_CB_BLEND_RED, 4); + radeon_emit_array((uint32_t *)sctx->blend_color.color, 4); radeon_end(); } @@ -793,8 +821,8 @@ static void si_emit_clip_state(struct si_context *sctx) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4); - radeon_emit_array(cs, (uint32_t *)sctx->clip_state.ucp, 6 * 4); + radeon_set_context_reg_seq(R_0285BC_PA_CL_UCP_0_X, 6 * 4); + radeon_emit_array((uint32_t *)sctx->clip_state.ucp, 6 * 4); radeon_end(); } @@ -809,7 +837,6 @@ static void si_emit_clip_regs(struct si_context *sctx) unsigned clipdist_mask = vs_sel->clipdist_mask; unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS; unsigned culldist_mask = vs_sel->culldist_mask; - unsigned vs_out_mask = (clipdist_mask & ~vs->key.opt.kill_clip_distances) | culldist_mask; /* Clip distances on points have no effect, so need to be implemented * as cull distances. This applies for the clipvertex case as well. @@ -820,23 +847,14 @@ static void si_emit_clip_regs(struct si_context *sctx) clipdist_mask &= rs->clip_plane_enable; culldist_mask |= clipdist_mask; - unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) | - S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) | - S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && + unsigned pa_cl_cntl = S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 && !sctx->screen->options.vrs2x2) | S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) | clipdist_mask | (culldist_mask << 8); radeon_begin(&sctx->gfx_cs); - - if (sctx->chip_class >= GFX10) { - radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl, - ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK); - } else { - radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, - vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl); - } + radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL, + pa_cl_cntl | vs->pa_cl_vs_out_cntl); radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space)); radeon_end_update_context_roll(sctx); @@ -902,15 +920,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast return NULL; } - if (!state->front_ccw) { - rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); - } else { - rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); - rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); - } - rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; - rs->provoking_vertex_first = state->flatshade_first; rs->scissor_enable = state->scissor; rs->clip_halfz = state->clip_halfz; rs->two_side = state->light_twoside; @@ -930,9 +939,6 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast rs->flatshade_first = state->flatshade_first; rs->sprite_coord_enable = state->sprite_coord_enable; rs->rasterizer_discard = state->rasterizer_discard; - rs->polygon_mode_enabled = - 
(state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || - (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); rs->polygon_mode_is_lines = (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) || (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK)); @@ -950,24 +956,30 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); if (rs->rasterizer_discard) { - rs->ngg_cull_flags = SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE; + rs->ngg_cull_flags = SI_NGG_CULL_ENABLED | + SI_NGG_CULL_FRONT_FACE | + SI_NGG_CULL_BACK_FACE; rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; } else { - /* Polygon mode can't use view and small primitive culling, - * because it draws points or lines where the culling depends - * on the point or line width. - */ - if (!rs->polygon_mode_enabled) { - rs->ngg_cull_flags |= SI_NGG_CULL_VIEW_SMALLPRIMS; - rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_VIEW_SMALLPRIMS; + rs->ngg_cull_flags = SI_NGG_CULL_ENABLED; + rs->ngg_cull_flags_y_inverted = rs->ngg_cull_flags; + + bool cull_front, cull_back; + + if (!state->front_ccw) { + cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + cull_front = !!(state->cull_face & PIPE_FACE_BACK); } - if (rs->cull_front) { + if (cull_front) { rs->ngg_cull_flags |= SI_NGG_CULL_FRONT_FACE; rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_BACK_FACE; } - if (rs->cull_back) { + if (cull_back) { rs->ngg_cull_flags |= SI_NGG_CULL_BACK_FACE; rs->ngg_cull_flags_y_inverted |= SI_NGG_CULL_FRONT_FACE; } @@ -1010,7 +1022,10 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028A48_VPORT_SCISSOR_ENABLE(1) | S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9)); - si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); + bool polygon_mode_enabled = + (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) || + (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK)); + si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL, S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) | S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | @@ -1019,11 +1034,11 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) | S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) | S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) | - S_028814_POLY_MODE(rs->polygon_mode_enabled) | + S_028814_POLY_MODE(polygon_mode_enabled) | S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)) | /* this must be set if POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set */ - S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? rs->polygon_mode_enabled : 0)); + S_028814_KEEP_TOGETHER_ENABLE(sscreen->info.chip_class >= GFX10 ? 
polygon_mode_enabled : 0)); if (!rs->uses_poly_offset) return rs; @@ -1059,11 +1074,12 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast } } + si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); + si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp)); si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale)); si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units)); si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale)); si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units)); - si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl); } return rs; @@ -1112,6 +1128,10 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl) si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + if (old_rs->sprite_coord_enable != rs->sprite_coord_enable || + old_rs->flatshade != rs->flatshade) + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + if (old_rs->clip_plane_enable != rs->clip_plane_enable || old_rs->rasterizer_discard != rs->rasterizer_discard || old_rs->sprite_coord_enable != rs->sprite_coord_enable || @@ -1121,8 +1141,19 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth || old_rs->clamp_fragment_color != rs->clamp_fragment_color || old_rs->force_persample_interp != rs->force_persample_interp || - old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) + old_rs->polygon_mode_is_points != rs->polygon_mode_is_points) { + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; + } + + if (old_rs->line_smooth != rs->line_smooth || + old_rs->poly_smooth != rs->poly_smooth || + old_rs->poly_stipple_enable != rs->poly_stipple_enable || + old_rs->flatshade != rs->flatshade) + si_update_vrs_flat_shading(sctx); } static void si_delete_rs_state(struct pipe_context *ctx, void *state) @@ -1147,14 +1178,15 @@ static void si_emit_stencil_ref(struct si_context *sctx) struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); - radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) | - S_028430_STENCILMASK(dsa->valuemask[0]) | - S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1)); - radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | - S_028434_STENCILMASK_BF(dsa->valuemask[1]) | - S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | - S_028434_STENCILOPVAL_BF(1)); + radeon_set_context_reg_seq(R_028430_DB_STENCILREFMASK, 2); + radeon_emit(S_028430_STENCILTESTVAL(ref->ref_value[0]) | + S_028430_STENCILMASK(dsa->valuemask[0]) | + S_028430_STENCILWRITEMASK(dsa->writemask[0]) | + S_028430_STENCILOPVAL(1)); + radeon_emit(S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) | + S_028434_STENCILMASK_BF(dsa->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) | + S_028434_STENCILOPVAL_BF(1)); radeon_end(); } @@ -1338,8 +1370,12 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state) si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref); } - if (old_dsa->alpha_func != dsa->alpha_func) + if (old_dsa->alpha_func != dsa->alpha_func) 
{ + si_ps_key_update_dsa(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); sctx->do_update_shaders = true; + } if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled || old_dsa->stencil_enabled != dsa->stencil_enabled || @@ -1514,8 +1550,8 @@ static void si_emit_db_render_state(struct si_context *sctx) /* * format translation */ -static uint32_t si_translate_colorformat(enum chip_class chip_class, - enum pipe_format format) +uint32_t si_translate_colorformat(enum chip_class chip_class, + enum pipe_format format) { const struct util_format_description *desc = util_format_description(format); if (!desc) @@ -2985,6 +3021,10 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); sctx->do_update_shaders = true; if (!sctx->decompression_enabled) { @@ -3016,7 +3056,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb = (struct si_surface *)state->cbufs[i]; if (!cb) { - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, S_028C70_FORMAT(V_028C70_COLOR_INVALID)); continue; } @@ -3095,30 +3135,30 @@ static void si_emit_framebuffer_state(struct si_context *sctx) S_028EE0_CMASK_PIPE_ALIGNED(1) | S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.color.dcc.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, 0); /* hole */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 14); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(0); /* hole */ + radeon_emit(0); /* hole */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(0); /* hole */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(0); /* hole */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); - radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, + radeon_set_context_reg(R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32); + radeon_set_context_reg(R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4, cb_color_cmask >> 32); - radeon_set_context_reg(cs, 
R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, + radeon_set_context_reg(R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4, cb_color_fmask >> 32); - radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); - radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); - radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); + radeon_set_context_reg(R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32); + radeon_set_context_reg(R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2); + radeon_set_context_reg(R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3); } else if (sctx->chip_class == GFX9) { struct gfx9_surf_meta_flags meta = { .rb_aligned = 1, @@ -3140,24 +3180,24 @@ static void si_emit_framebuffer_state(struct si_context *sctx) S_028C74_RB_ALIGNED(meta.rb_aligned) | S_028C74_PIPE_ALIGNED(meta.pipe_aligned); - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ - radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ - radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */ - radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, 15); + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */ + radeon_emit(cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_dcc_base); /* CB_COLOR0_DCC_BASE */ + radeon_emit(S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */ - radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4, + radeon_set_context_reg(R_0287A0_CB_MRT0_EPITCH + i * 4, S_0287A0_EPITCH(tex->surface.u.gfx9.epitch)); } else { /* Compute mutable surface parameters (GFX6-GFX8). 
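
The mechanical change running through these hunks is that radeon_set_context_reg*() and radeon_emit() lose their explicit cs argument: radeon_begin(cs) now opens a scope whose locals the emit macros use implicitly. A minimal stand-alone model of that macro shape, where cmdbuf and emit_begin/emit/emit_end are simplified stand-ins rather than the real Mesa definitions:

#include <cstdint>
#include <cstdio>

struct cmdbuf {
   uint32_t buf[64];
   unsigned cdw; /* dwords written so far */
};

/* Open an emit scope: cache the buffer pointer and the write index in
 * locals, so later emits need no explicit cs argument and cannot pass a
 * mismatched one. */
#define emit_begin(cs)            \
   struct cmdbuf *cs_buf = (cs); \
   unsigned cs_cdw = cs_buf->cdw

#define emit(value) (cs_buf->buf[cs_cdw++] = (value))

/* Close the scope: store the cached index back exactly once. */
#define emit_end() (cs_buf->cdw = cs_cdw)

int main()
{
   struct cmdbuf cs = {};

   emit_begin(&cs);
   emit(0xC0001000u); /* stand-in packet header */
   emit(0x12345678u); /* stand-in payload */
   emit_end();

   printf("emitted %u dwords\n", cs.cdw);
   return 0;
}

Keeping the write index in a scope-local lets it live in a register across a whole run of emits and be stored back once at emit_end(), which is presumably the point of threading it through macro locals instead of the struct.
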
*/ @@ -3201,29 +3241,29 @@ static void si_emit_framebuffer_state(struct si_context *sctx) cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max); } - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, + radeon_set_context_reg_seq(R_028C60_CB_COLOR0_BASE + i * 0x3C, sctx->chip_class >= GFX8 ? 14 : 13); - radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */ - radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */ - radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */ - radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */ - radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */ - radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */ - radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ - radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */ - radeon_emit(cs, tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ - radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */ - radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ - radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ - radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ + radeon_emit(cb_color_base); /* CB_COLOR0_BASE */ + radeon_emit(cb_color_pitch); /* CB_COLOR0_PITCH */ + radeon_emit(cb_color_slice); /* CB_COLOR0_SLICE */ + radeon_emit(cb->cb_color_view); /* CB_COLOR0_VIEW */ + radeon_emit(cb_color_info); /* CB_COLOR0_INFO */ + radeon_emit(cb_color_attrib); /* CB_COLOR0_ATTRIB */ + radeon_emit(cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */ + radeon_emit(cb_color_cmask); /* CB_COLOR0_CMASK */ + radeon_emit(tex->surface.u.legacy.color.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */ + radeon_emit(cb_color_fmask); /* CB_COLOR0_FMASK */ + radeon_emit(cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */ + radeon_emit(tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */ + radeon_emit(tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */ if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */ - radeon_emit(cs, cb_dcc_base); + radeon_emit(cb_dcc_base); } } for (; i < 8; i++) if (sctx->framebuffer.dirty_cbufs & (1 << i)) - radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + radeon_set_context_reg(R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); /* ZS buffer. 
*/ if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { @@ -3259,49 +3299,47 @@ static void si_emit_framebuffer_state(struct si_context *sctx) unsigned level = zb->base.u.tex.level; if (sctx->chip_class >= GFX10) { - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size); - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7); - radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 7); + radeon_emit(S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */ + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5); - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ + radeon_set_context_reg_seq(R_028068_DB_Z_READ_BASE_HI, 5); + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_emit(zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */ } else if (sctx->chip_class == GFX9) { - radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3); - radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ - radeon_emit(cs, - S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_set_context_reg_seq(R_028014_DB_HTILE_DATA_BASE, 3); + radeon_emit(zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */ + radeon_emit(S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* 
DB_STENCIL_READ_BASE_HI */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, - S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 10); + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028038_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */ - radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2); - radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */ - radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ + radeon_set_context_reg_seq(R_028068_DB_Z_INFO2, 2); + radeon_emit(zb->db_z_info2); /* DB_Z_INFO2 */ + radeon_emit(zb->db_stencil_info2); /* DB_STENCIL_INFO2 */ } else { /* GFX6-GFX8 */ /* Set fields dependent on tc_compatile_htile. */ @@ -3319,46 +3357,46 @@ static void si_emit_framebuffer_state(struct si_context *sctx) } } - radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); + radeon_set_context_reg(R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base); - radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9); - radeon_emit(cs, zb->db_depth_info | /* DB_DEPTH_INFO */ + radeon_set_context_reg_seq(R_02803C_DB_DEPTH_INFO, 9); + radeon_emit(zb->db_depth_info | /* DB_DEPTH_INFO */ S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile)); - radeon_emit(cs, db_z_info | /* DB_Z_INFO */ - S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); - radeon_emit(cs, db_stencil_info); /* DB_STENCIL_INFO */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ - radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */ - radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ - radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */ - radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */ + radeon_emit(db_z_info | /* DB_Z_INFO */ + S_028040_ZRANGE_PRECISION(tex->depth_clear_value[level] != 0)); + radeon_emit(db_stencil_info); /* DB_STENCIL_INFO */ + radeon_emit(zb->db_depth_base); /* DB_Z_READ_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_READ_BASE */ + radeon_emit(zb->db_depth_base); /* DB_Z_WRITE_BASE */ + radeon_emit(zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */ + radeon_emit(zb->db_depth_size); /* DB_DEPTH_SIZE */ + radeon_emit(zb->db_depth_slice); /* DB_DEPTH_SLICE */ } - radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); - radeon_emit(cs, tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ - radeon_emit(cs, fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ + radeon_set_context_reg_seq(R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(tex->stencil_clear_value[level]); /* R_028028_DB_STENCIL_CLEAR */ + 
radeon_emit(fui(tex->depth_clear_value[level])); /* R_02802C_DB_DEPTH_CLEAR */ - radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view); - radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface); + radeon_set_context_reg(R_028008_DB_DEPTH_VIEW, zb->db_depth_view); + radeon_set_context_reg(R_028ABC_DB_HTILE_SURFACE, db_htile_surface); } else if (sctx->framebuffer.dirty_zsbuf) { if (sctx->chip_class == GFX9) - radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2); + radeon_set_context_reg_seq(R_028038_DB_Z_INFO, 2); else - radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2); + radeon_set_context_reg_seq(R_028040_DB_Z_INFO, 2); - radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ - radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ + radeon_emit(S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */ + radeon_emit(S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */ } /* Framebuffer dimensions. */ /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_cs_preamble_state */ - radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR, + radeon_set_context_reg(R_028208_PA_SC_WINDOW_SCISSOR_BR, S_028208_BR_X(state->width) | S_028208_BR_Y(state->height)); if (sctx->screen->dpbb_allowed) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); } radeon_end(); @@ -3564,14 +3602,15 @@ static void si_emit_msaa_config(struct si_context *sctx) } } - /* Required by OpenGL line rasterization. + /* The DX10 diamond test is optional in GL and decreases line rasterization + * performance, so don't use it. * * TODO: We should also enable perpendicular endcaps for AA lines, * but that requires implementing line stippling in the pixel * shader. SC can only do line stippling with axis-aligned * endcaps. 
*/ - unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + unsigned sc_line_cntl = 0; unsigned sc_aa_config = 0; if (coverage_samples > 1) { @@ -3637,6 +3676,9 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) return; sctx->ps_iter_samples = min_samples; + + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); sctx->do_update_shaders = true; si_update_ps_iter_samples(sctx); @@ -4269,7 +4311,7 @@ struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx unsigned force_level) { struct si_context *sctx = (struct si_context *)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct si_sampler_view *view = CALLOC_STRUCT_CL(si_sampler_view); struct si_texture *tex = (struct si_texture *)texture; unsigned base_level, first_level, last_level; unsigned char state_swizzle[4]; @@ -4403,7 +4445,7 @@ static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sample struct si_sampler_view *view = (struct si_sampler_view *)state; pipe_resource_reference(&state->texture, NULL); - FREE(view); + FREE_CL(view); } static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter) @@ -4598,9 +4640,9 @@ static void si_emit_sample_mask(struct si_context *sctx) (mask & 1 && sctx->blitter_running)); radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); - radeon_emit(cs, mask | (mask << 16)); - radeon_emit(cs, mask | (mask << 16)); + radeon_set_context_reg_seq(R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); + radeon_emit(mask | (mask << 16)); + radeon_emit(mask | (mask << 16)); radeon_end(); } @@ -4652,8 +4694,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, v->count = count; + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs(sscreen); unsigned alloc_count = - count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0; + count > num_vbos_in_user_sgprs ? count - num_vbos_in_user_sgprs : 0; v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { @@ -4669,8 +4712,6 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, unsigned instance_divisor = elements[i].instance_divisor; if (instance_divisor) { - v->uses_instance_divisors = true; - if (instance_divisor == 1) { v->instance_divisor_is_one |= 1u << i; } else { @@ -4866,22 +4907,23 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) sctx->vertex_buffer_user_sgprs_dirty = false; } - if (old->count != v->count || - old->uses_instance_divisors != v->uses_instance_divisors || - /* we don't check which divisors changed */ - v->uses_instance_divisors || + if (old->instance_divisor_is_one != v->instance_divisor_is_one || + old->instance_divisor_is_fetched != v->instance_divisor_is_fetched || (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) & sctx->vertex_buffer_unaligned || ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) && memcmp(old->vertex_buffer_index, v->vertex_buffer_index, - sizeof(v->vertex_buffer_index[0]) * v->count)) || + sizeof(v->vertex_buffer_index[0]) * MAX2(old->count, v->count))) || /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are * functions of fix_fetch and the src_offset alignment. * If they change and fix_fetch doesn't, it must be due to different * src_offset alignment, which is reflected in fix_fetch_opencode. 
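
In the vertex-elements comparison that follows, the memcmp sizes switch from v->count to MAX2(old->count, v->count). A stand-alone sketch of why the max matters; the struct and field names are illustrative, and it assumes fixed-capacity, zero-initialized arrays, which matches how these CSOs are CALLOC'd:

#include <cstring>

#define MAX_ELEMS 16
#define MAX2(a, b) ((a) > (b) ? (a) : (b))

struct velem_state {
   unsigned count;
   unsigned char fix_fetch[MAX_ELEMS]; /* fixed capacity, zeroed at creation */
};

/* Compare up to the larger of the two counts: if the old state had more
 * trailing entries with non-default values than the new one, comparing
 * only new_st->count entries would wrongly report "unchanged". */
static bool needs_shader_update(const velem_state *old_st,
                                const velem_state *new_st)
{
   return memcmp(old_st->fix_fetch, new_st->fix_fetch,
                 sizeof(old_st->fix_fetch[0]) *
                 MAX2(old_st->count, new_st->count)) != 0;
}

int main()
{
   velem_state a = {2, {1, 7}};
   velem_state b = {1, {1}};
   return needs_shader_update(&a, &b) ? 0 : 1; /* they differ in entry 1 */
}
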
*/ old->fix_fetch_opencode != v->fix_fetch_opencode || - memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)) + memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * + MAX2(old->count, v->count))) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } if (v->instance_divisor_is_fetched) { struct pipe_constant_buffer cb; @@ -4935,7 +4977,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, si_context_add_resource_size(sctx, buf); if (buf) - si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; + si_resource(buf)->bind_history |= SI_BIND_VERTEX_BUFFER; } /* take_ownership allows us to copy pipe_resource pointers without refcounting. */ memcpy(dst, buffers, count * sizeof(struct pipe_vertex_buffer)); @@ -4955,7 +4997,7 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, si_context_add_resource_size(sctx, buf); if (buf) - si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER; + si_resource(buf)->bind_history |= SI_BIND_VERTEX_BUFFER; } } } else { @@ -4977,8 +5019,82 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, * be the case in well-behaved applications anyway. */ if ((sctx->vertex_elements->vb_alignment_check_mask & - (unaligned | orig_unaligned) & updated_mask)) + (unaligned | orig_unaligned) & updated_mask)) { + si_vs_key_update_inputs(sctx); sctx->do_update_shaders = true; + } +} + +static struct pipe_vertex_state * +si_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + struct si_vertex_state *state = CALLOC_STRUCT(si_vertex_state); + + util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, full_velem_mask, + &state->b); + + /* Initialize the vertex element state in state->element. + * Do it by creating a vertex element state object and copying it there. 
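
The comment above describes reusing the normal CSO constructor through a throwaway context. Roughly this pattern in isolation; every type and function below is a simplified stand-in, and the trick only works because the create/delete callbacks consult nothing but the screen:

#include <cstdlib>

/* Simplified stand-ins for pipe_context and the element CSO. */
struct ctx { void *screen; };
struct velems { unsigned count; unsigned fmt[8]; };

static velems *create_velems(ctx *c, unsigned count)
{
   (void)c; /* the real constructor would consult only c->screen */
   velems *v = (velems *)calloc(1, sizeof(*v));
   v->count = count;
   return v;
}

static void delete_velems(ctx *c, velems *v)
{
   (void)c;
   free(v);
}

struct vertex_state { velems elements; };

static void init_vertex_state(void *screen, vertex_state *state, unsigned count)
{
   /* Point a throwaway stack context at the screen, run the normal
    * constructor, copy the result by value, free the heap object. This
    * avoids duplicating construction logic for the context-free path. */
   ctx tmp = {};
   tmp.screen = screen;

   velems *v = create_velems(&tmp, count);
   state->elements = *v;
   delete_velems(&tmp, v);
}

int main()
{
   vertex_state s = {};
   init_vertex_state(nullptr, &s, 4);
   return s.elements.count == 4 ? 0 : 1;
}
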
+ */ + struct si_context ctx = {}; + ctx.b.screen = screen; + struct si_vertex_elements *velems = si_create_vertex_elements(&ctx.b, num_elements, elements); + state->velems = *velems; + si_delete_vertex_element(&ctx.b, velems); + + assert(!state->velems.instance_divisor_is_one); + assert(!state->velems.instance_divisor_is_fetched); + assert(!state->velems.fix_fetch_always); + assert(buffer->stride % 4 == 0); + assert(buffer->buffer_offset % 4 == 0); + assert(!buffer->is_user_buffer); + for (unsigned i = 0; i < num_elements; i++) { + assert(elements[i].src_offset % 4 == 0); + assert(!elements[i].dual_slot); + } + + for (unsigned i = 0; i < num_elements; i++) { + si_set_vertex_buffer_descriptor(sscreen, &state->velems, &state->b.input.vbuffer, i, + &state->descriptors[i * 4]); + } + + return &state->b; +} + +static void si_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + pipe_vertex_buffer_unreference(&state->input.vbuffer); + pipe_resource_reference(&state->input.indexbuf, NULL); + FREE(state); +} + +static struct pipe_vertex_state * +si_pipe_create_vertex_state(struct pipe_screen *screen, + struct pipe_vertex_buffer *buffer, + const struct pipe_vertex_element *elements, + unsigned num_elements, + struct pipe_resource *indexbuf, + uint32_t full_velem_mask) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + return util_vertex_state_cache_get(screen, buffer, elements, num_elements, indexbuf, + full_velem_mask, &sscreen->vertex_state_cache); +} + +static void si_pipe_vertex_state_destroy(struct pipe_screen *screen, + struct pipe_vertex_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + + util_vertex_state_destroy(screen, &sscreen->vertex_state_cache, state); } /* @@ -5003,6 +5119,13 @@ static void si_set_tess_state(struct pipe_context *ctx, const float default_oute si_set_internal_const_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb); } +static void si_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) +{ + struct si_context *sctx = (struct si_context *)ctx; + + sctx->patch_vertices = patch_vertices; +} + static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; @@ -5132,6 +5255,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.texture_barrier = si_texture_barrier; sctx->b.set_min_samples = si_set_min_samples; sctx->b.set_tess_state = si_set_tess_state; + sctx->b.set_patch_vertices = si_set_patch_vertices; sctx->b.set_active_query_state = si_set_active_query_state; } @@ -5139,12 +5263,17 @@ void si_init_state_functions(struct si_context *sctx) void si_init_screen_state_functions(struct si_screen *sscreen) { sscreen->b.is_format_supported = si_is_format_supported; + sscreen->b.create_vertex_state = si_pipe_create_vertex_state; + sscreen->b.vertex_state_destroy = si_pipe_vertex_state_destroy; if (sscreen->info.chip_class >= GFX10) { sscreen->make_texture_descriptor = gfx10_make_texture_descriptor; } else { sscreen->make_texture_descriptor = si_make_texture_descriptor; } + + util_vertex_state_cache_init(&sscreen->vertex_state_cache, + si_create_vertex_state, si_vertex_state_destroy); } static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value) @@ -5311,6 +5440,21 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing) si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0); } + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, 
R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); + } else if (sscreen->info.chip_class == GFX9) { + si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, + S_00B414_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, + S_00B214_MEM_BASE(sscreen->info.address32_hi >> 8)); + } else { + si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, + S_00B524_MEM_BASE(sscreen->info.address32_hi >> 8)); + } + if (sctx->chip_class >= GFX7 && sctx->chip_class <= GFX8) { si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F)); diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.h b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.h index f9d4e6713e..af2f750942 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.h +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state.h @@ -96,11 +96,6 @@ struct si_state_rasterizer { unsigned rasterizer_discard : 1; unsigned scissor_enable : 1; unsigned clip_halfz : 1; - unsigned cull_front : 1; - unsigned cull_back : 1; - unsigned depth_clamp_any : 1; - unsigned provoking_vertex_first : 1; - unsigned polygon_mode_enabled : 1; unsigned polygon_mode_is_lines : 1; unsigned polygon_mode_is_points : 1; }; @@ -174,7 +169,6 @@ struct si_vertex_elements { uint16_t vb_alignment_check_mask; uint8_t count; - bool uses_instance_divisors; uint16_t first_vb_use_mask; /* Vertex buffer descriptor list size aligned for optimal prefetch. */ @@ -189,13 +183,13 @@ union si_state { struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *poly_offset; - struct si_pm4_state *ls; - struct si_pm4_state *hs; - struct si_pm4_state *es; - struct si_pm4_state *gs; + struct si_shader *ls; + struct si_shader *hs; + struct si_shader *es; + struct si_shader *gs; struct si_pm4_state *vgt_shader_config; - struct si_pm4_state *vs; - struct si_pm4_state *ps; + struct si_shader *vs; + struct si_shader *ps; } named; struct si_pm4_state *array[sizeof(struct si_state_named) / sizeof(struct si_pm4_state *)]; }; @@ -255,12 +249,6 @@ struct si_shader_data { uint32_t sh_base[SI_NUM_SHADERS]; }; -#define SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK \ - (S_02881C_USE_VTX_POINT_SIZE(1) | S_02881C_USE_VTX_EDGE_FLAG(1) | \ - S_02881C_USE_VTX_RENDER_TARGET_INDX(1) | S_02881C_USE_VTX_VIEWPORT_INDX(1) | \ - S_02881C_VS_OUT_MISC_VEC_ENA(1) | S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(1) | \ - S_02881C_USE_VTX_VRS_RATE(1)) - /* The list of registers whose emitted values are remembered by si_context. 
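
This tracked-register list backs the radeon_opt_set_context_reg() calls seen in the earlier hunks: a context-register write is skipped when the last-emitted value is known and unchanged. A self-contained sketch of that redundancy filter, with simplified names (the real tracker keeps a saved-mask plus value array on the context):

#include <cstdint>
#include <cstdio>

enum tracked_reg {
   TRACKED_PA_CL_VS_OUT_CNTL,
   TRACKED_PA_CL_CLIP_CNTL,
   NUM_TRACKED
};

struct reg_tracker {
   uint32_t value[NUM_TRACKED];
   uint32_t valid_mask; /* bit i set: value[i] is what the GPU last saw */
};

/* Emit only when the value is unknown or changed; redundant context-
 * register writes waste command-buffer space and can roll the context. */
static void opt_set_reg(reg_tracker *t, tracked_reg reg, uint32_t value)
{
   if ((t->valid_mask & (1u << reg)) && t->value[reg] == value)
      return; /* redundant write elided */

   printf("SET_CONTEXT_REG %d = 0x%08x\n", (int)reg, value);
   t->value[reg] = value;
   t->valid_mask |= 1u << reg;
}

int main()
{
   reg_tracker t = {};
   opt_set_reg(&t, TRACKED_PA_CL_CLIP_CNTL, 0x5); /* emitted */
   opt_set_reg(&t, TRACKED_PA_CL_CLIP_CNTL, 0x5); /* skipped */
   opt_set_reg(&t, TRACKED_PA_CL_CLIP_CNTL, 0x7); /* emitted */
   return 0;
}
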
*/ enum si_tracked_reg { @@ -286,8 +274,7 @@ enum si_tracked_reg SI_TRACKED_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, - SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, /* set with SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK*/ - SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, /* set with ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK */ + SI_TRACKED_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL, SI_TRACKED_PA_SC_BINNER_CNTL_0, @@ -348,7 +335,10 @@ enum si_tracked_reg SI_TRACKED_VGT_TF_PARAM, SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + /* Non-context registers: */ SI_TRACKED_GE_PC_ALLOC, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, SI_NUM_TRACKED_REGS, }; @@ -488,14 +478,21 @@ struct si_buffer_resources { } while (0) /* si_descriptors.c */ +void si_get_inline_uniform_state(union si_shader_key *key, enum pipe_shader_type shader, + bool *inline_uniforms, uint32_t **inlined_values); void si_set_mutable_tex_desc_fields(struct si_screen *sscreen, struct si_texture *tex, const struct legacy_surf_level *base_level_info, unsigned base_level, unsigned first_level, unsigned block_width, /* restrict decreases overhead of si_set_sampler_view_desc ~8x. */ bool is_stencil, uint16_t access, uint32_t * restrict state); void si_update_ps_colorbuf0_slot(struct si_context *sctx); +void si_invalidate_inlinable_uniforms(struct si_context *sctx, enum pipe_shader_type shader); void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot, struct pipe_constant_buffer *cbuf); +void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_type shader, + unsigned start_slot, unsigned count, + const struct pipe_shader_buffer *sbuffers, + unsigned writable_bitmask, bool internal_blit); void si_get_shader_buffers(struct si_context *sctx, enum pipe_shader_type shader, uint start_slot, uint count, struct pipe_shader_buffer *sbuf); void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource *buffer, @@ -529,6 +526,7 @@ struct pb_slab *si_bindless_descriptor_slab_alloc(void *priv, unsigned heap, uns void si_bindless_descriptor_slab_free(void *priv, struct pb_slab *pslab); void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf); /* si_state.c */ +uint32_t si_translate_colorformat(enum chip_class chip_class, enum pipe_format format); void si_init_state_compute_functions(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx); void si_init_screen_state_functions(struct si_screen *sscreen); @@ -561,7 +559,8 @@ struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_ /* si_state_binning.c */ void si_emit_dpbb_state(struct si_context *sctx); -/* si_state_shaders.c */ +/* si_state_shaders.cpp */ +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key); void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, unsigned char ir_sha1_cache_key[20]); bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], @@ -569,7 +568,6 @@ bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], struct si_shader *shader, bool insert_into_disk_cache); bool si_shader_mem_ordered(struct si_shader *shader); -bool si_update_shaders(struct si_context *sctx); void si_init_screen_live_shader_cache(struct si_screen *sscreen); void si_init_shader_functions(struct si_context *sctx); bool si_init_shader_cache(struct 
si_screen *sscreen); @@ -580,21 +578,37 @@ void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage, util_queue_execute_func execute); void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const_and_shader_buffers, uint64_t *samplers_and_images); -int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_ctx_state *state, - struct si_compiler_ctx_state *compiler_state, - struct si_shader_key *key, int thread_index, bool optimized_or_none); -void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, - struct si_shader_key *key, struct si_vs_prolog_bits *prolog_key); -unsigned si_get_input_prim(const struct si_shader_selector *gs); +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state); +void si_vs_key_update_inputs(struct si_context *sctx); +void si_get_vs_key_inputs(struct si_context *sctx, union si_shader_key *key, + struct si_vs_prolog_bits *prolog_key); +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx); +void si_update_ps_kill_enable(struct si_context *sctx); +void si_update_vrs_flat_shading(struct si_context *sctx); +unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_shader_key *key); bool si_update_ngg(struct si_context *sctx); +void si_ps_key_update_framebuffer(struct si_context *sctx); +void si_ps_key_update_framebuffer_blend(struct si_context *sctx); +void si_ps_key_update_blend_rasterizer(struct si_context *sctx); +void si_ps_key_update_rasterizer(struct si_context *sctx); +void si_ps_key_update_dsa(struct si_context *sctx); +void si_ps_key_update_sample_shading(struct si_context *sctx); +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx); +void si_init_tess_factor_ring(struct si_context *sctx); +bool si_update_gs_ring_buffers(struct si_context *sctx); +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes); -/* si_state_draw.c */ +/* si_state_draw.cpp */ +void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems, + struct pipe_vertex_buffer *vb, unsigned element_index, + uint32_t *out); void si_init_draw_functions_GFX6(struct si_context *sctx); void si_init_draw_functions_GFX7(struct si_context *sctx); void si_init_draw_functions_GFX8(struct si_context *sctx); void si_init_draw_functions_GFX9(struct si_context *sctx); void si_init_draw_functions_GFX10(struct si_context *sctx); void si_init_draw_functions_GFX10_3(struct si_context *sctx); +void si_init_spi_map_functions(struct si_context *sctx); /* si_state_msaa.c */ void si_init_msaa_functions(struct si_context *sctx); diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_draw.cpp b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_draw.cpp index 4fc81ab181..896abcf9e8 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -22,8 +22,10 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include "ac_exp_param.h" #include "ac_sqtt.h" #include "si_build_pm4.h" +#include "util/u_cpu_detect.h" #include "util/u_index_modify.h" #include "util/u_prim.h" #include "util/u_upload_mgr.h" @@ -47,6 +49,320 @@ /* special primitive types */ #define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX +template +static void si_emit_spi_map(struct si_context *sctx) +{ + struct si_shader *ps = sctx->shader.ps.current; + struct si_shader_info *psinfo = ps ? 
&ps->selector->info : NULL; + unsigned spi_ps_input_cntl[NUM_INTERP]; + + STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32); + + if (!NUM_INTERP) + return; + + struct si_shader *vs = si_get_vs(sctx)->current; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + for (unsigned i = 0; i < NUM_INTERP; i++) { + union si_input_info input = psinfo->input[i]; + unsigned ps_input_cntl = vs->info.vs_output_ps_input_cntl[input.semantic]; + bool non_default_val = G_028644_OFFSET(ps_input_cntl) != 0x20; + + if (non_default_val) { + if (input.interpolate == INTERP_MODE_FLAT || + (input.interpolate == INTERP_MODE_COLOR && rs->flatshade)) + ps_input_cntl |= S_028644_FLAT_SHADE(1); + + if (input.fp16_lo_hi_valid) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */ + S_028644_ATTR1_VALID(!!(input.fp16_lo_hi_valid & 0x2)); + } + } + + if (input.semantic == VARYING_SLOT_PNTC || + (input.semantic >= VARYING_SLOT_TEX0 && input.semantic <= VARYING_SLOT_TEX7 && + rs->sprite_coord_enable & (1 << (input.semantic - VARYING_SLOT_TEX0)))) { + /* Overwrite the whole value (except OFFSET) for sprite coordinates. */ + ps_input_cntl &= ~C_028644_OFFSET; + ps_input_cntl |= S_028644_PT_SPRITE_TEX(1); + if (input.fp16_lo_hi_valid & 0x1) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); + } + } + + spi_ps_input_cntl[i] = ps_input_cntl; + } + + /* R_028644_SPI_PS_INPUT_CNTL_0 */ + /* Dota 2: Only ~16% of SPI map updates set different values. */ + /* Talos: Only ~9% of SPI map updates set different values. */ + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl, + sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP); + radeon_end_update_context_roll(sctx); +} + +template +static bool si_update_shaders(struct si_context *sctx) +{ + struct pipe_context *ctx = (struct pipe_context *)sctx; + struct si_shader *old_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current; + unsigned old_pa_cl_vs_out_cntl = old_vs ? old_vs->pa_cl_vs_out_cntl : 0; + struct si_shader *old_ps = sctx->shader.ps.current; + unsigned old_spi_shader_col_format = + old_ps ? old_ps->key.ps.part.epilog.spi_shader_col_format : 0; + int r; + + /* Update TCS and TES. 
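
si_update_shaders and the emit paths around it are now C++ function templates over GFX_VERSION, HAS_TESS, HAS_GS and NGG, so branches on those flags resolve at compile time and each pipeline shape gets its own straight-line copy. The technique in stand-alone form, reduced to two flags:

#include <cstdio>

/* Because the flags are template parameters, each instantiation drops the
 * untaken branches entirely, giving one straight-line variant per shape. */
template<bool HAS_TESS, bool HAS_GS>
static void update_shaders()
{
   if (HAS_TESS)
      puts("select + bind HS");
   if (HAS_GS)
      puts("select + bind GS");
   if (!HAS_TESS && !HAS_GS)
      puts("select + bind plain VS");
}

/* A table of instantiations indexed by runtime state, so the flags are
 * tested once per state change rather than once per draw. */
static void (*const update_table[2][2])() = {
   {update_shaders<false, false>, update_shaders<false, true>},
   {update_shaders<true, false>, update_shaders<true, true>},
};

int main()
{
   bool has_tess = true, has_gs = false; /* pretend pipeline state */
   update_table[has_tess][has_gs]();
   return 0;
}

The driver likewise installs the matching instantiation in a function pointer when state changes rather than re-testing the flags on every draw.
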
*/ + if (HAS_TESS) { + if (!sctx->tess_rings) { + si_init_tess_factor_ring(sctx); + if (!sctx->tess_rings) + return false; + } + + if (sctx->shader.tcs.cso) { + r = si_shader_select(ctx, &sctx->shader.tcs); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->shader.tcs.current); + } else { + if (!sctx->fixed_func_tcs_shader.cso) { + sctx->fixed_func_tcs_shader.cso = + (struct si_shader_selector*)si_create_fixed_func_tcs(sctx); + if (!sctx->fixed_func_tcs_shader.cso) + return false; + + sctx->fixed_func_tcs_shader.key.ge.part.tcs.epilog.invoc0_tess_factors_are_def = + sctx->fixed_func_tcs_shader.cso->info.tessfactors_are_def_in_all_invocs; + } + + r = si_shader_select(ctx, &sctx->fixed_func_tcs_shader); + if (r) + return false; + si_pm4_bind_state(sctx, hs, sctx->fixed_func_tcs_shader.current); + } + + if (!HAS_GS || GFX_VERSION <= GFX8) { + r = si_shader_select(ctx, &sctx->shader.tes); + if (r) + return false; + + if (HAS_GS) { + /* TES as ES */ + assert(GFX_VERSION <= GFX8); + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); + } else if (NGG) { + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); + } else { + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); + } + } + } else { + if (GFX_VERSION <= GFX8) { + si_pm4_bind_state(sctx, ls, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS; + } + si_pm4_bind_state(sctx, hs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS; + } + + /* Update GS. */ + if (HAS_GS) { + r = si_shader_select(ctx, &sctx->shader.gs); + if (r) + return false; + si_pm4_bind_state(sctx, gs, sctx->shader.gs.current); + if (!NGG) { + si_pm4_bind_state(sctx, vs, sctx->shader.gs.cso->gs_copy_shader); + + if (!si_update_gs_ring_buffers(sctx)) + return false; + } else { + si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + } + } else { + if (!NGG) { + si_pm4_bind_state(sctx, gs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS; + if (GFX_VERSION <= GFX8) { + si_pm4_bind_state(sctx, es, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES; + } + } + } + + /* Update VS. */ + if ((!HAS_TESS && !HAS_GS) || GFX_VERSION <= GFX8) { + r = si_shader_select(ctx, &sctx->shader.vs); + if (r) + return false; + + if (!HAS_TESS && !HAS_GS) { + if (NGG) { + si_pm4_bind_state(sctx, gs, sctx->shader.vs.current); + si_pm4_bind_state(sctx, vs, NULL); + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS; + } else { + si_pm4_bind_state(sctx, vs, sctx->shader.vs.current); + } + } else if (HAS_TESS) { + si_pm4_bind_state(sctx, ls, sctx->shader.vs.current); + } else { + assert(HAS_GS); + si_pm4_bind_state(sctx, es, sctx->shader.vs.current); + } + } + + if (GFX_VERSION >= GFX9 && HAS_TESS) + sctx->vs_uses_base_instance = sctx->queued.named.hs->uses_base_instance; + else if (GFX_VERSION >= GFX9 && HAS_GS) + sctx->vs_uses_base_instance = sctx->shader.gs.current->uses_base_instance; + else + sctx->vs_uses_base_instance = sctx->shader.vs.current->uses_base_instance; + + union si_vgt_stages_key key; + key.index = 0; + + /* Update VGT_SHADER_STAGES_EN. 
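
The lookup just below memoizes one si_pm4_state per shader-stage combination, using the raw index of a bitfield union as the cache slot. A stand-alone sketch of that get-or-create pattern; types are stand-ins and the real key carries more fields than shown:

#include <cstdlib>

union stages_key {
   struct {
      unsigned tess : 1;
      unsigned gs : 1;
      unsigned ngg : 1;
   } u;
   unsigned index; /* raw bits double as the cache slot */
};

struct pm4_state { unsigned reg_value; };

static pm4_state *cache[8]; /* one slot per possible 3-bit key */

static pm4_state *build_state(stages_key key)
{
   pm4_state *s = (pm4_state *)calloc(1, sizeof(*s));
   /* Pretend derivation of the packed register value from the key. */
   s->reg_value = (key.u.tess << 0) | (key.u.gs << 2) | (key.u.ngg << 4);
   return s;
}

static pm4_state *get_or_create(stages_key key)
{
   pm4_state **slot = &cache[key.index];
   if (!*slot) /* built at most once per key, then reused by every draw */
      *slot = build_state(key);
   return *slot;
}

int main()
{
   stages_key key = {};
   key.u.tess = 1;
   return get_or_create(key) == get_or_create(key) ? 0 : 1; /* same object */
}
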
*/ + if (HAS_TESS) + key.u.tess = 1; + if (HAS_GS) + key.u.gs = 1; + if (NGG) + key.index |= si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->ctx_reg.ngg.vgt_stages.index; + + struct si_pm4_state **pm4 = &sctx->vgt_shader_config[key.index]; + if (unlikely(!*pm4)) + *pm4 = si_build_vgt_shader_config(sctx->screen, key); + si_pm4_bind_state(sctx, vgt_shader_config, *pm4); + + if (old_pa_cl_vs_out_cntl != + si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->pa_cl_vs_out_cntl) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); + + r = si_shader_select(ctx, &sctx->shader.ps); + if (r) + return false; + si_pm4_bind_state(sctx, ps, sctx->shader.ps.current); + + if (si_pm4_state_changed(sctx, ps) || + (!NGG && si_pm4_state_changed(sctx, vs)) || + (NGG && si_pm4_state_changed(sctx, gs))) { + sctx->atoms.s.spi_map.emit = sctx->emit_spi_map[sctx->shader.ps.current->ctx_reg.ps.num_interp]; + si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map); + } + + if ((GFX_VERSION >= GFX10_3 || (GFX_VERSION >= GFX9 && sctx->screen->info.rbplus_allowed)) && + si_pm4_state_changed(sctx, ps) && + (!old_ps || old_spi_shader_col_format != + sctx->shader.ps.current->key.ps.part.epilog.spi_shader_col_format)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->smoothing_enabled != + sctx->shader.ps.current->key.ps.part.epilog.poly_line_smoothing) { + sctx->smoothing_enabled = sctx->shader.ps.current->key.ps.part.epilog.poly_line_smoothing; + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + + /* NGG cull state uses smoothing_enabled. */ + if (GFX_VERSION >= GFX10 && sctx->screen->use_ngg_culling) + si_mark_atom_dirty(sctx, &sctx->atoms.s.ngg_cull_state); + + if (GFX_VERSION == GFX6) + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + + if (sctx->framebuffer.nr_samples <= 1) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); + } + + if (unlikely(sctx->screen->debug_flags & DBG(SQTT) && sctx->thread_trace)) { + /* Pretend the bound shaders form a vk pipeline */ + uint32_t pipeline_code_hash = 0; + uint64_t base_address = ~0; + + for (int i = 0; i < SI_NUM_GRAPHICS_SHADERS; i++) { + struct si_shader *shader = sctx->shaders[i].current; + if (sctx->shaders[i].cso && shader) { + pipeline_code_hash = _mesa_hash_data_with_seed( + shader->binary.elf_buffer, + shader->binary.elf_size, + pipeline_code_hash); + base_address = MIN2(base_address, + shader->bo->gpu_address); + } + } + + struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; + if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { + si_sqtt_register_pipeline(sctx, pipeline_code_hash, base_address, false); + } + + si_sqtt_describe_pipeline_bind(sctx, pipeline_code_hash, 0); + } + + if ((GFX_VERSION <= GFX8 && + (si_pm4_state_enabled_and_changed(sctx, ls) || si_pm4_state_enabled_and_changed(sctx, es))) || + si_pm4_state_enabled_and_changed(sctx, hs) || si_pm4_state_enabled_and_changed(sctx, gs) || + si_pm4_state_enabled_and_changed(sctx, vs) || si_pm4_state_enabled_and_changed(sctx, ps)) { + unsigned scratch_size = 0; + + if (HAS_TESS) { + if (GFX_VERSION <= GFX8) /* LS */ + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->queued.named.hs->config.scratch_bytes_per_wave); + + if (HAS_GS) { + if (GFX_VERSION <= GFX8) /* ES */ + scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, 
sctx->shader.gs.current->config.scratch_bytes_per_wave); + } else { + scratch_size = MAX2(scratch_size, sctx->shader.tes.current->config.scratch_bytes_per_wave); + } + } else if (HAS_GS) { + if (GFX_VERSION <= GFX8) /* ES */ + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + + scratch_size = MAX2(scratch_size, sctx->shader.gs.current->config.scratch_bytes_per_wave); + } else { + scratch_size = MAX2(scratch_size, sctx->shader.vs.current->config.scratch_bytes_per_wave); + } + + scratch_size = MAX2(scratch_size, sctx->shader.ps.current->config.scratch_bytes_per_wave); + + if (scratch_size && !si_update_spi_tmpring_size(sctx, scratch_size)) + return false; + + if (GFX_VERSION >= GFX7) { + if (GFX_VERSION <= GFX8 && HAS_TESS && si_pm4_state_enabled_and_changed(sctx, ls)) + sctx->prefetch_L2_mask |= SI_PREFETCH_LS; + + if (HAS_TESS && si_pm4_state_enabled_and_changed(sctx, hs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_HS; + + if (GFX_VERSION <= GFX8 && HAS_GS && si_pm4_state_enabled_and_changed(sctx, es)) + sctx->prefetch_L2_mask |= SI_PREFETCH_ES; + + if ((HAS_GS || NGG) && si_pm4_state_enabled_and_changed(sctx, gs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_GS; + + if (!NGG && si_pm4_state_enabled_and_changed(sctx, vs)) + sctx->prefetch_L2_mask |= SI_PREFETCH_VS; + + if (si_pm4_state_enabled_and_changed(sctx, ps)) + sctx->prefetch_L2_mask |= SI_PREFETCH_PS; + } + } + + sctx->do_update_shaders = false; + return true; +} + ALWAYS_INLINE static unsigned si_conv_pipe_prim(unsigned mode) { @@ -71,9 +387,9 @@ static unsigned si_conv_pipe_prim(unsigned mode) return prim_conv[mode]; } -static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state) +static void si_prefetch_shader_async(struct si_context *sctx, struct si_shader *shader) { - struct pipe_resource *bo = &state->shader->bo->b.b; + struct pipe_resource *bo = &shader->bo->b.b; si_cp_dma_prefetch(sctx, bo, 0, bo->width0); } @@ -191,9 +507,7 @@ static void si_prefetch_shaders(struct si_context *sctx) * The information about LDS and other non-compile-time parameters is then * written to userdata SGPRs. */ -static void si_emit_derived_tess_state(struct si_context *sctx, - unsigned num_tcs_input_cp, - unsigned *num_patches) +static void si_emit_derived_tess_state(struct si_context *sctx, unsigned *num_patches) { struct si_shader *ls_current; struct si_shader_selector *ls; @@ -204,6 +518,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; bool has_primid_instancing_bug = sctx->chip_class == GFX6 && sctx->screen->info.max_se == 1; unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; + uint8_t num_tcs_input_cp = sctx->patch_vertices; /* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */ if (sctx->chip_class >= GFX9) { @@ -212,7 +527,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, else ls_current = sctx->fixed_func_tcs_shader.current; - ls = ls_current->key.part.tcs.ls; + ls = ls_current->key.ge.part.tcs.ls; } else { ls_current = sctx->shader.vs.current; ls = sctx->shader.vs.cso; @@ -252,7 +567,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, unsigned input_patch_size; /* Allocate LDS for TCS inputs only if it's used. 
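
The scratch-size computation a few hunks above folds the per-wave scratch demand of all bound stages with MAX2 rather than adding them. A small stand-alone sketch of the same fold; the struct is a stand-in, and max is the right combiner because scratch is sized per wave and any given wave executes only one stage:

#include <algorithm>
#include <cstdio>

struct shader_config { unsigned scratch_bytes_per_wave; };

/* The ring must satisfy the worst per-wave demand across all bound stages.
 * Sizes fold with max, not sum: scratch is allocated per wave, and a wave
 * executes exactly one stage. */
static unsigned max_scratch(const shader_config *stages[], unsigned num)
{
   unsigned scratch = 0;
   for (unsigned i = 0; i < num; i++)
      if (stages[i])
         scratch = std::max(scratch, stages[i]->scratch_bytes_per_wave);
   return scratch;
}

int main()
{
   shader_config vs = {1024}, ps = {4096};
   const shader_config *bound[] = {&vs, nullptr /* no GS bound */, &ps};
   printf("scratch bytes per wave: %u\n", max_scratch(bound, 3));
   return 0;
}
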
*/ - if (!ls_current->key.opt.same_patch_vertices || + if (!ls_current->key.ge.opt.same_patch_vertices || tcs->info.base.inputs_read & ~tcs->tcs_vgpr_only_inputs) input_patch_size = num_tcs_input_cp * input_vertex_size; else @@ -403,14 +718,14 @@ static void si_emit_derived_tess_state(struct si_context *sctx, else hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(lds_size); - radeon_set_sh_reg(cs, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2); + radeon_set_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS, hs_rsrc2); /* Set userdata SGPRs for merged LS-HS. */ radeon_set_sh_reg_seq( - cs, R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, tcs_out_offsets); - radeon_emit(cs, tcs_out_layout); + R_00B430_SPI_SHADER_USER_DATA_LS_0 + GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4, 3); + radeon_emit(offchip_layout); + radeon_emit(tcs_out_offsets); + radeon_emit(tcs_out_layout); } else { unsigned ls_rsrc2 = ls_current->config.rsrc2; @@ -420,24 +735,24 @@ static void si_emit_derived_tess_state(struct si_context *sctx, /* Due to a hw bug, RSRC2_LS must be written twice with another * LS register written in between. */ if (sctx->chip_class == GFX7 && sctx->family != CHIP_HAWAII) - radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); - radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); - radeon_emit(cs, ls_current->config.rsrc1); - radeon_emit(cs, ls_rsrc2); + radeon_set_sh_reg(R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2); + radeon_set_sh_reg_seq(R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2); + radeon_emit(ls_current->config.rsrc1); + radeon_emit(ls_rsrc2); /* Set userdata SGPRs for TCS. */ radeon_set_sh_reg_seq( - cs, R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, tcs_out_offsets); - radeon_emit(cs, tcs_out_layout); - radeon_emit(cs, tcs_in_layout); + R_00B430_SPI_SHADER_USER_DATA_HS_0 + GFX6_SGPR_TCS_OFFCHIP_LAYOUT * 4, 4); + radeon_emit(offchip_layout); + radeon_emit(tcs_out_offsets); + radeon_emit(tcs_out_layout); + radeon_emit(tcs_in_layout); } /* Set userdata SGPRs for TES. 
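
Around here the driver loads the TES user-data SGPRs: consecutive dwords written at a per-stage base that the shader later sees as preloaded registers. A toy model of that contract; all offsets and slot names below are invented for the sketch and do not match real GFX register offsets:

#include <cstdint>
#include <cstdio>

/* Toy register file: the driver writes dwords at base + slot * 4 and the
 * shader reads them back as preloaded registers. Offsets are invented. */
static uint32_t regs[0x10000];

static void set_sh_reg_seq(unsigned reg, const uint32_t *vals, unsigned n)
{
   for (unsigned i = 0; i < n; i++)
      regs[reg / 4 + i] = vals[i];
}

int main()
{
   const unsigned TES_USER_DATA_BASE = 0xB330; /* stand-in stage base */
   const unsigned SGPR_OFFCHIP_LAYOUT = 0;     /* stand-in slot indices */
   const unsigned SGPR_RING_VA = 1;

   /* Two consecutive user SGPRs, exactly like the seq write above. */
   uint32_t vals[2] = {0x00001234, 0xbeef0000};
   set_sh_reg_seq(TES_USER_DATA_BASE + SGPR_OFFCHIP_LAYOUT * 4, vals, 2);

   printf("TES sees layout=0x%x ring_va=0x%x\n",
          regs[TES_USER_DATA_BASE / 4 + SGPR_OFFCHIP_LAYOUT],
          regs[TES_USER_DATA_BASE / 4 + SGPR_RING_VA]);
   return 0;
}
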
*/ - radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); - radeon_emit(cs, offchip_layout); - radeon_emit(cs, ring_va); + radeon_set_sh_reg_seq(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2); + radeon_emit(offchip_layout); + radeon_emit(ring_va); radeon_end(); unsigned ls_hs_config = @@ -448,9 +763,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx, if (sctx->last_ls_hs_config != ls_hs_config) { radeon_begin(cs); if (sctx->chip_class >= GFX7) { - radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); + radeon_set_context_reg_idx(R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); } else { - radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); + radeon_set_context_reg(R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); } radeon_end_update_context_roll(sctx); sctx->last_ls_hs_config = ls_hs_config; @@ -631,6 +946,12 @@ static bool si_is_line_stipple_enabled(struct si_context *sctx) (rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim)); } +enum si_is_draw_vertex_state { + DRAW_VERTEX_STATE_OFF, + DRAW_VERTEX_STATE_ON, +}; + +template ALWAYS_INLINE static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned min_vertex_count, @@ -638,6 +959,9 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info * unsigned num_prims, ubyte vertices_per_patch) { + if (IS_DRAW_VERTEX_STATE) + return 0; + if (indirect) { return indirect->buffer || (instance_count > 1 && indirect->count_from_stream_output); @@ -647,12 +971,13 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info * } } -template ALWAYS_INLINE +template ALWAYS_INLINE static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned num_patches, unsigned instance_count, bool primitive_restart, - unsigned min_vertex_count, ubyte vertices_per_patch) + unsigned min_vertex_count) { union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; unsigned primgroup_size; @@ -667,12 +992,15 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, } key.u.prim = prim; - key.u.uses_instancing = (indirect && indirect->buffer) || instance_count > 1; + key.u.uses_instancing = !IS_DRAW_VERTEX_STATE && + ((indirect && indirect->buffer) || instance_count > 1); key.u.multi_instances_smaller_than_primgroup = - num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count, - primgroup_size, vertices_per_patch); - key.u.primitive_restart = primitive_restart; - key.u.count_from_stream_output = indirect && indirect->count_from_stream_output; + num_instanced_prims_less_than(indirect, prim, min_vertex_count, + instance_count, primgroup_size, + sctx->patch_vertices); + key.u.primitive_restart = !IS_DRAW_VERTEX_STATE && primitive_restart; + key.u.count_from_stream_output = !IS_DRAW_VERTEX_STATE && indirect && + indirect->count_from_stream_output; key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx); ia_multi_vgt_param = @@ -690,8 +1018,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, */ if (GFX_VERSION == GFX7 && sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) && - num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count, 2, - vertices_per_patch)) + num_instanced_prims_less_than(indirect, prim, min_vertex_count, + instance_count, 2, sctx->patch_vertices)) sctx->flags |= 
SI_CONTEXT_VGT_FLUSH; } @@ -750,7 +1078,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim); if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (NGG || HAS_GS))) { - radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); + radeon_set_context_reg(R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); sctx->last_gs_out_prim = gs_out_prim; } @@ -776,11 +1104,11 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) } } -template -ALWAYS_INLINE +template ALWAYS_INLINE static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) { - if (sctx->num_vs_blit_sgprs) { + if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) { /* Re-emit the state after we leave u_blitter. */ sctx->last_vs_state = ~0; return; @@ -798,7 +1126,7 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, PIPE_SHADER_VERTEX); radeon_begin(cs); - radeon_set_sh_reg(cs, vs_base + SI_SGPR_VS_STATE_BITS * 4, + radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); /* Set CLAMP_VERTEX_COLOR and OUTPRIM in the last stage @@ -807,13 +1135,13 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size) * For TES or the GS copy shader without NGG: */ if (vs_base != R_00B130_SPI_SHADER_USER_DATA_VS_0) { - radeon_set_sh_reg(cs, R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4, + radeon_set_sh_reg(R_00B130_SPI_SHADER_USER_DATA_VS_0 + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); } /* For NGG: */ if (GFX_VERSION >= GFX10 && vs_base != R_00B230_SPI_SHADER_USER_DATA_GS_0) { - radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4, + radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); } radeon_end(); @@ -830,32 +1158,33 @@ static bool si_prim_restart_index_changed(struct si_context *sctx, bool primitiv sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN); } -template ALWAYS_INLINE +template ALWAYS_INLINE static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned num_patches, unsigned instance_count, bool primitive_restart, - unsigned min_vertex_count, ubyte vertices_per_patch) + unsigned min_vertex_count) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; unsigned ia_multi_vgt_param; ia_multi_vgt_param = - si_get_ia_multi_vgt_param + si_get_ia_multi_vgt_param (sctx, indirect, prim, num_patches, instance_count, primitive_restart, - min_vertex_count, vertices_per_patch); + min_vertex_count); /* Draw state. 
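
The guard just below is one instance of a pattern used for nearly every register in this file: mirror the last written value in si_context and skip the SET packet when it is unchanged, since redundant register writes waste command-buffer space and context-register writes can also roll the context. The generic shape (illustrative only; the real code inlines it per register):

   if (new_value != sctx->last_multi_vgt_param) {
      radeon_begin(cs);
      /* ...one of the radeon_set_*_reg variants, chosen by GFX_VERSION... */
      radeon_end();
      sctx->last_multi_vgt_param = new_value;
   }
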
*/ if (ia_multi_vgt_param != sctx->last_multi_vgt_param) { radeon_begin(cs); if (GFX_VERSION == GFX9) - radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, + radeon_set_uconfig_reg_idx(sctx->screen, GFX_VERSION, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); else if (GFX_VERSION >= GFX7) - radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); + radeon_set_context_reg_idx(R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); else - radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + radeon_set_context_reg(R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); radeon_end(); @@ -906,28 +1235,31 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches) struct radeon_cmdbuf *cs = &sctx->gfx_cs; radeon_begin(cs); - radeon_set_uconfig_reg(cs, R_03096C_GE_CNTL, ge_cntl); + radeon_set_uconfig_reg(R_03096C_GE_CNTL, ge_cntl); radeon_end(); sctx->last_multi_vgt_param = ge_cntl; } } -template ALWAYS_INLINE +template ALWAYS_INLINE static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned num_patches, - unsigned instance_count, ubyte vertices_per_patch, - bool primitive_restart, unsigned restart_index, - unsigned min_vertex_count) + unsigned instance_count, bool primitive_restart, + unsigned restart_index, unsigned min_vertex_count) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; + if (IS_DRAW_VERTEX_STATE) + primitive_restart = false; + if (GFX_VERSION >= GFX10) gfx10_emit_ge_cntl(sctx, num_patches); else - si_emit_ia_multi_vgt_param + si_emit_ia_multi_vgt_param (sctx, indirect, prim, num_patches, instance_count, primitive_restart, - min_vertex_count, vertices_per_patch); + min_vertex_count); radeon_begin(cs); @@ -935,11 +1267,11 @@ static void si_emit_draw_registers(struct si_context *sctx, unsigned vgt_prim = si_conv_pipe_prim(prim); if (GFX_VERSION >= GFX10) - radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); + radeon_set_uconfig_reg(R_030908_VGT_PRIMITIVE_TYPE, vgt_prim); else if (GFX_VERSION >= GFX7) - radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); + radeon_set_uconfig_reg_idx(sctx->screen, GFX_VERSION, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim); else - radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim); + radeon_set_config_reg(R_008958_VGT_PRIMITIVE_TYPE, vgt_prim); sctx->last_prim = prim; } @@ -947,14 +1279,14 @@ static void si_emit_draw_registers(struct si_context *sctx, /* Primitive restart. 
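
The IS_DRAW_VERTEX_STATE guards added in this series cost nothing at runtime: the flag is a non-type template parameter, so each instantiation sees a compile-time constant and the compiler folds the branch away (as in the forced `primitive_restart = false` above). A hedged sketch of the mechanism, not driver code:

   template <si_is_draw_vertex_state IS_DRAW_VERTEX_STATE>
   static bool effective_prim_restart(bool requested)
   {
      /* Constant-folds to "return false;" in the DRAW_VERTEX_STATE_ON
       * variant and to "return requested;" in the _OFF variant. */
      return !IS_DRAW_VERTEX_STATE && requested;
   }
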
*/ if (primitive_restart != sctx->last_primitive_restart_en) { if (GFX_VERSION >= GFX9) - radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); + radeon_set_uconfig_reg(R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); else - radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); + radeon_set_context_reg(R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart); sctx->last_primitive_restart_en = primitive_restart; } if (si_prim_restart_index_changed(sctx, primitive_restart, restart_index)) { - radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, restart_index); + radeon_set_context_reg(R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, restart_index); sctx->last_restart_index = restart_index; if (GFX_VERSION == GFX9) sctx->context_roll = true; @@ -965,15 +1297,15 @@ static void si_emit_draw_registers(struct si_context *sctx, #define EMIT_SQTT_END_DRAW do { \ if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace_enabled)) { \ radeon_begin(&sctx->gfx_cs); \ - radeon_emit(&sctx->gfx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); \ - radeon_emit(&sctx->gfx_cs, \ - EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | \ + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); \ + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | \ EVENT_INDEX(0)); \ radeon_end(); \ } \ } while (0) -template +template +ALWAYS_INLINE static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, unsigned drawid_base, const struct pipe_draw_indirect_info *indirect, @@ -981,7 +1313,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw unsigned num_draws, unsigned total_count, struct pipe_resource *indexbuf, unsigned index_size, unsigned index_offset, unsigned instance_count, - bool dispatch_prim_discard_cs, unsigned original_index_size) + unsigned original_index_size) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -992,11 +1324,11 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw uint32_t use_opaque = 0; - if (indirect && indirect->count_from_stream_output) { + if (!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) { struct si_streamout_target *t = (struct si_streamout_target *)indirect->count_from_stream_output; radeon_begin(cs); - radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw); + radeon_set_context_reg(R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw); radeon_end(); si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL, @@ -1033,32 +1365,29 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } if (GFX_VERSION >= GFX9) { - radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, + radeon_set_uconfig_reg_idx(sctx->screen, GFX_VERSION, R_03090C_VGT_INDEX_TYPE, 2, index_type); } else { - radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0)); - radeon_emit(cs, index_type); + radeon_emit(PKT3(PKT3_INDEX_TYPE, 0, 0)); + radeon_emit(index_type); } sctx->last_index_size = index_size; } - /* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. */ - if (!ALLOW_PRIM_DISCARD_CS || original_index_size) { - index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size); - /* Skip draw calls with 0-sized index buffers. - * They cause a hang on some chips, like Navi10-14. 
- */ - if (!index_max_size) { - radeon_end(); - return; - } - - index_va = si_resource(indexbuf)->gpu_address + index_offset; - - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, - RADEON_PRIO_INDEX_BUFFER); + index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(index_size); + /* Skip draw calls with 0-sized index buffers. + * They cause a hang on some chips, like Navi10-14. + */ + if (!index_max_size) { + radeon_end(); + return; } + + index_va = si_resource(indexbuf)->gpu_address + index_offset; + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, + RADEON_PRIO_INDEX_BUFFER); } else { /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, * so the state must be re-emitted before the next indexed draw. @@ -1070,7 +1399,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX]; bool render_cond_bit = sctx->render_cond_enabled; - if (indirect) { + if (!IS_DRAW_VERTEX_STATE && indirect) { assert(num_draws == 1); uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address; @@ -1078,10 +1407,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw si_invalidate_draw_constants(sctx); - radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0)); - radeon_emit(cs, 1); - radeon_emit(cs, indirect_va); - radeon_emit(cs, indirect_va >> 32); + radeon_emit(PKT3(PKT3_SET_BASE, 2, 0)); + radeon_emit(1); + radeon_emit(indirect_va); + radeon_emit(indirect_va >> 32); radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indirect->buffer), RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); @@ -1091,21 +1420,21 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw assert(indirect->offset % 4 == 0); if (index_size) { - radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0)); - radeon_emit(cs, index_va); - radeon_emit(cs, index_va >> 32); + radeon_emit(PKT3(PKT3_INDEX_BASE, 1, 0)); + radeon_emit(index_va); + radeon_emit(index_va >> 32); - radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); - radeon_emit(cs, index_max_size); + radeon_emit(PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); + radeon_emit(index_max_size); } if (!sctx->screen->has_draw_indirect_multi) { - radeon_emit(cs, PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, - render_cond_bit)); - radeon_emit(cs, indirect->offset); - radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, di_src_sel); + radeon_emit(PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, + render_cond_bit)); + radeon_emit(indirect->offset); + radeon_emit((sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit((sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(di_src_sel); } else { uint64_t count_va = 0; @@ -1118,44 +1447,43 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw count_va = params_buf->gpu_address + indirect->indirect_draw_count_offset; } - radeon_emit(cs, - PKT3(index_size ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, + radeon_emit(PKT3(index_size ? 
PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, render_cond_bit)); - radeon_emit(cs, indirect->offset); - radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); - radeon_emit(cs, ((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) | - S_2C3_DRAW_INDEX_ENABLE(sctx->shader.vs.cso->info.uses_drawid) | - S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count)); - radeon_emit(cs, indirect->draw_count); - radeon_emit(cs, count_va); - radeon_emit(cs, count_va >> 32); - radeon_emit(cs, indirect->stride); - radeon_emit(cs, di_src_sel); + radeon_emit(indirect->offset); + radeon_emit((sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit((sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); + radeon_emit(((sh_base_reg + SI_SGPR_DRAWID * 4 - SI_SH_REG_OFFSET) >> 2) | + S_2C3_DRAW_INDEX_ENABLE(sctx->shader.vs.cso->info.uses_drawid) | + S_2C3_COUNT_INDIRECT_ENABLE(!!indirect->indirect_draw_count)); + radeon_emit(indirect->draw_count); + radeon_emit(count_va); + radeon_emit(count_va >> 32); + radeon_emit(indirect->stride); + radeon_emit(di_src_sel); } } else { /* Register shadowing requires that we always emit PKT3_NUM_INSTANCES. */ if (sctx->shadowed_regs || sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN || sctx->last_instance_count != instance_count) { - radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0)); - radeon_emit(cs, instance_count); + radeon_emit(PKT3(PKT3_NUM_INSTANCES, 0, 0)); + radeon_emit(instance_count); sctx->last_instance_count = instance_count; } /* Base vertex and start instance. */ int base_vertex = original_index_size ? draws[0].index_bias : draws[0].start; - bool set_draw_id = sctx->vs_uses_draw_id; + bool set_draw_id = !IS_DRAW_VERTEX_STATE && sctx->vs_uses_draw_id; bool set_base_instance = sctx->vs_uses_base_instance; - if (sctx->num_vs_blit_sgprs) { + if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) { /* Re-emit draw constants after we leave u_blitter. */ si_invalidate_draw_sh_constants(sctx); /* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. 
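
The (sh_base_reg + SI_SGPR_* * 4 - SI_SH_REG_OFFSET) >> 2 fields in the indirect packets above encode a register location as a dword index into the SH register window, which is how the CP knows which user SGPRs to overwrite with the fetched draw parameters:

   /* byte_addr = sh_base_reg + SI_SGPR_BASE_VERTEX * 4   (SGPRs are 4 bytes apart)
    * field     = (byte_addr - SI_SH_REG_OFFSET) >> 2     (window-relative, in dwords) */
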
*/ - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs); - radeon_emit_array(cs, sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs); + radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs); + radeon_emit_array(sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs); } else if (base_vertex != sctx->last_base_vertex || sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN || (set_base_instance && @@ -1166,21 +1494,21 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) || sh_base_reg != sctx->last_sh_base_reg) { if (set_base_instance) { - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3); - radeon_emit(cs, base_vertex); - radeon_emit(cs, drawid_base); - radeon_emit(cs, info->start_instance); + radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3); + radeon_emit(base_vertex); + radeon_emit(drawid_base); + radeon_emit(info->start_instance); sctx->last_start_instance = info->start_instance; sctx->last_drawid = drawid_base; } else if (set_draw_id) { - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2); - radeon_emit(cs, base_vertex); - radeon_emit(cs, drawid_base); + radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2); + radeon_emit(base_vertex); + radeon_emit(drawid_base); sctx->last_drawid = drawid_base; } else { - radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex); + radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex); } sctx->last_base_vertex = base_vertex; @@ -1188,19 +1516,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } /* Don't update draw_id in the following code if it doesn't increment. */ - bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id; + bool increment_draw_id = !IS_DRAW_VERTEX_STATE && num_draws > 1 && + set_draw_id && info->increment_draw_id; if (index_size) { - if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) { - radeon_end(); - - si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws, - original_index_size, total_count, index_va, - index_max_size); - EMIT_SQTT_END_DRAW; - return; - } - /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs * can be changed between draws, and GS fast launch must be disabled. * NOT_EOP doesn't work on gfx9 and older. 
@@ -1216,7 +1535,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw * else for (all draws); * */ - bool index_bias_varies = num_draws > 1 && info->index_bias_varies; + bool index_bias_varies = !IS_DRAW_VERTEX_STATE && num_draws > 1 && + info->index_bias_varies; if (increment_draw_id) { if (index_bias_varies) { @@ -1224,17 +1544,17 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw uint64_t va = index_va + draws[i].start * index_size; if (i > 0) { - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2); - radeon_emit(cs, draws[i].index_bias); - radeon_emit(cs, drawid_base + i); + radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2); + radeon_emit(draws[i].index_bias); + radeon_emit(drawid_base + i); } - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); - radeon_emit(cs, index_max_size); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, draws[i].count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ + radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); + radeon_emit(index_max_size); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(draws[i].count); + radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } if (num_draws > 1) { sctx->last_base_vertex = draws[num_draws - 1].index_bias; @@ -1246,33 +1566,33 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw uint64_t va = index_va + draws[i].start * index_size; if (i > 0) - radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_DRAWID * 4, drawid_base + i); + radeon_set_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, drawid_base + i); - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); - radeon_emit(cs, index_max_size); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, draws[i].count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ + radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); + radeon_emit(index_max_size); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(draws[i].count); + radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } if (num_draws > 1) sctx->last_drawid = drawid_base + num_draws - 1; } } else { - if (info->index_bias_varies) { + if (index_bias_varies) { /* Only BaseVertex varies. 
*/ for (unsigned i = 0; i < num_draws; i++) { uint64_t va = index_va + draws[i].start * index_size; if (i > 0) - radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].index_bias); + radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].index_bias); - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); - radeon_emit(cs, index_max_size); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, draws[i].count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ + radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); + radeon_emit(index_max_size); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(draws[i].count); + radeon_emit(V_0287F0_DI_SRC_SEL_DMA); /* NOT_EOP disabled */ } if (num_draws > 1) sctx->last_base_vertex = draws[num_draws - 1].index_bias; @@ -1291,72 +1611,37 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw for (unsigned i = 0; i < num_draws; i++) { uint64_t va = index_va + draws[i].start * index_size; - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); - radeon_emit(cs, index_max_size); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, draws[i].count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA | + radeon_emit(PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); + radeon_emit(index_max_size); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(draws[i].count); + radeon_emit(V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 && i < num_draws - 1)); } } } } else { - /* Set the index buffer for fast launch. The VS prolog will load the indices. */ - if (NGG && sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) { - index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size); - - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), - RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); - uint64_t base_index_va = si_resource(indexbuf)->gpu_address + index_offset; - - for (unsigned i = 0; i < num_draws; i++) { - uint64_t index_va = base_index_va + draws[i].start * original_index_size; - - radeon_set_sh_reg_seq(cs, R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS, 2); - radeon_emit(cs, index_va); - radeon_emit(cs, index_va >> 32); - - if (i > 0) { - if (increment_draw_id) { - unsigned draw_id = drawid_base + i; - - radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_DRAWID * 4, draw_id); - sctx->last_drawid = draw_id; - } - } - - /* TODO: Do index buffer bounds checking? We don't do it in this case. 
*/ - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); - radeon_emit(cs, draws[i].count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX); - } - radeon_end(); - - EMIT_SQTT_END_DRAW; - return; - } - for (unsigned i = 0; i < num_draws; i++) { if (i > 0) { if (increment_draw_id) { unsigned draw_id = drawid_base + i; - radeon_set_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2); - radeon_emit(cs, draws[i].start); - radeon_emit(cs, draw_id); + radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2); + radeon_emit(draws[i].start); + radeon_emit(draw_id); sctx->last_drawid = draw_id; } else { - radeon_set_sh_reg(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].start); + radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, draws[i].start); } } - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); - radeon_emit(cs, draws[i].count); - radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); + radeon_emit(PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); + radeon_emit(draws[i].count); + radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque); } - if (num_draws > 1 && !sctx->num_vs_blit_sgprs) + if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs)) sctx->last_base_vertex = draws[num_draws - 1].start; } } @@ -1365,19 +1650,135 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw EMIT_SQTT_END_DRAW; } -template ALWAYS_INLINE -static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) +/* Return false if not bound. */ +template +static bool ALWAYS_INLINE si_set_vb_descriptor(struct si_vertex_elements *velems, + struct pipe_vertex_buffer *vb, + unsigned index, /* vertex element index */ + uint32_t *desc) /* where to upload descriptors */ { - unsigned count = sctx->num_vertex_elements; + struct si_resource *buf = si_resource(vb->buffer.resource); + if (!buf) { + memset(desc, 0, 16); + return false; + } + + int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[index]; + + if (offset >= buf->b.b.width0) { + assert(offset < buf->b.b.width0); + memset(desc, 0, 16); + return false; + } + + uint64_t va = buf->gpu_address + offset; + + int64_t num_records = (int64_t)buf->b.b.width0 - offset; + if (GFX_VERSION != GFX8 && vb->stride) { + /* Round up by rounding down and adding 1 */ + num_records = (num_records - velems->format_size[index]) / vb->stride + 1; + } + assert(num_records >= 0 && num_records <= UINT_MAX); + + uint32_t rsrc_word3 = velems->rsrc_word3[index]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + if (GFX_VERSION >= GFX10) + rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED + : V_008F0C_OOB_SELECT_RAW); + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); + desc[2] = num_records; + desc[3] = rsrc_word3; + return true; +} + +#if GFX_VER == 6 /* declare this function only once because it supports all chips. 
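
Two bit tricks in this area are easier to follow with concrete numbers (all values below are made up for illustration). First, the "round up by rounding down and adding 1" in si_set_vb_descriptor above; second, the prefix-mask lookup that get_next_vertex_state_elem performs below:

   /* num_records rounding, with illustrative sizes:
    *    bytes left = 104, stride = 12, format_size = 8
    *    num_records = (104 - 8) / 12 + 1 = 8 + 1 = 9
    * A naive 104 / 12 would floor to 8 and lose the 9th record, which
    * starts at byte 96 and still fits the 8-byte format (96 + 8 <= 104)
    * even though a full 12-byte stride would not. */

   /* prefix-mask element lookup, with an illustrative mask:
    *    full_velem_mask = 0b1011        (semantic indices 0, 1, 3 present)
    *    semantic_index  = 3
    *    0b1011 & BITFIELD_MASK(3) = 0b1011 & 0b0111 = 0b0011
    *    popcount(0b0011) = 2
    * so the element with semantic index 3 is stored at slot 2 of the
    * pipe_vertex_state descriptor array. */
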
*/ + +void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex_elements *velems, + struct pipe_vertex_buffer *vb, unsigned element_index, + uint32_t *out) +{ + switch (sscreen->info.chip_class) { + case GFX6: + si_set_vb_descriptor(velems, vb, element_index, out); + break; + case GFX7: + si_set_vb_descriptor(velems, vb, element_index, out); + break; + case GFX8: + si_set_vb_descriptor(velems, vb, element_index, out); + break; + case GFX9: + si_set_vb_descriptor(velems, vb, element_index, out); + break; + case GFX10: + si_set_vb_descriptor(velems, vb, element_index, out); + break; + case GFX10_3: + si_set_vb_descriptor(velems, vb, element_index, out); + break; + default: + unreachable("unhandled chip class"); + } +} + +#endif + +/* util_bitcount has large measurable overhead (~2% difference in viewperf), so we use + * the POPCNT x86 instruction via inline assembly if the CPU supports it. + */ +enum si_has_popcnt { + POPCNT_NO, + POPCNT_YES, +}; + +template +unsigned bitcount_asm(unsigned n) +{ + if (POPCNT == POPCNT_YES) + return util_popcnt_inline_asm(n); + else + return util_bitcount(n); +} + +template +static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state, + uint32_t *partial_velem_mask) +{ + unsigned semantic_index = u_bit_scan(partial_velem_mask); + assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index)); + /* A prefix mask of the full mask gives us the index in pipe_vertex_state. */ + return bitcount_asm(state->input.full_velem_mask & BITFIELD_MASK(semantic_index)); +} + +template ALWAYS_INLINE +static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx, + struct pipe_vertex_state *state, + uint32_t partial_velem_mask) +{ + struct si_vertex_state *vstate = (struct si_vertex_state *)state; + unsigned count = IS_DRAW_VERTEX_STATE ? bitcount_asm(partial_velem_mask) : + sctx->num_vertex_elements; + unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, + PIPE_SHADER_VERTEX); + unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION); bool pointer_dirty, user_sgprs_dirty; assert(count <= SI_MAX_ATTRIBS); - if (sctx->vertex_buffers_dirty) { + if (sctx->vertex_buffers_dirty || IS_DRAW_VERTEX_STATE) { assert(count); struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned alloc_size = velems->vb_desc_list_alloc_size; + unsigned alloc_size = IS_DRAW_VERTEX_STATE ? + vstate->velems.vb_desc_list_alloc_size : + velems->vb_desc_list_alloc_size; uint32_t *ptr; if (alloc_size) { @@ -1404,65 +1805,63 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) si_resource_reference(&sctx->vb_descriptors_buffer, NULL); } - unsigned first_vb_use_mask = velems->first_vb_use_mask; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; + if (IS_DRAW_VERTEX_STATE) { + unsigned i = 0; - for (unsigned i = 0; i < count; i++) { - struct pipe_vertex_buffer *vb; - struct si_resource *buf; - unsigned vbo_index = velems->vertex_buffer_index[i]; - uint32_t *desc = i < num_vbos_in_user_sgprs ? 
&sctx->vb_descriptor_user_sgprs[i * 4] - : &ptr[(i - num_vbos_in_user_sgprs) * 4]; + if (num_vbos_in_user_sgprs) { + unsigned num_vb_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4; - vb = &sctx->vertex_buffer[vbo_index]; - buf = si_resource(vb->buffer.resource); - if (!buf) { - memset(desc, 0, 16); - continue; + radeon_begin(&sctx->gfx_cs); + radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_vb_sgprs); + + for (; partial_velem_mask && i < num_vbos_in_user_sgprs; i++) { + unsigned velem_index = get_next_vertex_state_elem(state, &partial_velem_mask); + + radeon_emit_array(&vstate->descriptors[velem_index * 4], 4); + } + radeon_end(); } - int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i]; + for (; partial_velem_mask; i++) { + unsigned velem_index = get_next_vertex_state_elem(state, &partial_velem_mask); + uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4]; - if (offset >= buf->b.b.width0) { - assert(offset < buf->b.b.width0); - memset(desc, 0, 16); - continue; + memcpy(desc, &vstate->descriptors[velem_index * 4], 16); } - uint64_t va = buf->gpu_address + offset; - - int64_t num_records = (int64_t)buf->b.b.width0 - offset; - if (GFX_VERSION != GFX8 && vb->stride) { - /* Round up by rounding down and adding 1 */ - num_records = (num_records - velems->format_size[i]) / vb->stride + 1; - } - assert(num_records >= 0 && num_records <= UINT_MAX); - - uint32_t rsrc_word3 = velems->rsrc_word3[i]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 1: index >= NUM_RECORDS (Structured) - * - 3: offset >= NUM_RECORDS (Raw) - */ - if (GFX_VERSION >= GFX10) - rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED - : V_008F0C_OOB_SELECT_RAW); - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); - desc[2] = num_records; - desc[3] = rsrc_word3; - - if (first_vb_use_mask & (1 << i)) { - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource), + if (vstate->b.input.vbuffer.buffer.resource != vstate->b.input.indexbuf) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, + si_resource(vstate->b.input.vbuffer.buffer.resource), RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } + + /* The next draw_vbo should recompute and rebind vertex buffer descriptors. */ + sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0; + + user_sgprs_dirty = false; /* We just set them above. */ + pointer_dirty = count > num_vbos_in_user_sgprs; + } else { + unsigned first_vb_use_mask = velems->first_vb_use_mask; + + for (unsigned i = 0; i < count; i++) { + unsigned vbo_index = velems->vertex_buffer_index[i]; + struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index]; + uint32_t *desc = i < num_vbos_in_user_sgprs ? 
&sctx->vb_descriptor_user_sgprs[i * 4] + : &ptr[(i - num_vbos_in_user_sgprs) * 4]; + + if (!si_set_vb_descriptor(velems, vb, i, desc)) + continue; + + if (first_vb_use_mask & (1 << i)) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } + } + + sctx->vertex_buffers_dirty = false; + user_sgprs_dirty = num_vbos_in_user_sgprs > 0; + pointer_dirty = alloc_size != 0; } - - sctx->vertex_buffers_dirty = false; - - pointer_dirty = alloc_size != 0; - user_sgprs_dirty = num_vbos_in_user_sgprs > 0; } else { pointer_dirty = sctx->vertex_buffer_pointer_dirty; user_sgprs_dirty = sctx->vertex_buffer_user_sgprs_dirty; @@ -1470,9 +1869,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) if (pointer_dirty || user_sgprs_dirty) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; - unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG, - PIPE_SHADER_VERTEX); assert(count); radeon_begin(cs); @@ -1488,7 +1884,7 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) sh_dw_offset = GFX9_VSGS_NUM_USER_SGPR; } - radeon_set_sh_reg(cs, sh_base + sh_dw_offset * 4, + radeon_set_sh_reg(sh_base + sh_dw_offset * 4, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset); sctx->vertex_buffer_pointer_dirty = false; @@ -1500,8 +1896,8 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx) unsigned num_sgprs = MIN2(count, num_vbos_in_user_sgprs) * 4; - radeon_set_sh_reg_seq(cs, sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_sgprs); - radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_sgprs); + radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_sgprs); + radeon_emit_array(sctx->vb_descriptor_user_sgprs, num_sgprs); sctx->vertex_buffer_user_sgprs_dirty = false; } radeon_end(); @@ -1582,7 +1978,8 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d } } -template +template ALWAYS_INLINE static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, enum pipe_prim_type prim, unsigned instance_count, @@ -1593,7 +1990,7 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i si_emit_rasterizer_prim_state(sctx); if (HAS_TESS) - si_emit_derived_tess_state(sctx, info->vertices_per_patch, &num_patches); + si_emit_derived_tess_state(sctx, &num_patches); /* Emit state atoms. */ unsigned mask = sctx->dirty_atoms & ~skip_atom_mask; @@ -1623,97 +2020,10 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i } /* Emit draw states. */ - si_emit_vs_state(sctx, info->index_size); - si_emit_draw_registers - (sctx, indirect, prim, num_patches, instance_count, info->vertices_per_patch, - primitive_restart, info->restart_index, min_vertex_count); -} - -static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf) -{ - struct radeon_winsys *ws = sctx->ws; - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - struct si_descriptors *buffers = - &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; - struct si_shader_selector *vs = sctx->shader.vs.cso; - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned num_velems = velems->count; - unsigned num_images = vs->info.base.num_images; - - /* Index buffer. 
*/
-   if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE))
-      goto has_write_reference;
-
-   /* Vertex buffers. */
-   for (unsigned i = 0; i < num_velems; i++) {
-      if (!((1 << i) & velems->first_vb_use_mask))
-         continue;
-
-      unsigned vb_index = velems->vertex_buffer_index[i];
-      struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource;
-      if (!res)
-         continue;
-
-      if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
-         goto has_write_reference;
-   }
-
-   /* Constant and shader buffers. */
-   for (unsigned i = 0; i < buffers->num_active_slots; i++) {
-      unsigned index = buffers->first_active_slot + i;
-      struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index];
-      if (!res)
-         continue;
-
-      if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
-         goto has_write_reference;
-   }
-
-   /* Samplers. */
-   if (vs->info.base.textures_used[0]) {
-      unsigned num_samplers = BITSET_LAST_BIT(vs->info.base.textures_used);
-
-      for (unsigned i = 0; i < num_samplers; i++) {
-         struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i];
-         if (!view)
-            continue;
-
-         if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE))
-            goto has_write_reference;
-      }
-   }
-
-   /* Images. */
-   if (num_images) {
-      for (unsigned i = 0; i < num_images; i++) {
-         struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource;
-         if (!res)
-            continue;
-
-         if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE))
-            goto has_write_reference;
-      }
-   }
-
-   return true;
-
-has_write_reference:
-   /* If the current gfx IB has enough packets, flush it to remove write
-    * references to buffers.
-    */
-   if (cs->prev_dw + cs->current.cdw > 2048) {
-      si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-      assert(si_all_vs_resources_read_only(sctx, indexbuf));
-      return true;
-   }
-   return false;
-}
-
-static ALWAYS_INLINE bool pd_msg(const char *s)
-{
-   if (SI_PRIM_DISCARD_DEBUG)
-      printf("PD failed: %s\n", s);
-   return false;
+   si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>(sctx, info->index_size);
+   si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
+      (sctx, indirect, prim, num_patches, instance_count, primitive_restart,
+       info->restart_index, min_vertex_count);
 }

 #define DRAW_CLEANUP do { \
@@ -1722,13 +2032,15 @@ static ALWAYS_INLINE bool pd_msg(const char *s)
 } while (0)

 template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
-          si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
-static void si_draw_vbo(struct pipe_context *ctx,
-                        const struct pipe_draw_info *info,
-                        unsigned drawid_offset,
-                        const struct pipe_draw_indirect_info *indirect,
-                        const struct pipe_draw_start_count_bias *draws,
-                        unsigned num_draws)
+          si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
+static void si_draw(struct pipe_context *ctx,
+                    const struct pipe_draw_info *info,
+                    unsigned drawid_offset,
+                    const struct pipe_draw_indirect_info *indirect,
+                    const struct pipe_draw_start_count_bias *draws,
+                    unsigned num_draws,
+                    struct pipe_vertex_state *state,
+                    uint32_t partial_velem_mask)
 {
    /* Keep code that uses the least number of local variables as close to the beginning
    * of this function as possible to minimize register pressure.
    */
@@ -1764,10 +2076,10 @@ static void si_draw_vbo(struct pipe_context *ctx,
       /* The rarely occurring tcs == NULL case is not optimized.
*/ bool same_patch_vertices = GFX_VERSION >= GFX9 && - tcs && info->vertices_per_patch == tcs->info.base.tess.tcs_vertices_out; + tcs && sctx->patch_vertices == tcs->info.base.tess.tcs_vertices_out; - if (sctx->same_patch_vertices != same_patch_vertices) { - sctx->same_patch_vertices = same_patch_vertices; + if (sctx->shader.tcs.key.ge.opt.same_patch_vertices != same_patch_vertices) { + sctx->shader.tcs.key.ge.opt.same_patch_vertices = same_patch_vertices; sctx->do_update_shaders = true; } @@ -1780,16 +2092,17 @@ static void si_draw_vbo(struct pipe_context *ctx, * function TCS. */ bool ls_vgpr_fix = - tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out; + tcs && sctx->patch_vertices > tcs->info.base.tess.tcs_vertices_out; - if (ls_vgpr_fix != sctx->ls_vgpr_fix) { - sctx->ls_vgpr_fix = ls_vgpr_fix; + if (ls_vgpr_fix != sctx->shader.tcs.key.ge.part.tcs.ls_prolog.ls_vgpr_fix) { + sctx->shader.tcs.key.ge.part.tcs.ls_prolog.ls_vgpr_fix = ls_vgpr_fix; + sctx->fixed_func_tcs_shader.key.ge.part.tcs.ls_prolog.ls_vgpr_fix = ls_vgpr_fix; sctx->do_update_shaders = true; } } } - enum pipe_prim_type prim = info->mode; + enum pipe_prim_type prim = (enum pipe_prim_type)info->mode; unsigned instance_count = info->instance_count; /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is @@ -1798,11 +2111,14 @@ static void si_draw_vbo(struct pipe_context *ctx, * 'instance_count == 0' seems to be problematic on Renoir chips (#4866), * so simplify the condition and drop these draws for all <= GFX9 chips. */ - if (GFX_VERSION <= GFX9 && unlikely(!indirect && !instance_count)) + if (GFX_VERSION <= GFX9 && unlikely(!IS_DRAW_VERTEX_STATE && !indirect && !instance_count)) return; struct si_shader_selector *vs = sctx->shader.vs.cso; - if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs || + struct si_vertex_state *vstate = (struct si_vertex_state *)state; + if (unlikely(!vs || + (!IS_DRAW_VERTEX_STATE && sctx->num_vertex_elements < vs->num_vs_inputs) || + (IS_DRAW_VERTEX_STATE && vstate->velems.count < vs->num_vs_inputs) || !sctx->shader.ps.cso || (HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) { assert(0); return; @@ -1817,8 +2133,8 @@ static void si_draw_vbo(struct pipe_context *ctx, bool gs_tri_strip_adj_fix = !HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; - if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { - sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; + if (gs_tri_strip_adj_fix != sctx->shader.gs.key.ge.mono.u.gs_tri_strip_adj_fix) { + sctx->shader.gs.key.ge.mono.u.gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; sctx->do_update_shaders = true; } } @@ -1830,7 +2146,7 @@ static void si_draw_vbo(struct pipe_context *ctx, if (index_size) { /* Translate or upload, if needed. */ /* 8-bit indices are supported on GFX8. 
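
GFX6-7 cannot fetch 8-bit indices, so the path below rewrites them into a temporary 16-bit upload before drawing. The core of that translation is a widening copy; a minimal sketch of the idea (this is not the helper the driver actually calls):

   static void widen_ubyte_indices(const uint8_t *src, uint16_t *dst, unsigned count)
   {
      /* 8-bit -> 16-bit: index values are preserved, only the storage widens. */
      for (unsigned i = 0; i < count; i++)
         dst[i] = src[i];
   }
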
*/ - if (GFX_VERSION <= GFX7 && index_size == 1) { + if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) { unsigned start, count, start_offset, size, offset; void *ptr; @@ -1849,7 +2165,7 @@ static void si_draw_vbo(struct pipe_context *ctx, /* info->start will be added by the drawing code */ index_offset = offset - start_offset; index_size = 2; - } else if (info->has_user_indices) { + } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) { unsigned start_offset; assert(!indirect); @@ -1876,7 +2192,7 @@ static void si_draw_vbo(struct pipe_context *ctx, unsigned min_direct_count = 0; unsigned total_direct_count = 0; - if (indirect) { + if (!IS_DRAW_VERTEX_STATE && indirect) { /* Add the buffer size for memory checking in need_cs_space. */ if (indirect->buffer) si_context_add_resource_size(sctx, indirect->buffer); @@ -1910,82 +2226,8 @@ static void si_draw_vbo(struct pipe_context *ctx, info->primitive_restart && (!sctx->screen->options.prim_restart_tri_strips_only || (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)); - bool dispatch_prim_discard_cs = false; - bool prim_discard_cs_instancing = false; unsigned original_index_size = index_size; - /* Determine if we can use the primitive discard compute shader. */ - /* TODO: this requires that primitives can be drawn out of order, so check depth/stencil/blend states. */ - if (ALLOW_PRIM_DISCARD_CS && - (total_direct_count > sctx->prim_discard_vertex_count_threshold - ? (sctx->compute_num_verts_rejected += total_direct_count, true) - : /* Add, then return true. */ - (sctx->compute_num_verts_ineligible += total_direct_count, - false)) && /* Add, then return false. */ - (!primitive_restart || pd_msg("primitive restart")) && - /* Supported prim types. */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)) && - /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */ - /* Instanced index_size == 0 requires that start + count < USHRT_MAX, so just reject it. */ - (instance_count == 1 || - (instance_count <= USHRT_MAX && index_size && index_size <= 2) || - pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced")) && - ((drawid_offset == 0 && (num_draws == 1 || !info->increment_draw_id)) || - !sctx->shader.vs.cso->info.uses_drawid || pd_msg("draw_id > 0")) && - (!sctx->render_cond || pd_msg("render condition")) && - /* Forced enablement ignores pipeline statistics queries. 
*/ - (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || - (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || - pd_msg("pipestat or primgen query")) && - (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && - (!sctx->shader.ps.cso->info.uses_primid || pd_msg("PS uses PrimID")) && - !rs->polygon_mode_enabled && -#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */ - (!sctx->shader.vs.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && - (!sctx->shader.vs.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && - (!sctx->shader.vs.cso->info.base.writes_memory || pd_msg("writes memory")) && - (!sctx->shader.vs.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && - !sctx->shader.vs.cso->info.base.vs.window_space_position && - !sctx->shader.vs.cso->so.num_outputs && -#else - (sctx->shader.vs.cso->prim_discard_cs_allowed || - pd_msg("VS shader uses unsupported features")) && -#endif - /* Check that all buffers are used for read only, because compute - * dispatches can run ahead. */ - (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || - pd_msg("write reference"))) { - switch (si_prepare_prim_discard_or_split_draw(sctx, info, drawid_offset, draws, num_draws, - total_direct_count)) { - case SI_PRIM_DISCARD_ENABLED: - original_index_size = index_size; - prim_discard_cs_instancing = instance_count > 1; - dispatch_prim_discard_cs = true; - - /* The compute shader changes/lowers the following: */ - prim = PIPE_PRIM_TRIANGLES; - index_size = 4; - instance_count = 1; - sctx->compute_num_verts_rejected -= total_direct_count; - sctx->compute_num_verts_accepted += total_direct_count; - break; - case SI_PRIM_DISCARD_DISABLED: - break; - case SI_PRIM_DISCARD_DRAW_SPLIT: - case SI_PRIM_DISCARD_MULTI_DRAW_SPLIT: - sctx->compute_num_verts_rejected -= total_direct_count; - /* The multi draw was split into multiple ones and executed. Return. */ - DRAW_CLEANUP; - return; - } - } - - if (ALLOW_PRIM_DISCARD_CS && - prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) { - sctx->prim_discard_cs_instancing = prim_discard_cs_instancing; - sctx->do_update_shaders = true; - } - /* Set the rasterization primitive type. * * This must be done after si_decompress_textures, which can call @@ -2012,41 +2254,55 @@ static void si_draw_vbo(struct pipe_context *ctx, } } + if (IS_DRAW_VERTEX_STATE) { + /* draw_vertex_state doesn't use the current vertex buffers and vertex elements, + * so disable any non-trivial VS prolog that is based on them, such as vertex + * format lowering. + */ + if (!sctx->force_trivial_vs_prolog) { + sctx->force_trivial_vs_prolog = true; + + /* Update shaders to disable the non-trivial VS prolog. */ + if (sctx->uses_nontrivial_vs_prolog) { + si_vs_key_update_inputs(sctx); + sctx->do_update_shaders = true; + } + } + } else { + if (sctx->force_trivial_vs_prolog) { + sctx->force_trivial_vs_prolog = false; + + /* Update shaders to enable the non-trivial VS prolog. */ + if (sctx->uses_nontrivial_vs_prolog) { + si_vs_key_update_inputs(sctx); + sctx->do_update_shaders = true; + } + } + } + /* Update NGG culling settings. 
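
The NGG culling enable condition below implements a small hysteresis; summarized as a table (a restatement of the code and its comment, no new behavior):

   /* old_ngg_culling | count > threshold | result
    * ----------------+-------------------+------------------------------
    * off             | no                | stays off
    * off             | yes               | turns on
    * on              | (not re-checked)  | stays on until the shader changes */
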
*/ uint8_t old_ngg_culling = sctx->ngg_culling; if (GFX_VERSION >= GFX10) { struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso; - if (NGG && !HAS_GS && !dispatch_prim_discard_cs && + if (NGG && !HAS_GS && /* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type - * is not triangles, so this check is only needed without tessellation. */ - (HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) && - total_direct_count > hw_vs->ngg_cull_vert_threshold) { + * is not points, so this check is only needed without tessellation. */ + (HAS_TESS || util_rast_prim_is_lines_or_triangles(sctx->current_rast_prim)) && + /* Only the first draw for a shader starts with culling disabled and it's disabled + * until we pass the total_direct_count check and then it stays enabled until + * the shader is changed. This eliminates most culling on/off state changes. */ + (old_ngg_culling || total_direct_count > hw_vs->ngg_cull_vert_threshold)) { + /* Check that the current shader allows culling. */ + assert(hw_vs->ngg_cull_vert_threshold != UINT_MAX); + uint8_t ngg_culling = sctx->viewport0_y_inverted ? rs->ngg_cull_flags_y_inverted : rs->ngg_cull_flags; + assert(ngg_culling); /* rasterizer state should always set this to non-zero */ - /* Use NGG fast launch for certain primitive types. - * A draw must have at least 1 full primitive. - * The fast launch doesn't work with tessellation. - * - * Small instances (including small draws) don't perform well with fast launch. - * It's better to use normal launch with NOT_EOP for small draws, and it's - * always better to use normal launch for small instances. - */ - if (!HAS_TESS && ngg_culling && min_direct_count >= 64 && - !(sctx->screen->debug_flags & DBG(NO_FAST_LAUNCH))) { - if (prim == PIPE_PRIM_TRIANGLES && !index_size) { - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST; - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - if (!index_size) { - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP; - } else if (!primitive_restart) { - ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP | - SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3)); - /* The index buffer will be emulated. */ - index_size = 0; - } - } + if (util_prim_is_lines(sctx->current_rast_prim)) { + /* Overwrite it to mask out face cull flags. */ + ngg_culling = SI_NGG_CULL_ENABLED | SI_NGG_CULL_LINES; } if (ngg_culling != old_ngg_culling) { @@ -2061,35 +2317,16 @@ static void si_draw_vbo(struct pipe_context *ctx, } if (unlikely(sctx->do_update_shaders)) { - if (unlikely(!si_update_shaders(sctx))) { + if (unlikely(!(si_update_shaders(sctx)))) { DRAW_CLEANUP; return; } - /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs. - * See issues #2418, #2426, #2434 - * - * This is the setting that is used by the draw. + /* si_update_shaders can clear the ngg_culling in the shader key if the shader compilation + * hasn't finished. Set it to the correct value in si_context. */ - if (GFX_VERSION >= GFX10) { - uint8_t ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.opt.ngg_culling; - if (GFX_VERSION == GFX10 && - !(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) && - ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) - sctx->flags |= SI_CONTEXT_VGT_FLUSH; - - if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) && - !(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) { - /* Need to re-set these, because we have bound an index buffer there. 
*/
-         sctx->shader_pointers_dirty |=
-            (1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |
-            (1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
-      }
-
-      /* Set this to the correct value determined by si_update_shaders. */
-      sctx->ngg_culling = ngg_culling;
-   }
+      if (GFX_VERSION >= GFX10 && NGG)
+         sctx->ngg_culling = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current->key.ge.opt.ngg_culling;
 }

 /* Since we've called si_context_add_resource_size for vertex buffers,
@@ -2121,7 +2358,7 @@
       masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
       gfx9_scissor_bug = true;

-   if ((indirect && indirect->count_from_stream_output) ||
+   if ((!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) ||
       sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
       sctx->dirty_states & si_states_that_always_roll_context())
      sctx->context_roll = true;
@@ -2130,7 +2367,7 @@
   /* Use optimal packet order based on whether we need to sync the pipeline. */
   if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
                               SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-                              SI_CONTEXT_VS_PARTIAL_FLUSH))) {
+                              SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_VGT_FLUSH))) {
      /* If we have to wait for idle, set all states first, so that all
       * SET packets are processed in parallel with previous draw calls.
       * Then draw and prefetch at the end. This ensures that the time
@@ -2140,7 +2377,7 @@
         masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);

      /* Emit all states except possibly render condition. */
-     si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
+     si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
        (sctx, info, indirect, prim, instance_count, min_direct_count, primitive_restart,
         masked_atoms);
      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
@@ -2149,7 +2386,9 @@
      /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
       * It should be done after cache flushing. */
-     if (unlikely((!si_upload_and_prefetch_VB_descriptors(sctx)))) {
+     if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG,
+                                                          IS_DRAW_VERTEX_STATE, POPCNT>
+                      (sctx, state, partial_velem_mask)))) {
        DRAW_CLEANUP;
        return;
     }
@@ -2166,10 +2405,9 @@
     }
     assert(sctx->dirty_atoms == 0);

-     si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
+     si_emit_draw_packets<GFX_VERSION, NGG, IS_DRAW_VERTEX_STATE>
       (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
-       index_size, index_offset, instance_count, dispatch_prim_discard_cs,
-       original_index_size);
+       index_size, index_offset, instance_count, original_index_size);
     /* <-- CUs are busy here. */

     /* Start prefetches after the draw has been started. Both will run
@@ -2189,12 +2427,14 @@
     /* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
      * It should be done after cache flushing and after the VS prefetch.
*/ - if (unlikely((!si_upload_and_prefetch_VB_descriptors(sctx)))) { + if (unlikely((!si_upload_and_prefetch_VB_descriptors + + (sctx, state, partial_velem_mask)))) { DRAW_CLEANUP; return; } - si_emit_all_states + si_emit_all_states (sctx, info, indirect, prim, instance_count, min_direct_count, primitive_restart, masked_atoms); @@ -2205,10 +2445,9 @@ static void si_draw_vbo(struct pipe_context *ctx, } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf, - index_size, index_offset, instance_count, dispatch_prim_discard_cs, - original_index_size); + index_size, index_offset, instance_count, original_index_size); /* Prefetch the remaining shaders after the draw has been * started. */ @@ -2237,9 +2476,9 @@ static void si_draw_vbo(struct pipe_context *ctx, if (unlikely(sctx->decompression_enabled)) { sctx->num_decompress_calls++; } else { - sctx->num_draw_calls++; + sctx->num_draw_calls += num_draws; if (primitive_restart) - sctx->num_prim_restart_calls++; + sctx->num_prim_restart_calls += num_draws; } if (!sctx->blitter_running && sctx->framebuffer.state.zsbuf) { @@ -2247,11 +2486,45 @@ static void si_draw_vbo(struct pipe_context *ctx, zstex->depth_cleared_level_mask &= ~BITFIELD_BIT(sctx->framebuffer.state.zsbuf->u.tex.level); } - /* TODO: Set displayable_dcc_dirty if image stores are used. */ - DRAW_CLEANUP; } +template +static void si_draw_vbo(struct pipe_context *ctx, + const struct pipe_draw_info *info, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + si_draw + (ctx, info, drawid_offset, indirect, draws, num_draws, NULL, 0); +} + +template +static void si_draw_vertex_state(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + struct si_vertex_state *state = (struct si_vertex_state *)vstate; + struct pipe_draw_info dinfo = {}; + + dinfo.mode = info.mode; + dinfo.index_size = 4; + dinfo.instance_count = 1; + dinfo.index.resource = state->b.input.indexbuf; + + si_draw + (ctx, &dinfo, 0, NULL, draws, num_draws, vstate, partial_velem_mask); + + if (info.take_vertex_state_ownership) + pipe_vertex_state_reference(&vstate, NULL); +} + static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso, blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2, float depth, unsigned num_instances, enum blitter_attrib_type type, @@ -2295,40 +2568,35 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1); } -template +template static void si_init_draw_vbo(struct si_context *sctx) { - /* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. 
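
si_init_draw_vbo just below fills the per-pipeline-option function tables; the consumer is outside this hunk, but the selection presumably looks like the following (hypothetical sketch; the real lookup lives in the shader-bind code, and the exact field names here are assumptions):

   /* Hypothetical consumer: index the table by the bound pipeline options. */
   sctx->b.draw_vbo = sctx->draw_vbo[sctx->shader.tes.cso != NULL]  /* HAS_TESS */
                                    [sctx->shader.gs.cso != NULL]   /* HAS_GS  */
                                    [uses_ngg];                     /* NGG, assumed flag */
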
*/ - if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8) - return; - - if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS)) - return; - if (NGG && GFX_VERSION < GFX10) return; - sctx->draw_vbo[HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] = - si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, ALLOW_PRIM_DISCARD_CS>; -} + sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] = + si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG>; - -template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> -static void si_init_draw_vbo_all_internal_options(struct si_context *sctx) -{ - si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_OFF>(sctx); - si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_ON>(sctx); - si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_OFF>(sctx); - si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_ON>(sctx); + if (util_get_cpu_caps()->has_popcnt) { + sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] = + si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_YES>; + } else { + sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] = + si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_NO>; + } } template <chip_class GFX_VERSION> static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx) { - si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_OFF>(sctx); - si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_ON>(sctx); - si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_OFF>(sctx); - si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_ON>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_OFF>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_OFF>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_OFF>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_OFF>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_OFF, NGG_ON>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_OFF, GS_ON, NGG_ON>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_OFF, NGG_ON>(sctx); + si_init_draw_vbo<GFX_VERSION, TESS_ON, GS_ON, NGG_ON>(sctx); } static void si_invalid_draw_vbo(struct pipe_context *pipe, @@ -2341,6 +2609,16 @@ static void si_invalid_draw_vbo(struct pipe_context *pipe, unsigned num_draws) { unreachable("vertex shader not bound"); } +static void si_invalid_draw_vertex_state(struct pipe_context *ctx, + struct pipe_vertex_state *vstate, + uint32_t partial_velem_mask, + struct pipe_draw_vertex_state_info info, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + unreachable("vertex shader not bound"); +} + extern "C" void GFX(si_init_draw_functions_)(struct si_context *sctx) { @@ -2352,7 +2630,53 @@ void GFX(si_init_draw_functions_)(struct si_context *sctx) * initialization of callbacks in upper layers (such as u_threaded_context). */ sctx->b.draw_vbo = si_invalid_draw_vbo; + sctx->b.draw_vertex_state = si_invalid_draw_vertex_state; sctx->blitter->draw_rectangle = si_draw_rectangle; si_init_ia_multi_vgt_param_table(sctx); } + +#if GFX_VER == 6 /* declare this function only once because it supports all chips. */ + +extern "C" +void si_init_spi_map_functions(struct si_context *sctx) +{ + /* This unrolls the loops in si_emit_spi_map and inlines memcmp and memcpys. + * It improves performance for viewperf/snx.
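+ * A rough sketch of the pattern, with the compare-and-emit body elided
+ * (si_emit_spi_map itself is defined earlier in this file):
+ *
+ *    template <int NUM_INTERP>
+ *    static void si_emit_spi_map(struct si_context *sctx)
+ *    {
+ *       for (unsigned i = 0; i < NUM_INTERP; i++) {
+ *          ...   // build SPI_PS_INPUT_CNTL_i, compare with cached value, emit
+ *       }
+ *    }
+ *
+ * Because NUM_INTERP is a template parameter, the loop bound is a
+ * compile-time constant: the loop is fully unrolled and the memcmp/memcpy
+ * calls operate on fixed sizes.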
+ */ + sctx->emit_spi_map[0] = si_emit_spi_map<0>; + sctx->emit_spi_map[1] = si_emit_spi_map<1>; + sctx->emit_spi_map[2] = si_emit_spi_map<2>; + sctx->emit_spi_map[3] = si_emit_spi_map<3>; + sctx->emit_spi_map[4] = si_emit_spi_map<4>; + sctx->emit_spi_map[5] = si_emit_spi_map<5>; + sctx->emit_spi_map[6] = si_emit_spi_map<6>; + sctx->emit_spi_map[7] = si_emit_spi_map<7>; + sctx->emit_spi_map[8] = si_emit_spi_map<8>; + sctx->emit_spi_map[9] = si_emit_spi_map<9>; + sctx->emit_spi_map[10] = si_emit_spi_map<10>; + sctx->emit_spi_map[11] = si_emit_spi_map<11>; + sctx->emit_spi_map[12] = si_emit_spi_map<12>; + sctx->emit_spi_map[13] = si_emit_spi_map<13>; + sctx->emit_spi_map[14] = si_emit_spi_map<14>; + sctx->emit_spi_map[15] = si_emit_spi_map<15>; + sctx->emit_spi_map[16] = si_emit_spi_map<16>; + sctx->emit_spi_map[17] = si_emit_spi_map<17>; + sctx->emit_spi_map[18] = si_emit_spi_map<18>; + sctx->emit_spi_map[19] = si_emit_spi_map<19>; + sctx->emit_spi_map[20] = si_emit_spi_map<20>; + sctx->emit_spi_map[21] = si_emit_spi_map<21>; + sctx->emit_spi_map[22] = si_emit_spi_map<22>; + sctx->emit_spi_map[23] = si_emit_spi_map<23>; + sctx->emit_spi_map[24] = si_emit_spi_map<24>; + sctx->emit_spi_map[25] = si_emit_spi_map<25>; + sctx->emit_spi_map[26] = si_emit_spi_map<26>; + sctx->emit_spi_map[27] = si_emit_spi_map<27>; + sctx->emit_spi_map[28] = si_emit_spi_map<28>; + sctx->emit_spi_map[29] = si_emit_spi_map<29>; + sctx->emit_spi_map[30] = si_emit_spi_map<30>; + sctx->emit_spi_map[31] = si_emit_spi_map<31>; + sctx->emit_spi_map[32] = si_emit_spi_map<32>; +} + +#endif diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_msaa.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_msaa.c index 5412a87f0a..21abcae447 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -81,8 +81,8 @@ * Right half: {1,3,5,7,9,11,13,15} */ -/* Important note: We have to use the standard DX positions, because - * the primitive discard compute shader relies on them. +/* Important note: We have to use the standard DX positions because shader-based culling + * relies on them. 
*/ /* 1x MSAA */ @@ -151,13 +151,13 @@ static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroi uint32_t sample_locs) { radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority); - radeon_emit(cs, centroid_priority >> 32); - radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs); - radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); - radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); - radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); + radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(centroid_priority); + radeon_emit(centroid_priority >> 32); + radeon_set_context_reg(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs); + radeon_set_context_reg(R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs); + radeon_set_context_reg(R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs); + radeon_set_context_reg(R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs); radeon_end(); } @@ -165,15 +165,15 @@ static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centro const uint32_t *sample_locs, unsigned num_samples) { radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); - radeon_emit(cs, centroid_priority); - radeon_emit(cs, centroid_priority >> 32); - radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, + radeon_set_context_reg_seq(R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); + radeon_emit(centroid_priority); + radeon_emit(centroid_priority >> 32); + radeon_set_context_reg_seq(R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, num_samples == 8 ? 14 : 16); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, 4); - radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4); + radeon_emit_array(sample_locs, 4); + radeon_emit_array(sample_locs, 4); + radeon_emit_array(sample_locs, 4); + radeon_emit_array(sample_locs, num_samples == 8 ? 2 : 4); radeon_end(); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_shaders.cpp new file mode 100644 index 0000000000..234f791ca2 --- /dev/null +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -0,0 +1,4109 @@ +/* + * Copyright 2012 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "ac_exp_param.h" +#include "ac_shader_util.h" +#include "compiler/nir/nir_serialize.h" +#include "nir/tgsi_to_nir.h" +#include "si_build_pm4.h" +#include "sid.h" +#include "util/crc32.h" +#include "util/disk_cache.h" +#include "util/hash_table.h" +#include "util/mesa-sha1.h" +#include "util/u_async_debug.h" +#include "util/u_memory.h" +#include "util/u_prim.h" +#include "tgsi/tgsi_from_mesa.h" + +/* SHADER_CACHE */ + +/** + * Return the IR key for the shader cache. + */ +void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, + unsigned char ir_sha1_cache_key[20]) +{ + struct blob blob = {}; + unsigned ir_size; + void *ir_binary; + + if (sel->nir_binary) { + ir_binary = sel->nir_binary; + ir_size = sel->nir_size; + } else { + assert(sel->nir); + + blob_init(&blob); + nir_serialize(&blob, sel->nir, true); + ir_binary = blob.data; + ir_size = blob.size; + } + + /* These settings affect the compilation, but they are not derived + * from the input shader IR. + */ + unsigned shader_variant_flags = 0; + + if (ngg) + shader_variant_flags |= 1 << 0; + if (sel->nir) + shader_variant_flags |= 1 << 1; + if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es) == 32) + shader_variant_flags |= 1 << 2; + if (sel->info.stage == MESA_SHADER_FRAGMENT && + /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */ + sel->info.base.fs.needs_quad_helper_invocations && + sel->info.base.fs.uses_discard && + sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) + shader_variant_flags |= 1 << 3; + /* use_ngg_culling disables NGG passthrough for non-culling shaders to reduce context + * rolls, which can be changed with AMD_DEBUG=nonggc or AMD_DEBUG=nggc. + */ + if (sel->screen->use_ngg_culling) + shader_variant_flags |= 1 << 4; + + /* bit gap */ + + if (sel->screen->options.no_infinite_interp) + shader_variant_flags |= 1 << 7; + if (sel->screen->options.clamp_div_by_zero) + shader_variant_flags |= 1 << 8; + if (sel->screen->debug_flags & DBG(GISEL)) + shader_variant_flags |= 1 << 9; + if ((sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) && + !es && + sel->screen->options.vrs2x2) + shader_variant_flags |= 1 << 10; + if (sel->screen->options.inline_uniforms) + shader_variant_flags |= 1 << 11; + + struct mesa_sha1 ctx; + _mesa_sha1_init(&ctx); + _mesa_sha1_update(&ctx, &shader_variant_flags, 4); + _mesa_sha1_update(&ctx, ir_binary, ir_size); + if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) + _mesa_sha1_update(&ctx, &sel->so, sizeof(sel->so)); + _mesa_sha1_final(&ctx, ir_sha1_cache_key); + + if (ir_binary == blob.data) + blob_finish(&blob); +} + +/** Copy "data" to "ptr" and return the next dword following copied data. */ +static uint32_t *write_data(uint32_t *ptr, const void *data, unsigned size) +{ + /* data may be NULL if size == 0 */ + if (size) + memcpy(ptr, data, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; +} + +/** Read data from "ptr". Return the next dword following the data. 
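+ * (Example: a 10-byte payload advances the pointer by
+ * DIV_ROUND_UP(10, 4) = 3 dwords, so the serialized stream stays
+ * dword-aligned no matter what sizes are written or read.)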
*/ +static uint32_t *read_data(uint32_t *ptr, void *data, unsigned size) +{ + memcpy(data, ptr, size); + ptr += DIV_ROUND_UP(size, 4); + return ptr; +} + +/** + * Write the size as uint followed by the data. Return the next dword + * following the copied data. + */ +static uint32_t *write_chunk(uint32_t *ptr, const void *data, unsigned size) +{ + *ptr++ = size; + return write_data(ptr, data, size); +} + +/** + * Read the size as uint followed by the data. Return both via parameters. + * Return the next dword following the data. + */ +static uint32_t *read_chunk(uint32_t *ptr, void **data, unsigned *size) +{ + *size = *ptr++; + assert(*data == NULL); + if (!*size) + return ptr; + *data = malloc(*size); + return read_data(ptr, *data, *size); +} + +/** + * Return the shader binary in a buffer. The first 4 bytes contain its size + * as integer. + */ +static void *si_get_shader_binary(struct si_shader *shader) +{ + /* There is always a size of data followed by the data itself. */ + unsigned llvm_ir_size = + shader->binary.llvm_ir_string ? strlen(shader->binary.llvm_ir_string) + 1 : 0; + + /* Refuse to allocate overly large buffers and guard against integer + * overflow. */ + if (shader->binary.elf_size > UINT_MAX / 4 || llvm_ir_size > UINT_MAX / 4) + return NULL; + + unsigned size = 4 + /* total size */ + 4 + /* CRC32 of the data below */ + align(sizeof(shader->config), 4) + align(sizeof(shader->info), 4) + 4 + + align(shader->binary.elf_size, 4) + 4 + align(llvm_ir_size, 4); + void *buffer = CALLOC(1, size); + uint32_t *ptr = (uint32_t *)buffer; + + if (!buffer) + return NULL; + + *ptr++ = size; + ptr++; /* CRC32 is calculated at the end. */ + + ptr = write_data(ptr, &shader->config, sizeof(shader->config)); + ptr = write_data(ptr, &shader->info, sizeof(shader->info)); + ptr = write_chunk(ptr, shader->binary.elf_buffer, shader->binary.elf_size); + ptr = write_chunk(ptr, shader->binary.llvm_ir_string, llvm_ir_size); + assert((char *)ptr - (char *)buffer == (ptrdiff_t)size); + + /* Compute CRC32. */ + ptr = (uint32_t *)buffer; + ptr++; + *ptr = util_hash_crc32(ptr + 1, size - 8); + + return buffer; +} + +static bool si_load_shader_binary(struct si_shader *shader, void *binary) +{ + uint32_t *ptr = (uint32_t *)binary; + uint32_t size = *ptr++; + uint32_t crc32 = *ptr++; + unsigned chunk_size; + unsigned elf_size; + + if (util_hash_crc32(ptr, size - 8) != crc32) { + fprintf(stderr, "radeonsi: binary shader has invalid CRC32\n"); + return false; + } + + ptr = read_data(ptr, &shader->config, sizeof(shader->config)); + ptr = read_data(ptr, &shader->info, sizeof(shader->info)); + ptr = read_chunk(ptr, (void **)&shader->binary.elf_buffer, &elf_size); + shader->binary.elf_size = elf_size; + ptr = read_chunk(ptr, (void **)&shader->binary.llvm_ir_string, &chunk_size); + + return true; +} + +/** + * Insert a shader into the cache. It's assumed the shader is not in the cache. + * Use si_shader_cache_load_shader before calling this. 
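+ * Illustrative call sequence only; compile_shader stands in for the
+ * real compilation path, and the caller is assumed to hold
+ * shader_cache_mutex:
+ *
+ *    if (!si_shader_cache_load_shader(sscreen, ir_sha1, shader)) {
+ *       compile_shader(shader);
+ *       si_shader_cache_insert_shader(sscreen, ir_sha1, shader, true);
+ *    }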
+ */ +void si_shader_cache_insert_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader, bool insert_into_disk_cache) +{ + void *hw_binary; + struct hash_entry *entry; + uint8_t key[CACHE_KEY_SIZE]; + bool memory_cache_full = sscreen->shader_cache_size >= sscreen->shader_cache_max_size; + + if (!insert_into_disk_cache && memory_cache_full) + return; + + entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); + if (entry) + return; /* already added */ + + hw_binary = si_get_shader_binary(shader); + if (!hw_binary) + return; + + if (!memory_cache_full) { + if (_mesa_hash_table_insert(sscreen->shader_cache, + mem_dup(ir_sha1_cache_key, 20), + hw_binary) == NULL) { + FREE(hw_binary); + return; + } + /* The size is stored at the start of the binary */ + sscreen->shader_cache_size += *(uint32_t*)hw_binary; + } + + if (sscreen->disk_shader_cache && insert_into_disk_cache) { + disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, key); + disk_cache_put(sscreen->disk_shader_cache, key, hw_binary, *((uint32_t *)hw_binary), NULL); + } + + if (memory_cache_full) + FREE(hw_binary); +} + +bool si_shader_cache_load_shader(struct si_screen *sscreen, unsigned char ir_sha1_cache_key[20], + struct si_shader *shader) +{ + struct hash_entry *entry = _mesa_hash_table_search(sscreen->shader_cache, ir_sha1_cache_key); + + if (entry) { + if (si_load_shader_binary(shader, entry->data)) { + p_atomic_inc(&sscreen->num_memory_shader_cache_hits); + return true; + } + } + p_atomic_inc(&sscreen->num_memory_shader_cache_misses); + + if (!sscreen->disk_shader_cache) + return false; + + unsigned char sha1[CACHE_KEY_SIZE]; + disk_cache_compute_key(sscreen->disk_shader_cache, ir_sha1_cache_key, 20, sha1); + + size_t binary_size; + uint8_t *buffer = (uint8_t*)disk_cache_get(sscreen->disk_shader_cache, sha1, &binary_size); + if (buffer) { + if (binary_size >= sizeof(uint32_t) && *((uint32_t *)buffer) == binary_size) { + if (si_load_shader_binary(shader, buffer)) { + free(buffer); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, false); + p_atomic_inc(&sscreen->num_disk_shader_cache_hits); + return true; + } + } else { + /* Something has gone wrong discard the item from the cache and + * rebuild/link from source. + */ + assert(!"Invalid radeonsi shader disk cache item!"); + disk_cache_remove(sscreen->disk_shader_cache, sha1); + } + } + + free(buffer); + p_atomic_inc(&sscreen->num_disk_shader_cache_misses); + return false; +} + +static uint32_t si_shader_cache_key_hash(const void *key) +{ + /* Take the first dword of SHA1. */ + return *(uint32_t *)key; +} + +static bool si_shader_cache_key_equals(const void *a, const void *b) +{ + /* Compare SHA1s. */ + return memcmp(a, b, 20) == 0; +} + +static void si_destroy_shader_cache_entry(struct hash_entry *entry) +{ + FREE((void *)entry->key); + FREE(entry->data); +} + +bool si_init_shader_cache(struct si_screen *sscreen) +{ + (void)simple_mtx_init(&sscreen->shader_cache_mutex, mtx_plain); + sscreen->shader_cache = + _mesa_hash_table_create(NULL, si_shader_cache_key_hash, si_shader_cache_key_equals); + sscreen->shader_cache_size = 0; + /* Maximum size: 64MB on 32 bits, 1GB else */ + sscreen->shader_cache_max_size = ((sizeof(void *) == 4) ? 
64 : 1024) * 1024 * 1024; + + return sscreen->shader_cache != NULL; +} + +void si_destroy_shader_cache(struct si_screen *sscreen) +{ + if (sscreen->shader_cache) + _mesa_hash_table_destroy(sscreen->shader_cache, si_destroy_shader_cache_entry); + simple_mtx_destroy(&sscreen->shader_cache_mutex); +} + +/* SHADER STATES */ + +bool si_shader_mem_ordered(struct si_shader *shader) +{ + if (shader->selector->screen->info.chip_class < GFX10) + return false; + + const struct si_shader_info *info = &shader->selector->info; + const struct si_shader_info *prev_info = + shader->previous_stage_sel ? &shader->previous_stage_sel->info : NULL; + + bool sampler_or_bvh = info->uses_vmem_return_type_sampler_or_bvh; + bool other = info->uses_vmem_return_type_other || + info->uses_indirect_descriptor || + shader->config.scratch_bytes_per_wave || + (info->stage == MESA_SHADER_FRAGMENT && + (info->base.fs.uses_fbfetch_output || + shader->key.ps.part.prolog.poly_stipple)); + + if (prev_info) { + sampler_or_bvh |= prev_info->uses_vmem_return_type_sampler_or_bvh; + other |= prev_info->uses_vmem_return_type_other || + prev_info->uses_indirect_descriptor; + } + + /* Return true if both types of VMEM that return something are used. */ + return sampler_or_bvh && other; +} + +static void si_set_tesseval_regs(struct si_screen *sscreen, const struct si_shader_selector *tes, + struct si_shader *shader) +{ + const struct si_shader_info *info = &tes->info; + unsigned tes_prim_mode = info->base.tess.primitive_mode; + unsigned tes_spacing = info->base.tess.spacing; + bool tes_vertex_order_cw = !info->base.tess.ccw; + bool tes_point_mode = info->base.tess.point_mode; + unsigned type, partitioning, topology, distribution_mode; + + switch (tes_prim_mode) { + case GL_LINES: + type = V_028B6C_TESS_ISOLINE; + break; + case GL_TRIANGLES: + type = V_028B6C_TESS_TRIANGLE; + break; + case GL_QUADS: + type = V_028B6C_TESS_QUAD; + break; + default: + assert(0); + return; + } + + switch (tes_spacing) { + case TESS_SPACING_FRACTIONAL_ODD: + partitioning = V_028B6C_PART_FRAC_ODD; + break; + case TESS_SPACING_FRACTIONAL_EVEN: + partitioning = V_028B6C_PART_FRAC_EVEN; + break; + case TESS_SPACING_EQUAL: + partitioning = V_028B6C_PART_INTEGER; + break; + default: + assert(0); + return; + } + + if (tes_point_mode) + topology = V_028B6C_OUTPUT_POINT; + else if (tes_prim_mode == GL_LINES) + topology = V_028B6C_OUTPUT_LINE; + else if (tes_vertex_order_cw) + /* for some reason, this must be the other way around */ + topology = V_028B6C_OUTPUT_TRIANGLE_CCW; + else + topology = V_028B6C_OUTPUT_TRIANGLE_CW; + + if (sscreen->info.has_distributed_tess) { + if (sscreen->info.family == CHIP_FIJI || sscreen->info.family >= CHIP_POLARIS10) + distribution_mode = V_028B6C_TRAPEZOIDS; + else + distribution_mode = V_028B6C_DONUTS; + } else + distribution_mode = V_028B6C_NO_DIST; + + shader->vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | + S_028B6C_TOPOLOGY(topology) | + S_028B6C_DISTRIBUTION_MODE(distribution_mode); +} + +/* Polaris needs different VTX_REUSE_DEPTH settings depending on + * whether the "fractional odd" tessellation spacing is used. 
+ * + * Possible VGT configurations and which state should set the register: + * + * Reg set in | VGT shader configuration | Value + * ------------------------------------------------------ + * VS as VS | VS | 30 + * VS as ES | ES -> GS -> VS | 30 + * TES as VS | LS -> HS -> VS | 14 or 30 + * TES as ES | LS -> HS -> ES -> GS -> VS | 14 or 30 + */ +static void polaris_set_vgt_vertex_reuse(struct si_screen *sscreen, struct si_shader_selector *sel, + struct si_shader *shader) +{ + if (sscreen->info.family < CHIP_POLARIS10 || sscreen->info.chip_class >= GFX10) + return; + + /* VS as VS, or VS as ES: */ + if ((sel->info.stage == MESA_SHADER_VERTEX && + (!shader->key.ge.as_ls && !shader->is_gs_copy_shader)) || + /* TES as VS, or TES as ES: */ + sel->info.stage == MESA_SHADER_TESS_EVAL) { + unsigned vtx_reuse_depth = 30; + + if (sel->info.stage == MESA_SHADER_TESS_EVAL && + sel->info.base.tess.spacing == TESS_SPACING_FRACTIONAL_ODD) + vtx_reuse_depth = 14; + + shader->vgt_vertex_reuse_block_cntl = vtx_reuse_depth; + } +} + +static struct si_pm4_state *si_get_shader_pm4_state(struct si_shader *shader) +{ + si_pm4_clear_state(&shader->pm4); + shader->pm4.is_shader = true; + return &shader->pm4; +} + +static unsigned si_get_num_vs_user_sgprs(struct si_shader *shader, + unsigned num_always_on_user_sgprs) +{ + struct si_shader_selector *vs = + shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; + unsigned num_vbos_in_user_sgprs = vs->num_vbos_in_user_sgprs; + + /* 1 SGPR is reserved for the vertex buffer pointer. */ + assert(num_always_on_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST - 1); + + if (num_vbos_in_user_sgprs) + return SI_SGPR_VS_VB_DESCRIPTOR_FIRST + num_vbos_in_user_sgprs * 4; + + /* Add the pointer to VBO descriptors. */ + return num_always_on_user_sgprs + 1; +} + +/* Return VGPR_COMP_CNT for the API vertex shader. This can be hw LS, LSHS, ES, ESGS, VS. 
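+ * (The returned value is the index of the last input VGPR the hardware
+ * must load: 0 loads only VGPR0, 2 loads VGPR0..VGPR2, following the
+ * per-generation layouts listed below.)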
*/ +static unsigned si_get_vs_vgpr_comp_cnt(struct si_screen *sscreen, struct si_shader *shader, + bool legacy_vs_prim_id) +{ + assert(shader->selector->info.stage == MESA_SHADER_VERTEX || + (shader->previous_stage_sel && shader->previous_stage_sel->info.stage == MESA_SHADER_VERTEX)); + + /* GFX6-9 LS (VertexID, RelAutoIndex, InstanceID / StepRate0, InstanceID) + * GFX6-9 ES,VS (VertexID, InstanceID / StepRate0, VSPrimID, InstanceID) + * GFX10 LS (VertexID, RelAutoIndex, UserVGPR1, UserVGPR2 or InstanceID) + * GFX10 ES,VS (VertexID, UserVGPR1, UserVGPR2 or VSPrimID, UserVGPR3 or InstanceID) + */ + bool is_ls = shader->selector->info.stage == MESA_SHADER_TESS_CTRL || shader->key.ge.as_ls; + unsigned max = 0; + + if (shader->info.uses_instanceid) { + if (sscreen->info.chip_class >= GFX10) + max = MAX2(max, 3); + else if (is_ls) + max = MAX2(max, 2); /* use (InstanceID / StepRate0) because StepRate0 == 1 */ + else + max = MAX2(max, 1); /* use (InstanceID / StepRate0) because StepRate0 == 1 */ + } + + if (legacy_vs_prim_id) + max = MAX2(max, 2); /* VSPrimID */ + + if (is_ls) + max = MAX2(max, 1); /* RelAutoIndex */ + + return max; +} + +static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader) +{ + struct si_pm4_state *pm4; + uint64_t va; + + assert(sscreen->info.chip_class <= GFX8); + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + va = shader->bo->gpu_address; + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + + shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B528_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B528_VGPR_COMP_CNT(si_get_vs_vgpr_comp_cnt(sscreen, shader, false)) | + S_00B528_DX10_CLAMP(1) | S_00B528_FLOAT_MODE(shader->config.float_mode); + shader->config.rsrc2 = + S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) | + S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); +} + +static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader) +{ + struct si_pm4_state *pm4; + uint64_t va; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + va = shader->bo->gpu_address; + + if (sscreen->info.chip_class >= GFX9) { + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); + } else { + si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8); + } + + unsigned num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_TCS_NUM_USER_SGPR); + + shader->config.rsrc2 = S_00B42C_USER_SGPR(num_user_sgprs) | + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) + shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else + shader->config.rsrc2 |= S_00B42C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + } else { + si_pm4_set_reg(pm4, R_00B420_SPI_SHADER_PGM_LO_HS, va >> 8); + si_pm4_set_reg(pm4, R_00B424_SPI_SHADER_PGM_HI_HS, + S_00B424_MEM_BASE(sscreen->info.address32_hi >> 8)); + + shader->config.rsrc2 = S_00B42C_USER_SGPR(GFX6_TCS_NUM_USER_SGPR) | S_00B42C_OC_LDS_EN(1) | + S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + } + + si_pm4_set_reg( + pm4, R_00B428_SPI_SHADER_PGM_RSRC1_HS, + S_00B428_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + (sscreen->info.chip_class <= GFX9 ? 
S_00B428_SGPRS((shader->config.num_sgprs - 1) / 8) + : 0) | + S_00B428_DX10_CLAMP(1) | S_00B428_MEM_ORDERED(si_shader_mem_ordered(shader)) | + S_00B428_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B428_FLOAT_MODE(shader->config.float_mode) | + S_00B428_LS_VGPR_COMP_CNT(sscreen->info.chip_class >= GFX9 + ? si_get_vs_vgpr_comp_cnt(sscreen, shader, false) + : 0)); + + if (sscreen->info.chip_class <= GFX8) { + si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2); + } +} + +static void si_emit_shader_es(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.es; + if (!shader) + return; + + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->selector->esgs_itemsize / 4); + + if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); + radeon_end_update_context_roll(sctx); +} + +static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader) +{ + struct si_pm4_state *pm4; + unsigned num_user_sgprs; + unsigned vgpr_comp_cnt; + uint64_t va; + unsigned oc_lds_en; + + assert(sscreen->info.chip_class <= GFX8); + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_es; + va = shader->bo->gpu_address; + + if (shader->selector->info.stage == MESA_SHADER_VERTEX) { + vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); + } else if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) { + vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : 2; + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + unreachable("invalid shader selector type"); + + oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0; + + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, + S_00B324_MEM_BASE(sscreen->info.address32_hi >> 8)); + si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, + S_00B328_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B328_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(1) | + S_00B328_FLOAT_MODE(shader->config.float_mode)); + si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, + S_00B32C_USER_SGPR(num_user_sgprs) | S_00B32C_OC_LDS_EN(oc_lds_en) | + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + + if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->selector, shader); + + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); +} + +void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs, + struct gfx9_gs_info *out) +{ + unsigned gs_num_invocations = MAX2(gs->info.base.gs.invocations, 1); + unsigned input_prim = gs->info.base.gs.input_primitive; + bool uses_adjacency = + input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; + + /* All these are in dwords: */ + /* We can't allow using the whole LDS, because GS waves compete with + * other shader stages for LDS space. 
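+ * (8K dwords = 32 KB, i.e. a GS subgroup may claim at most half of the
+ * 64 KB LDS, leaving the remainder for the other stages.)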
*/ + const unsigned max_lds_size = 8 * 1024; + const unsigned esgs_itemsize = es->esgs_itemsize / 4; + unsigned esgs_lds_size; + + /* All these are per subgroup: */ + const unsigned max_out_prims = 32 * 1024; + const unsigned max_es_verts = 255; + const unsigned ideal_gs_prims = 64; + unsigned max_gs_prims, gs_prims; + unsigned min_es_verts, es_verts, worst_case_es_verts; + + if (uses_adjacency || gs_num_invocations > 1) + max_gs_prims = 127 / gs_num_invocations; + else + max_gs_prims = 255; + + /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations. + * Make sure we don't go over the maximum value. + */ + if (gs->info.base.gs.vertices_out > 0) { + max_gs_prims = + MIN2(max_gs_prims, max_out_prims / (gs->info.base.gs.vertices_out * gs_num_invocations)); + } + assert(max_gs_prims > 0); + + /* If the primitive has adjacency, halve the number of vertices + * that will be reused in multiple primitives. + */ + min_es_verts = gs->gs_input_verts_per_prim / (uses_adjacency ? 2 : 1); + + gs_prims = MIN2(ideal_gs_prims, max_gs_prims); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + /* Compute ESGS LDS size based on the worst case number of ES vertices + * needed to create the target number of GS prims per subgroup. + */ + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + + /* If total LDS usage is too big, refactor partitions based on ratio + * of ESGS item sizes. + */ + if (esgs_lds_size > max_lds_size) { + /* Our target GS Prims Per Subgroup was too large. Calculate + * the maximum number of GS Prims Per Subgroup that will fit + * into LDS, capped by the maximum that the hardware can support. + */ + gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims); + assert(gs_prims > 0); + worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts); + + esgs_lds_size = esgs_itemsize * worst_case_es_verts; + assert(esgs_lds_size <= max_lds_size); + } + + /* Now calculate remaining ESGS information. */ + if (esgs_lds_size) + es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts); + else + es_verts = max_es_verts; + + /* Vertices for adjacency primitives are not always reused, so restore + * it for ES_VERTS_PER_SUBGRP. + */ + min_es_verts = gs->gs_input_verts_per_prim; + + /* For normal primitives, the VGT only checks if they are past the ES + * verts per subgroup after allocating a full GS primitive and if they + * are, kick off a new subgroup. But if those additional ES verts are + * unique (e.g. not reused) we need to make sure there is enough LDS + * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. 
+ */ + es_verts -= min_es_verts - 1; + + out->es_verts_per_subgroup = es_verts; + out->gs_prims_per_subgroup = gs_prims; + out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; + out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->info.base.gs.vertices_out; + out->esgs_ring_size = esgs_lds_size; + + assert(out->max_prims_per_subgroup <= max_out_prims); +} + +static void si_emit_shader_gs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs; + if (!shader) + return; + + radeon_begin(&sctx->gfx_cs); + + /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 + * R_028A68_VGT_GSVS_RING_OFFSET_3 */ + radeon_opt_set_context_reg3( + sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, SI_TRACKED_VGT_GSVS_RING_OFFSET_1, + shader->ctx_reg.gs.vgt_gsvs_ring_offset_1, shader->ctx_reg.gs.vgt_gsvs_ring_offset_2, + shader->ctx_reg.gs.vgt_gsvs_ring_offset_3); + + /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */ + radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE, + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, + shader->ctx_reg.gs.vgt_gsvs_ring_itemsize); + + /* R_028B38_VGT_GS_MAX_VERT_OUT */ + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.gs.vgt_gs_max_vert_out); + + /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1 + * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */ + radeon_opt_set_context_reg4( + sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, SI_TRACKED_VGT_GS_VERT_ITEMSIZE, + shader->ctx_reg.gs.vgt_gs_vert_itemsize, shader->ctx_reg.gs.vgt_gs_vert_itemsize_1, + shader->ctx_reg.gs.vgt_gs_vert_itemsize_2, shader->ctx_reg.gs.vgt_gs_vert_itemsize_3); + + /* R_028B90_VGT_GS_INSTANCE_CNT */ + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT, + shader->ctx_reg.gs.vgt_gs_instance_cnt); + + if (sctx->chip_class >= GFX9) { + /* R_028A44_VGT_GS_ONCHIP_CNTL */ + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, + shader->ctx_reg.gs.vgt_gs_onchip_cntl); + /* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */ + radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, + shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup); + /* R_028AAC_VGT_ESGS_RING_ITEMSIZE */ + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->ctx_reg.gs.vgt_esgs_ring_itemsize); + + if (shader->key.ge.part.gs.es->info.stage == MESA_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); + } + radeon_end_update_context_roll(sctx); + + /* These don't cause any context rolls. 
*/ + radeon_begin_again(&sctx->gfx_cs); + if (sctx->chip_class >= GFX7) { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs); + } + if (sctx->chip_class >= GFX10) { + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs); + } + radeon_end(); +} + +static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) +{ + struct si_shader_selector *sel = shader->selector; + const ubyte *num_components = sel->info.num_stream_output_components; + unsigned gs_num_invocations = sel->info.base.gs.invocations; + struct si_pm4_state *pm4; + uint64_t va; + unsigned max_stream = util_last_bit(sel->info.base.gs.active_stream_mask); + unsigned offset; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_gs; + + offset = num_components[0] * sel->info.base.gs.vertices_out; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset; + + if (max_stream >= 2) + offset += num_components[1] * sel->info.base.gs.vertices_out; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset; + + if (max_stream >= 3) + offset += num_components[2] * sel->info.base.gs.vertices_out; + shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset; + + if (max_stream >= 4) + offset += num_components[3] * sel->info.base.gs.vertices_out; + shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset; + + /* The GSVS_RING_ITEMSIZE register takes 15 bits */ + assert(offset < (1 << 15)); + + shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->info.base.gs.vertices_out; + + shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0]; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 2) ? num_components[1] : 0; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_2 = (max_stream >= 3) ? num_components[2] : 0; + shader->ctx_reg.gs.vgt_gs_vert_itemsize_3 = (max_stream >= 4) ? num_components[3] : 0; + + shader->ctx_reg.gs.vgt_gs_instance_cnt = + S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0); + + /* Copy over fields from the GS copy shader to make them easily accessible from GS. */ + shader->pa_cl_vs_out_cntl = sel->gs_copy_shader->pa_cl_vs_out_cntl; + + va = shader->bo->gpu_address; + + if (sscreen->info.chip_class >= GFX9) { + unsigned input_prim = sel->info.base.gs.input_primitive; + gl_shader_stage es_stage = shader->key.ge.part.gs.es->info.stage; + unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; + + if (es_stage == MESA_SHADER_VERTEX) { + es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + } else if (es_stage == MESA_SHADER_TESS_EVAL) + es_vgpr_comp_cnt = shader->key.ge.part.gs.es->info.uses_primid ? 3 : 2; + else + unreachable("invalid shader selector type"); + + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + */ + if (sel->info.uses_invocationid) + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */ + else if (sel->info.uses_primid) + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. 
*/ + else if (input_prim >= PIPE_PRIM_TRIANGLES) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + + unsigned num_user_sgprs; + if (es_stage == MESA_SHADER_VERTEX) + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); + else + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + + if (sscreen->info.chip_class >= GFX10) { + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + } else { + si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); + } + + uint32_t rsrc1 = S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_DX10_CLAMP(1) | + S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) | + S_00B228_WGP_MODE(sscreen->info.chip_class >= GFX10) | + S_00B228_FLOAT_MODE(shader->config.float_mode) | + S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt); + uint32_t rsrc2 = S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | + S_00B22C_LDS_SIZE(shader->config.lds_size) | + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) { + rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + } else { + rsrc1 |= S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8); + rsrc2 |= S_00B22C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + } + + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2); + + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.gs.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0); + + shader->ctx_reg.gs.vgt_gs_onchip_cntl = + S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup); + shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup = + S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup); + shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.ge.part.gs.es->esgs_itemsize / 4; + + if (es_stage == MESA_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->key.ge.part.gs.es, shader); + + polaris_set_vgt_vertex_reuse(sscreen, shader->key.ge.part.gs.es, shader); + } else { + shader->ctx_reg.gs.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(0xffff) | + S_00B21C_WAVE_LIMIT(0x3F); + + si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); + si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, + S_00B224_MEM_BASE(sscreen->info.address32_hi >> 8)); + + si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, + S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode)); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) | + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); + } +} + +bool gfx10_is_ngg_passthrough(struct si_shader *shader) +{ + struct si_shader_selector *sel = shader->selector; + + /* Never use NGG passthrough if culling is possible even when it's not used by this shader, + * so that we don't get context rolls when enabling and disabling NGG passthrough. 
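+ * (The passthrough flag ultimately lands in VGT_SHADER_STAGES_EN via
+ * vgt_stages.u.ngg_passthrough at the end of gfx10_shader_ngg, and that
+ * is a context register, hence the roll whenever it flips.)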
+ */ + if (sel->screen->use_ngg_culling) + return false; + + /* The definition of NGG passthrough is: + * - user GS is turned off (no amplification, no GS instancing, and no culling) + * - VGT_ESGS_RING_ITEMSIZE is ignored (behaving as if it was equal to 1) + * - vertex indices are packed into 1 VGPR + * - Dimgrey and later chips can optionally skip the gs_alloc_req message + * + * NGG passthrough still allows the use of LDS. + */ + return sel->info.stage != MESA_SHADER_GEOMETRY && !shader->key.ge.opt.ngg_culling; +} + +/* Common tail code for NGG primitive shaders. */ +static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader) +{ + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, + SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP, + shader->ctx_reg.ngg.ge_max_output_per_subgroup); + radeon_opt_set_context_reg(sctx, R_028B4C_GE_NGG_SUBGRP_CNTL, SI_TRACKED_GE_NGG_SUBGRP_CNTL, + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl); + radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ctx_reg.ngg.vgt_primitiveid_en); + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, SI_TRACKED_VGT_GS_ONCHIP_CNTL, + shader->ctx_reg.ngg.vgt_gs_onchip_cntl); + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, SI_TRACKED_VGT_GS_INSTANCE_CNT, + shader->ctx_reg.ngg.vgt_gs_instance_cnt); + radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize); + radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ctx_reg.ngg.spi_vs_out_config); + radeon_opt_set_context_reg2( + sctx, R_028708_SPI_SHADER_IDX_FORMAT, SI_TRACKED_SPI_SHADER_IDX_FORMAT, + shader->ctx_reg.ngg.spi_shader_idx_format, shader->ctx_reg.ngg.spi_shader_pos_format); + radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL, + shader->ctx_reg.ngg.pa_cl_vte_cntl); + radeon_opt_set_context_reg(sctx, R_028838_PA_CL_NGG_CNTL, SI_TRACKED_PA_CL_NGG_CNTL, + shader->ctx_reg.ngg.pa_cl_ngg_cntl); + + radeon_end_update_context_roll(sctx); + + /* These don't cause a context roll. 
*/ + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.ngg.ge_pc_alloc); + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs); + radeon_end(); +} + +static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs; + if (!shader) + return; + + gfx10_emit_shader_ngg_tail(sctx, shader); +} + +static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs; + if (!shader) + return; + + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + radeon_end_update_context_roll(sctx); + + gfx10_emit_shader_ngg_tail(sctx, shader); +} + +static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs; + if (!shader) + return; + + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_end_update_context_roll(sctx); + + gfx10_emit_shader_ngg_tail(sctx, shader); +} + +static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.gs; + + if (!shader) + return; + + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT, + shader->ctx_reg.ngg.vgt_gs_max_vert_out); + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + radeon_end_update_context_roll(sctx); + + gfx10_emit_shader_ngg_tail(sctx, shader); +} + +unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_shader_key *key) +{ + if (gs->info.stage == MESA_SHADER_GEOMETRY) + return gs->info.base.gs.input_primitive; + + if (gs->info.stage == MESA_SHADER_TESS_EVAL) { + if (gs->info.base.tess.point_mode) + return PIPE_PRIM_POINTS; + if (gs->info.base.tess.primitive_mode == GL_LINES) + return PIPE_PRIM_LINES; + return PIPE_PRIM_TRIANGLES; + } + + if (key->ge.opt.ngg_culling & SI_NGG_CULL_LINES) + return PIPE_PRIM_LINES; + + return PIPE_PRIM_TRIANGLES; /* worst case for all callers */ +} + +static unsigned si_get_vs_out_cntl(const struct si_shader_selector *sel, + const struct si_shader *shader, bool ngg) +{ + /* Clip distances can be killed, but cull distances can't. 
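+ * (Example: 4 clip distances plus 2 cull distances, with the cull bits
+ * packed after the clip bits, give clipcull_mask = 0x0F | 0x30 = 0x3F,
+ * enabling both the CCDIST0 (bits 0-3) and CCDIST1 (bits 4-7) vectors
+ * below.)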
*/ + unsigned clipcull_mask = (sel->clipdist_mask & ~shader->key.ge.opt.kill_clip_distances) | + sel->culldist_mask; + bool writes_psize = sel->info.writes_psize && !shader->key.ge.opt.kill_pointsize; + bool misc_vec_ena = writes_psize || (sel->info.writes_edgeflag && !ngg) || + sel->screen->options.vrs2x2 || + sel->info.writes_layer || sel->info.writes_viewport_index; + + return S_02881C_VS_OUT_CCDIST0_VEC_ENA((clipcull_mask & 0x0F) != 0) | + S_02881C_VS_OUT_CCDIST1_VEC_ENA((clipcull_mask & 0xF0) != 0) | + S_02881C_USE_VTX_POINT_SIZE(writes_psize) | + S_02881C_USE_VTX_EDGE_FLAG(sel->info.writes_edgeflag && !ngg) | + S_02881C_USE_VTX_VRS_RATE(sel->screen->options.vrs2x2) | + S_02881C_USE_VTX_RENDER_TARGET_INDX(sel->info.writes_layer) | + S_02881C_USE_VTX_VIEWPORT_INDX(sel->info.writes_viewport_index) | + S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) | + S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena); +} + +/** + * Prepare the PM4 image for \p shader, which will run as a merged ESGS shader + * in NGG mode. + */ +static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader) +{ + const struct si_shader_selector *gs_sel = shader->selector; + const struct si_shader_info *gs_info = &gs_sel->info; + const gl_shader_stage gs_stage = shader->selector->info.stage; + const struct si_shader_selector *es_sel = + shader->previous_stage_sel ? shader->previous_stage_sel : shader->selector; + const struct si_shader_info *es_info = &es_sel->info; + const gl_shader_stage es_stage = es_sel->info.stage; + unsigned num_user_sgprs; + unsigned nparams, es_vgpr_comp_cnt, gs_vgpr_comp_cnt; + uint64_t va; + bool window_space = gs_info->stage == MESA_SHADER_VERTEX ? + gs_info->base.vs.window_space_position : 0; + bool es_enable_prim_id = shader->key.ge.mono.u.vs_export_prim_id || es_info->uses_primid; + unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1); + unsigned input_prim = si_get_input_prim(gs_sel, &shader->key); + bool break_wave_at_eoi = false; + struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + if (es_stage == MESA_SHADER_TESS_EVAL) { + pm4->atom.emit = gs_stage == MESA_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_tess_gs + : gfx10_emit_shader_ngg_tess_nogs; + } else { + pm4->atom.emit = gs_stage == MESA_SHADER_GEOMETRY ? gfx10_emit_shader_ngg_notess_gs + : gfx10_emit_shader_ngg_notess_nogs; + } + + va = shader->bo->gpu_address; + + if (es_stage == MESA_SHADER_VERTEX) { + es_vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, false); + + if (es_info->base.vs.blit_sgprs_amd) { + num_user_sgprs = + SI_SGPR_VS_BLIT_DATA + es_info->base.vs.blit_sgprs_amd; + } else { + num_user_sgprs = si_get_num_vs_user_sgprs(shader, GFX9_VSGS_NUM_USER_SGPR); + } + } else { + assert(es_stage == MESA_SHADER_TESS_EVAL); + es_vgpr_comp_cnt = es_enable_prim_id ? 3 : 2; + num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; + + if (es_enable_prim_id || gs_info->uses_primid) + break_wave_at_eoi = true; + } + + /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and + * VGPR[0:4] are always loaded. + * + * Vertex shaders always need to load VGPR3, because they need to + * pass edge flags for decomposed primitives (such as quads) to the PA + * for the GL_LINE polygon mode to skip rendering lines on inner edges. + */ + if (gs_info->uses_invocationid || + (gfx10_edgeflags_have_effect(shader) && !gfx10_is_ngg_passthrough(shader))) + gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID, edge flags. 
*/ + else if ((gs_stage == MESA_SHADER_GEOMETRY && gs_info->uses_primid) || + (gs_stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id)) + gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */ + else if (input_prim >= PIPE_PRIM_TRIANGLES && !gfx10_is_ngg_passthrough(shader)) + gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ + else + gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ + + unsigned wave_size = si_get_shader_wave_size(shader); + unsigned late_alloc_wave64, cu_mask; + + ac_compute_late_alloc(&sscreen->info, true, shader->key.ge.opt.ngg_culling, + shader->config.scratch_bytes_per_wave > 0, + &late_alloc_wave64, &cu_mask); + + si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); + si_pm4_set_reg( + pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, + S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) | + S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) | + S_00B228_MEM_ORDERED(si_shader_mem_ordered(shader)) | + /* Disable the WGP mode on gfx10.3 because it can hang. (it happened on VanGogh) + * Let's disable it on all chips that disable exactly 1 CU per SA for GS. */ + S_00B228_WGP_MODE(sscreen->info.chip_class == GFX10) | + S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); + si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, + S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0) | + S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | + S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) | + S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) | + S_00B22C_LDS_SIZE(shader->config.lds_size)); + + shader->ctx_reg.ngg.spi_shader_pgm_rsrc3_gs = S_00B21C_CU_EN(cu_mask) | + S_00B21C_WAVE_LIMIT(0x3F); + shader->ctx_reg.ngg.spi_shader_pgm_rsrc4_gs = + S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64); + + nparams = MAX2(shader->info.nr_param_exports, 1); + shader->ctx_reg.ngg.spi_vs_out_config = + S_0286C4_VS_EXPORT_COUNT(nparams - 1) | + S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + + shader->ctx_reg.ngg.spi_shader_idx_format = + S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP); + shader->ctx_reg.ngg.spi_shader_pos_format = + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? 
V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE); + + shader->ctx_reg.ngg.vgt_primitiveid_en = + S_028A84_PRIMITIVEID_EN(es_enable_prim_id) | + S_028A84_NGG_DISABLE_PROVOK_REUSE(shader->key.ge.mono.u.vs_export_prim_id || + gs_sel->info.writes_primid); + + if (gs_stage == MESA_SHADER_GEOMETRY) { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4; + shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out; + } else { + shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1; + } + + if (es_stage == MESA_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, es_sel, shader); + + shader->ctx_reg.ngg.vgt_gs_onchip_cntl = + S_028A44_ES_VERTS_PER_SUBGRP(shader->ngg.hw_max_esverts) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->ngg.max_gsprims) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->ngg.max_gsprims * gs_num_invocations); + shader->ctx_reg.ngg.ge_max_output_per_subgroup = + S_0287FC_MAX_VERTS_PER_SUBGROUP(shader->ngg.max_out_verts); + shader->ctx_reg.ngg.ge_ngg_subgrp_cntl = S_028B4C_PRIM_AMP_FACTOR(shader->ngg.prim_amp_factor) | + S_028B4C_THDS_PER_SUBGRP(0); /* for fast launch */ + shader->ctx_reg.ngg.vgt_gs_instance_cnt = + S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) | + S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(shader->ngg.max_vert_out_per_gs_instance); + + /* Output hw-generated edge flags if needed and pass them via the prim + * export to prevent drawing lines on internal edges of decomposed + * primitives (such as quads) with polygon mode = lines. + */ + shader->ctx_reg.ngg.pa_cl_ngg_cntl = + S_028838_INDEX_BUF_EDGE_FLAG_ENA(gfx10_edgeflags_have_effect(shader)) | + /* Reuse for NGG. */ + S_028838_VERTEX_REUSE_DEPTH(sscreen->info.chip_class >= GFX10_3 ? 30 : 0); + shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, true); + + /* Oversubscribe PC. This improves performance when there are too many varyings. */ + unsigned oversub_pc_factor = 1; + + if (shader->key.ge.opt.ngg_culling) { + /* Be more aggressive with NGG culling. */ + if (shader->info.nr_param_exports > 4) + oversub_pc_factor = 4; + else if (shader->info.nr_param_exports > 2) + oversub_pc_factor = 3; + else + oversub_pc_factor = 2; + } + + unsigned oversub_pc_lines = + late_alloc_wave64 ? (sscreen->info.pc_lines / 4) * oversub_pc_factor : 0; + shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) | + S_030980_NUM_PC_LINES(oversub_pc_lines - 1); + + shader->ge_cntl = S_03096C_PRIM_GRP_SIZE(shader->ngg.max_gsprims) | + S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts) | + S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi); + + /* On gfx10, the GE only checks against the maximum number of ES verts after + * allocating a full GS primitive. So we need to ensure that whenever + * this check passes, there is enough space for a full primitive without + * vertex reuse. VERT_GRP_SIZE=256 doesn't need this. We should always get 256 + * if we have enough LDS. + * + * Tessellation is unaffected because it always sets GE_CNTL.VERT_GRP_SIZE = 0. + */ + if ((sscreen->info.chip_class == GFX10) && + (es_stage == MESA_SHADER_VERTEX || gs_stage == MESA_SHADER_VERTEX) && /* = no tess */ + shader->ngg.hw_max_esverts != 256 && + shader->ngg.hw_max_esverts > 5) { + /* This could be based on the input primitive type. 5 is the worst case + * for primitive types with adjacency. 
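+ * (Example: hw_max_esverts = 128 yields VERT_GRP_SIZE = 128 - 5 = 123,
+ * keeping 5 vertices of headroom for the worst-case primitive with
+ * adjacency.)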
+ */ + shader->ge_cntl &= C_03096C_VERT_GRP_SIZE; + shader->ge_cntl |= S_03096C_VERT_GRP_SIZE(shader->ngg.hw_max_esverts - 5); + } + + if (window_space) { + shader->ctx_reg.ngg.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); + } else { + shader->ctx_reg.ngg.pa_cl_vte_cntl = + S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); + } + + shader->ctx_reg.ngg.vgt_stages.u.ngg = 1; + shader->ctx_reg.ngg.vgt_stages.u.streamout = gs_sel->so.num_outputs; + shader->ctx_reg.ngg.vgt_stages.u.ngg_passthrough = gfx10_is_ngg_passthrough(shader); +} + +static void si_emit_shader_vs(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.vs; + if (!shader) + return; + + radeon_begin(&sctx->gfx_cs); + radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE, + shader->ctx_reg.vs.vgt_gs_mode); + radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN, + shader->ctx_reg.vs.vgt_primitiveid_en); + + if (sctx->chip_class <= GFX8) { + radeon_opt_set_context_reg(sctx, R_028AB4_VGT_REUSE_OFF, SI_TRACKED_VGT_REUSE_OFF, + shader->ctx_reg.vs.vgt_reuse_off); + } + + radeon_opt_set_context_reg(sctx, R_0286C4_SPI_VS_OUT_CONFIG, SI_TRACKED_SPI_VS_OUT_CONFIG, + shader->ctx_reg.vs.spi_vs_out_config); + + radeon_opt_set_context_reg(sctx, R_02870C_SPI_SHADER_POS_FORMAT, + SI_TRACKED_SPI_SHADER_POS_FORMAT, + shader->ctx_reg.vs.spi_shader_pos_format); + + radeon_opt_set_context_reg(sctx, R_028818_PA_CL_VTE_CNTL, SI_TRACKED_PA_CL_VTE_CNTL, + shader->ctx_reg.vs.pa_cl_vte_cntl); + + if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) + radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM, + shader->vgt_tf_param); + + if (shader->vgt_vertex_reuse_block_cntl) + radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, + SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, + shader->vgt_vertex_reuse_block_cntl); + + /* Required programming for tessellation. (legacy pipeline only) */ + if (sctx->chip_class >= GFX10 && shader->selector->info.stage == MESA_SHADER_TESS_EVAL) { + radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL, + SI_TRACKED_VGT_GS_ONCHIP_CNTL, + S_028A44_ES_VERTS_PER_SUBGRP(250) | + S_028A44_GS_PRIMS_PER_SUBGRP(126) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(126)); + } + + radeon_end_update_context_roll(sctx); + + /* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */ + if (sctx->chip_class >= GFX10) { + radeon_begin_again(&sctx->gfx_cs); + radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC, + shader->ctx_reg.vs.ge_pc_alloc); + radeon_end(); + } +} + +/** + * Compute the state for \p shader, which will run as a vertex shader on the + * hardware. + * + * If \p gs is non-NULL, it points to the geometry shader for which this shader + * is the copy shader. + */ +static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, + struct si_shader_selector *gs) +{ + const struct si_shader_info *info = &shader->selector->info; + struct si_pm4_state *pm4; + unsigned num_user_sgprs, vgpr_comp_cnt; + uint64_t va; + unsigned nparams, oc_lds_en; + bool window_space = info->stage == MESA_SHADER_VERTEX ? 
+ info->base.vs.window_space_position : 0; + bool enable_prim_id = shader->key.ge.mono.u.vs_export_prim_id || info->uses_primid; + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + pm4->atom.emit = si_emit_shader_vs; + + /* We always write VGT_GS_MODE in the VS state, because every switch + * between different shader pipelines involving a different GS or no + * GS at all involves a switch of the VS (different GS use different + * copy shaders). On the other hand, when the API switches from a GS to + * no GS and then back to the same GS used originally, the GS state is + * not sent again. + */ + if (!gs) { + unsigned mode = V_028A40_GS_OFF; + + /* PrimID needs GS scenario A. */ + if (enable_prim_id) + mode = V_028A40_GS_SCENARIO_A; + + shader->ctx_reg.vs.vgt_gs_mode = S_028A40_MODE(mode); + shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id; + } else { + shader->ctx_reg.vs.vgt_gs_mode = + ac_vgt_gs_mode(gs->info.base.gs.vertices_out, sscreen->info.chip_class); + shader->ctx_reg.vs.vgt_primitiveid_en = 0; + } + + if (sscreen->info.chip_class <= GFX8) { + /* Reuse needs to be set off if we write oViewport. */ + shader->ctx_reg.vs.vgt_reuse_off = S_028AB4_REUSE_OFF(info->writes_viewport_index); + } + + va = shader->bo->gpu_address; + + if (gs) { + vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. */ + num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; + } else if (shader->selector->info.stage == MESA_SHADER_VERTEX) { + vgpr_comp_cnt = si_get_vs_vgpr_comp_cnt(sscreen, shader, enable_prim_id); + + if (info->base.vs.blit_sgprs_amd) { + num_user_sgprs = SI_SGPR_VS_BLIT_DATA + info->base.vs.blit_sgprs_amd; + } else { + num_user_sgprs = si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR); + } + } else if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) { + vgpr_comp_cnt = enable_prim_id ? 3 : 2; + num_user_sgprs = SI_TES_NUM_USER_SGPR; + } else + unreachable("invalid shader selector type"); + + /* VS is required to export at least one param. */ + nparams = MAX2(shader->info.nr_param_exports, 1); + shader->ctx_reg.vs.spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1); + + if (sscreen->info.chip_class >= GFX10) { + shader->ctx_reg.vs.spi_vs_out_config |= + S_0286C4_NO_PC_EXPORT(shader->info.nr_param_exports == 0); + } + + shader->ctx_reg.vs.spi_shader_pos_format = + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(shader->info.nr_pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS2_EXPORT_FORMAT(shader->info.nr_pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE) | + S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP + : V_02870C_SPI_SHADER_NONE); + unsigned late_alloc_wave64, cu_mask; + ac_compute_late_alloc(&sscreen->info, false, false, + shader->config.scratch_bytes_per_wave > 0, + &late_alloc_wave64, &cu_mask); + + shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(late_alloc_wave64 > 0) | + S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1); + shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, false); + + oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 
1 : 0; + + if (sscreen->info.chip_class >= GFX7) { + si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS, + S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F)); + si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64)); + } + + si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8); + si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, + S_00B124_MEM_BASE(sscreen->info.address32_hi >> 8)); + + uint32_t rsrc1 = + S_00B128_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) | + S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B128_DX10_CLAMP(1) | + S_00B128_MEM_ORDERED(si_shader_mem_ordered(shader)) | + S_00B128_FLOAT_MODE(shader->config.float_mode); + uint32_t rsrc2 = S_00B12C_USER_SGPR(num_user_sgprs) | S_00B12C_OC_LDS_EN(oc_lds_en) | + S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); + + if (sscreen->info.chip_class >= GFX10) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5); + else if (sscreen->info.chip_class == GFX9) + rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); + + if (sscreen->info.chip_class <= GFX9) + rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); + + if (!sscreen->use_ngg_streamout) { + rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) | + S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | + S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | + S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | + S_00B12C_SO_EN(!!shader->selector->so.num_outputs); + } + + si_pm4_set_reg(pm4, R_00B128_SPI_SHADER_PGM_RSRC1_VS, rsrc1); + si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS, rsrc2); + + if (window_space) + shader->ctx_reg.vs.pa_cl_vte_cntl = S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1); + else + shader->ctx_reg.vs.pa_cl_vte_cntl = + S_028818_VTX_W0_FMT(1) | S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | + S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | + S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1); + + if (shader->selector->info.stage == MESA_SHADER_TESS_EVAL) + si_set_tesseval_regs(sscreen, shader->selector, shader); + + polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader); +} + +static unsigned si_get_ps_num_interp(struct si_shader *ps) +{ + struct si_shader_info *info = &ps->selector->info; + unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0); + unsigned num_interp = + ps->selector->info.num_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0); + + assert(num_interp <= 32); + return MIN2(num_interp, 32); +} + +static unsigned si_get_spi_shader_col_format(struct si_shader *shader) +{ + unsigned spi_shader_col_format = shader->key.ps.part.epilog.spi_shader_col_format; + unsigned value = 0, num_mrts = 0; + unsigned i, num_targets = (util_last_bit(spi_shader_col_format) + 3) / 4; + + /* Remove holes in spi_shader_col_format. 
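+ * Illustrative example (made-up register value): 0x3040, i.e. MRT1 uses + * format 4 and MRT3 uses format 3, compacts to 0x0034, so the two live + * color exports end up in consecutive nibbles.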
*/ + for (i = 0; i < num_targets; i++) { + unsigned spi_format = (spi_shader_col_format >> (i * 4)) & 0xf; + + if (spi_format) { + value |= spi_format << (num_mrts * 4); + num_mrts++; + } + } + + return value; +} + +static void si_emit_shader_ps(struct si_context *sctx) +{ + struct si_shader *shader = sctx->queued.named.ps; + if (!shader) + return; + + radeon_begin(&sctx->gfx_cs); + /* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR */ + radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA, + shader->ctx_reg.ps.spi_ps_input_ena, + shader->ctx_reg.ps.spi_ps_input_addr); + + radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL, SI_TRACKED_SPI_BARYC_CNTL, + shader->ctx_reg.ps.spi_baryc_cntl); + radeon_opt_set_context_reg(sctx, R_0286D8_SPI_PS_IN_CONTROL, SI_TRACKED_SPI_PS_IN_CONTROL, + shader->ctx_reg.ps.spi_ps_in_control); + + /* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */ + radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT, SI_TRACKED_SPI_SHADER_Z_FORMAT, + shader->ctx_reg.ps.spi_shader_z_format, + shader->ctx_reg.ps.spi_shader_col_format); + + radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK, + shader->ctx_reg.ps.cb_shader_mask); + radeon_end_update_context_roll(sctx); +} + +static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader) +{ + struct si_shader_info *info = &shader->selector->info; + struct si_pm4_state *pm4; + unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask; + unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1); + uint64_t va; + unsigned input_ena = shader->config.spi_ps_input_ena; + + /* We need to enable at least one of these inputs; otherwise the GPU hangs. */ + assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) || G_0286CC_PERSP_CENTER_ENA(input_ena) || + G_0286CC_PERSP_CENTROID_ENA(input_ena) || G_0286CC_PERSP_PULL_MODEL_ENA(input_ena) || + G_0286CC_LINEAR_SAMPLE_ENA(input_ena) || G_0286CC_LINEAR_CENTER_ENA(input_ena) || + G_0286CC_LINEAR_CENTROID_ENA(input_ena) || G_0286CC_LINE_STIPPLE_TEX_ENA(input_ena)); + /* POS_W_FLOAT_ENA requires one of the perspective weights. */ + assert(!G_0286CC_POS_W_FLOAT_ENA(input_ena) || G_0286CC_PERSP_SAMPLE_ENA(input_ena) || + G_0286CC_PERSP_CENTER_ENA(input_ena) || G_0286CC_PERSP_CENTROID_ENA(input_ena) || + G_0286CC_PERSP_PULL_MODEL_ENA(input_ena)); + + /* Validate interpolation optimization flags (read as implications). */ + assert(!shader->key.ps.part.prolog.bc_optimize_for_persp || + (G_0286CC_PERSP_CENTER_ENA(input_ena) && G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.ps.part.prolog.bc_optimize_for_linear || + (G_0286CC_LINEAR_CENTER_ENA(input_ena) && G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + assert(!shader->key.ps.part.prolog.force_persp_center_interp || + (!G_0286CC_PERSP_SAMPLE_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.ps.part.prolog.force_linear_center_interp || + (!G_0286CC_LINEAR_SAMPLE_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + assert(!shader->key.ps.part.prolog.force_persp_sample_interp || + (!G_0286CC_PERSP_CENTER_ENA(input_ena) && !G_0286CC_PERSP_CENTROID_ENA(input_ena))); + assert(!shader->key.ps.part.prolog.force_linear_sample_interp || + (!G_0286CC_LINEAR_CENTER_ENA(input_ena) && !G_0286CC_LINEAR_CENTROID_ENA(input_ena))); + + /* Validate cases when the optimizations are off (read as implications).
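+ * ("Read as implications" means each assert encodes A => B, written in C + * as assert(!A || B); e.g. the first one below checks that having both + * center and centroid enabled implies bc_optimize_for_persp is set.)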
*/ + assert(shader->key.ps.part.prolog.bc_optimize_for_persp || + !G_0286CC_PERSP_CENTER_ENA(input_ena) || !G_0286CC_PERSP_CENTROID_ENA(input_ena)); + assert(shader->key.ps.part.prolog.bc_optimize_for_linear || + !G_0286CC_LINEAR_CENTER_ENA(input_ena) || !G_0286CC_LINEAR_CENTROID_ENA(input_ena)); + + pm4 = si_get_shader_pm4_state(shader); + if (!pm4) + return; + + /* If multiple state sets are allowed to be in a bin, break the batch on a new PS. */ + if (sscreen->dpbb_allowed && + (sscreen->pbb_context_states_per_bin > 1 || + sscreen->pbb_persistent_states_per_bin > 1)) { + si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0)); + } + + pm4->atom.emit = si_emit_shader_ps; + + /* SPI_BARYC_CNTL.POS_FLOAT_LOCATION + * Possible values: + * 0 -> Position = pixel center + * 1 -> Position = pixel centroid + * 2 -> Position = at sample position + * + * From GLSL 4.5 specification, section 7.1: + * "The variable gl_FragCoord is available as an input variable from + * within fragment shaders and it holds the window relative coordinates + * (x, y, z, 1/w) values for the fragment. If multi-sampling, this + * value can be for any location within the pixel, or one of the + * fragment samples. The use of centroid does not further restrict + * this value to be inside the current primitive." + * + * Meaning that centroid has no effect and we can return anything within + * the pixel. Thus, return the value at sample position, because that's + * the most accurate one shaders can get. + */ + spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2); + + if (info->base.fs.pixel_center_integer) + spi_baryc_cntl |= S_0286E0_POS_FLOAT_ULC(1); + + spi_shader_col_format = si_get_spi_shader_col_format(shader); + cb_shader_mask = ac_get_cb_shader_mask(shader->key.ps.part.epilog.spi_shader_col_format); + + /* Ensure that some export memory is always allocated, for two reasons: + * + * 1) Correctness: The hardware ignores the EXEC mask if no export + * memory is allocated, so KILL and alpha test do not work correctly + * without this. + * 2) Performance: Every shader needs at least a NULL export, even when + * it writes no color/depth output. The NULL export instruction + * stalls without this setting. + * + * Don't add this to CB_SHADER_MASK. + * + * GFX10 supports pixel shaders without exports by setting both + * the color and Z formats to SPI_SHADER_ZERO. The hw will skip export + * instructions if any are present. + */ + if ((sscreen->info.chip_class <= GFX9 || info->base.fs.uses_discard || + shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS) && + !spi_shader_col_format && !info->writes_z && !info->writes_stencil && + !info->writes_samplemask) + spi_shader_col_format = V_028714_SPI_SHADER_32_R; + + shader->ctx_reg.ps.spi_ps_input_ena = input_ena; + shader->ctx_reg.ps.spi_ps_input_addr = shader->config.spi_ps_input_addr; + + unsigned num_interp = si_get_ps_num_interp(shader); + + /* Set interpolation controls.
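+ * NUM_INTERP tells the SPI how many PS input attributes to set up; + * PS_W32_EN selects wave32 execution and has to match the wave size the + * pixel shader was compiled with.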
*/ + spi_ps_in_control = S_0286D8_NUM_INTERP(num_interp) | + S_0286D8_PS_W32_EN(sscreen->ps_wave_size == 32); + + shader->ctx_reg.ps.num_interp = num_interp; + shader->ctx_reg.ps.spi_baryc_cntl = spi_baryc_cntl; + shader->ctx_reg.ps.spi_ps_in_control = spi_ps_in_control; + shader->ctx_reg.ps.spi_shader_z_format = + ac_get_spi_shader_z_format(info->writes_z, info->writes_stencil, info->writes_samplemask); + shader->ctx_reg.ps.spi_shader_col_format = spi_shader_col_format; + shader->ctx_reg.ps.cb_shader_mask = cb_shader_mask; + + va = shader->bo->gpu_address; + si_pm4_set_reg(pm4, R_00B020_SPI_SHADER_PGM_LO_PS, va >> 8); + si_pm4_set_reg(pm4, R_00B024_SPI_SHADER_PGM_HI_PS, + S_00B024_MEM_BASE(sscreen->info.address32_hi >> 8)); + + uint32_t rsrc1 = + S_00B028_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ps_wave_size == 32 ? 8 : 4)) | + S_00B028_DX10_CLAMP(1) | S_00B028_MEM_ORDERED(si_shader_mem_ordered(shader)) | + S_00B028_FLOAT_MODE(shader->config.float_mode); + + if (sscreen->info.chip_class < GFX10) { + rsrc1 |= S_00B028_SGPRS((shader->config.num_sgprs - 1) / 8); + } + + si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS, rsrc1); + si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, + S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) | + S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) | + S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); +} + +static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader) +{ + switch (shader->selector->info.stage) { + case MESA_SHADER_VERTEX: + if (shader->key.ge.as_ls) + si_shader_ls(sscreen, shader); + else if (shader->key.ge.as_es) + si_shader_es(sscreen, shader); + else if (shader->key.ge.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_vs(sscreen, shader, NULL); + break; + case MESA_SHADER_TESS_CTRL: + si_shader_hs(sscreen, shader); + break; + case MESA_SHADER_TESS_EVAL: + if (shader->key.ge.as_es) + si_shader_es(sscreen, shader); + else if (shader->key.ge.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_vs(sscreen, shader, NULL); + break; + case MESA_SHADER_GEOMETRY: + if (shader->key.ge.as_ngg) + gfx10_shader_ngg(sscreen, shader); + else + si_shader_gs(sscreen, shader); + break; + case MESA_SHADER_FRAGMENT: + si_shader_ps(sscreen, shader); + break; + default: + assert(0); + } +} + +static void si_clear_vs_key_inputs(struct si_context *sctx, union si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) +{ + prolog_key->instance_divisor_is_one = 0; + prolog_key->instance_divisor_is_fetched = 0; + key->ge.mono.vs_fetch_opencode = 0; + memset(key->ge.mono.vs_fix_fetch, 0, sizeof(key->ge.mono.vs_fix_fetch)); +} + +void si_vs_key_update_inputs(struct si_context *sctx) +{ + struct si_shader_selector *vs = sctx->shader.vs.cso; + struct si_vertex_elements *elts = sctx->vertex_elements; + union si_shader_key *key = &sctx->shader.vs.key; + + if (!vs) + return; + + if (vs->info.base.vs.blit_sgprs_amd) { + si_clear_vs_key_inputs(sctx, key, &key->ge.part.vs.prolog); + key->ge.opt.prefer_mono = 0; + sctx->uses_nontrivial_vs_prolog = false; + return; + } + + bool uses_nontrivial_vs_prolog = false; + + if (elts->instance_divisor_is_one || elts->instance_divisor_is_fetched) + uses_nontrivial_vs_prolog = true; + + key->ge.part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one; + key->ge.part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + key->ge.opt.prefer_mono = elts->instance_divisor_is_fetched; + + unsigned count_mask = (1 << 
vs->info.num_inputs) - 1; + unsigned fix = elts->fix_fetch_always & count_mask; + unsigned opencode = elts->fix_fetch_opencode & count_mask; + + if (sctx->vertex_buffer_unaligned & elts->vb_alignment_check_mask) { + uint32_t mask = elts->fix_fetch_unaligned & count_mask; + while (mask) { + unsigned i = u_bit_scan(&mask); + unsigned log_hw_load_size = 1 + ((elts->hw_load_is_dword >> i) & 1); + unsigned vbidx = elts->vertex_buffer_index[i]; + struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbidx]; + unsigned align_mask = (1 << log_hw_load_size) - 1; + if (vb->buffer_offset & align_mask || vb->stride & align_mask) { + fix |= 1 << i; + opencode |= 1 << i; + } + } + } + + memset(key->ge.mono.vs_fix_fetch, 0, sizeof(key->ge.mono.vs_fix_fetch)); + + while (fix) { + unsigned i = u_bit_scan(&fix); + uint8_t fix_fetch = elts->fix_fetch[i]; + + key->ge.mono.vs_fix_fetch[i].bits = fix_fetch; + if (fix_fetch) + uses_nontrivial_vs_prolog = true; + } + key->ge.mono.vs_fetch_opencode = opencode; + if (opencode) + uses_nontrivial_vs_prolog = true; + + sctx->uses_nontrivial_vs_prolog = uses_nontrivial_vs_prolog; + + /* draw_vertex_state (display lists) requires a trivial VS prolog that ignores + * the current vertex buffers and vertex elements. + * + * We just computed the prolog key because we needed to set uses_nontrivial_vs_prolog, + * so that we know whether the VS prolog should be updated when we switch from + * draw_vertex_state to draw_vbo. Now clear the VS prolog for draw_vertex_state. + * This should happen rarely because the VS prolog should be trivial in most + * cases. + */ + if (uses_nontrivial_vs_prolog && sctx->force_trivial_vs_prolog) + si_clear_vs_key_inputs(sctx, key, &key->ge.part.vs.prolog); +} + +void si_get_vs_key_inputs(struct si_context *sctx, union si_shader_key *key, + struct si_vs_prolog_bits *prolog_key) +{ + prolog_key->instance_divisor_is_one = sctx->shader.vs.key.ge.part.vs.prolog.instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = sctx->shader.vs.key.ge.part.vs.prolog.instance_divisor_is_fetched; + + key->ge.mono.vs_fetch_opencode = sctx->shader.vs.key.ge.mono.vs_fetch_opencode; + memcpy(key->ge.mono.vs_fix_fetch, sctx->shader.vs.key.ge.mono.vs_fix_fetch, + sizeof(key->ge.mono.vs_fix_fetch)); +} + +void si_update_ps_inputs_read_or_disabled(struct si_context *sctx) +{ + struct si_shader_selector *ps = sctx->shader.ps.cso; + + /* Find out if PS is disabled. */ + bool ps_disabled = true; + if (ps) { + bool ps_modifies_zs = ps->info.base.fs.uses_discard || ps->info.writes_z || ps->info.writes_stencil || + ps->info.writes_samplemask || + sctx->queued.named.blend->alpha_to_coverage || + sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS; + unsigned ps_colormask = si_get_total_colormask(sctx); + + ps_disabled = sctx->queued.named.rasterizer->rasterizer_discard || + (!ps_colormask && !ps_modifies_zs && !ps->info.base.writes_memory); + } + + sctx->ps_inputs_read_or_disabled = ps_disabled ? 0 : ps->inputs_read; +} + +static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + union si_shader_key *key) +{ + + key->ge.opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable; + + /* Find out which VS outputs aren't used by the PS. 
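+ * Illustrative example: if the VS writes VAR0..VAR3 but the PS only reads + * VAR1, the bits for VAR0, VAR2 and VAR3 end up in kill_outputs below and + * those exports get compiled out of the VS variant.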
*/ + uint64_t outputs_written = vs->outputs_written_before_ps; + uint64_t linked = outputs_written & sctx->ps_inputs_read_or_disabled; + + key->ge.opt.kill_outputs = ~linked & outputs_written; + + if (vs->info.stage != MESA_SHADER_GEOMETRY) { + key->ge.opt.ngg_culling = sctx->ngg_culling; + key->ge.mono.u.vs_export_prim_id = sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; + } else { + key->ge.opt.ngg_culling = 0; + key->ge.mono.u.vs_export_prim_id = 0; + } + + key->ge.opt.kill_pointsize = vs->info.writes_psize && + sctx->current_rast_prim != PIPE_PRIM_POINTS && + !sctx->queued.named.rasterizer->polygon_mode_is_points; +} + +static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_selector *vs, + union si_shader_key *key) +{ + key->ge.opt.kill_clip_distances = 0; + key->ge.opt.kill_outputs = 0; + key->ge.opt.ngg_culling = 0; + key->ge.mono.u.vs_export_prim_id = 0; + key->ge.opt.kill_pointsize = 0; +} + +void si_ps_key_update_framebuffer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + union si_shader_key *key = &sctx->shader.ps.key; + + if (!sel) + return; + + if (sel->info.color0_writes_all_cbufs && + sel->info.colors_written == 0x1) + key->ps.part.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; + else + key->ps.part.epilog.last_cbuf = 0; + + /* ps_uses_fbfetch is true only if the color buffer is bound. */ + if (sctx->ps_uses_fbfetch) { + struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0]; + struct pipe_resource *tex = cb0->texture; + + /* 1D textures are allocated and used as 2D on GFX9. */ + key->ps.mono.fbfetch_msaa = sctx->framebuffer.nr_samples > 1; + key->ps.mono.fbfetch_is_1D = + sctx->chip_class != GFX9 && + (tex->target == PIPE_TEXTURE_1D || tex->target == PIPE_TEXTURE_1D_ARRAY); + key->ps.mono.fbfetch_layered = + tex->target == PIPE_TEXTURE_1D_ARRAY || tex->target == PIPE_TEXTURE_2D_ARRAY || + tex->target == PIPE_TEXTURE_CUBE || tex->target == PIPE_TEXTURE_CUBE_ARRAY || + tex->target == PIPE_TEXTURE_3D; + } else { + key->ps.mono.fbfetch_msaa = 0; + key->ps.mono.fbfetch_is_1D = 0; + key->ps.mono.fbfetch_layered = 0; + } +} + +void si_ps_key_update_framebuffer_blend(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + union si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; + + if (!sel) + return; + + /* Select the shader color format based on whether + * blending or alpha are needed. + */ + key->ps.part.epilog.spi_shader_col_format = + (blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend_alpha) | + (blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_blend) | + (~blend->blend_enable_4bit & blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format_alpha) | + (~blend->blend_enable_4bit & ~blend->need_src_alpha_4bit & + sctx->framebuffer.spi_shader_col_format); + key->ps.part.epilog.spi_shader_col_format &= blend->cb_target_enabled_4bit; + + /* The output for dual source blending should have + * the same format as the first output. + */ + if (blend->dual_src_blend) { + key->ps.part.epilog.spi_shader_col_format |= + (key->ps.part.epilog.spi_shader_col_format & 0xf) << 4; + } + + /* If alpha-to-coverage is enabled, we have to export alpha + * even if there is no color buffer. 
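+ * (This is done below by forcing the 32_AR export format for MRT0, so + * the alpha-to-coverage hardware still receives an alpha value.)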
+ */ + if (!(key->ps.part.epilog.spi_shader_col_format & 0xf) && blend->alpha_to_coverage) + key->ps.part.epilog.spi_shader_col_format |= V_028710_SPI_SHADER_32_AR; + + /* On GFX6 and GFX7 except Hawaii, the CB doesn't clamp outputs + * to the range supported by the type if a channel has less + * than 16 bits and the export format is 16_ABGR. + */ + if (sctx->chip_class <= GFX7 && sctx->family != CHIP_HAWAII) { + key->ps.part.epilog.color_is_int8 = sctx->framebuffer.color_is_int8; + key->ps.part.epilog.color_is_int10 = sctx->framebuffer.color_is_int10; + } + + /* Disable unwritten outputs (if WRITE_ALL_CBUFS isn't enabled). */ + if (!key->ps.part.epilog.last_cbuf) { + key->ps.part.epilog.spi_shader_col_format &= sel->colors_written_4bit; + key->ps.part.epilog.color_is_int8 &= sel->info.colors_written; + key->ps.part.epilog.color_is_int10 &= sel->info.colors_written; + } + + /* Eliminate shader code computing output values that are unused. + * This enables dead code elimination between shader parts. + * Check if any output is eliminated. + */ + if (sel->colors_written_4bit & + ~(sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit)) + key->ps.opt.prefer_mono = 1; + else + key->ps.opt.prefer_mono = 0; +} + +void si_ps_key_update_blend_rasterizer(struct si_context *sctx) +{ + union si_shader_key *key = &sctx->shader.ps.key; + struct si_state_blend *blend = sctx->queued.named.blend; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + key->ps.part.epilog.alpha_to_one = blend->alpha_to_one && rs->multisample_enable; +} + +void si_ps_key_update_rasterizer(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + union si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!sel) + return; + + key->ps.part.prolog.color_two_side = rs->two_side && sel->info.colors_read; + key->ps.part.prolog.flatshade_colors = rs->flatshade && sel->info.uses_interp_color; + key->ps.part.epilog.clamp_color = rs->clamp_fragment_color; +} + +void si_ps_key_update_dsa(struct si_context *sctx) +{ + union si_shader_key *key = &sctx->shader.ps.key; + + key->ps.part.epilog.alpha_func = sctx->queued.named.dsa->alpha_func; +} + +static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_context *sctx) +{ + union si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim); + bool is_line = util_prim_is_lines(sctx->current_rast_prim); + + key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable && is_poly; + key->ps.part.epilog.poly_line_smoothing = + ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) && + sctx->framebuffer.nr_samples <= 1; +} + +void si_ps_key_update_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + union si_shader_key *key = &sctx->shader.ps.key; + + if (!sel) + return; + + if (sctx->ps_iter_samples > 1 && sel->info.reads_samplemask) + key->ps.part.prolog.samplemask_log_ps_iter = util_logbase2(sctx->ps_iter_samples); + else + key->ps.part.prolog.samplemask_log_ps_iter = 0; +} + +void si_ps_key_update_framebuffer_rasterizer_sample_shading(struct si_context *sctx) +{ + struct si_shader_selector *sel = sctx->shader.ps.cso; + union si_shader_key *key = &sctx->shader.ps.key; + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + + if (!sel) + return; + + bool 
uses_persp_center = sel->info.uses_persp_center || + (!rs->flatshade && sel->info.uses_persp_center_color); + bool uses_persp_centroid = sel->info.uses_persp_centroid || + (!rs->flatshade && sel->info.uses_persp_centroid_color); + bool uses_persp_sample = sel->info.uses_persp_sample || + (!rs->flatshade && sel->info.uses_persp_sample_color); + + if (rs->force_persample_interp && rs->multisample_enable && + sctx->framebuffer.nr_samples > 1 && sctx->ps_iter_samples > 1) { + key->ps.part.prolog.force_persp_sample_interp = + uses_persp_center || uses_persp_centroid; + + key->ps.part.prolog.force_linear_sample_interp = + sel->info.uses_linear_center || sel->info.uses_linear_centroid; + + key->ps.part.prolog.force_persp_center_interp = 0; + key->ps.part.prolog.force_linear_center_interp = 0; + key->ps.part.prolog.bc_optimize_for_persp = 0; + key->ps.part.prolog.bc_optimize_for_linear = 0; + key->ps.mono.interpolate_at_sample_force_center = 0; + } else if (rs->multisample_enable && sctx->framebuffer.nr_samples > 1) { + key->ps.part.prolog.force_persp_sample_interp = 0; + key->ps.part.prolog.force_linear_sample_interp = 0; + key->ps.part.prolog.force_persp_center_interp = 0; + key->ps.part.prolog.force_linear_center_interp = 0; + key->ps.part.prolog.bc_optimize_for_persp = + uses_persp_center && uses_persp_centroid; + key->ps.part.prolog.bc_optimize_for_linear = + sel->info.uses_linear_center && sel->info.uses_linear_centroid; + key->ps.mono.interpolate_at_sample_force_center = 0; + } else { + key->ps.part.prolog.force_persp_sample_interp = 0; + key->ps.part.prolog.force_linear_sample_interp = 0; + + /* Make sure SPI doesn't compute more than 1 pair + * of (i,j), which is the optimization here. */ + key->ps.part.prolog.force_persp_center_interp = uses_persp_center + + uses_persp_centroid + + uses_persp_sample > 1; + + key->ps.part.prolog.force_linear_center_interp = sel->info.uses_linear_center + + sel->info.uses_linear_centroid + + sel->info.uses_linear_sample > 1; + key->ps.part.prolog.bc_optimize_for_persp = 0; + key->ps.part.prolog.bc_optimize_for_linear = 0; + key->ps.mono.interpolate_at_sample_force_center = sel->info.uses_interp_at_sample; + } +} + +/* Compute the key for the hw shader variant */ +static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_shader_selector *sel, + union si_shader_key *key) +{ + struct si_context *sctx = (struct si_context *)ctx; + + switch (sel->info.stage) { + case MESA_SHADER_VERTEX: + if (!sctx->shader.tes.cso && !sctx->shader.gs.cso) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); + break; + case MESA_SHADER_TESS_CTRL: + if (sctx->chip_class >= GFX9) { + si_get_vs_key_inputs(sctx, key, &key->ge.part.tcs.ls_prolog); + key->ge.part.tcs.ls = sctx->shader.vs.cso; + } + break; + case MESA_SHADER_TESS_EVAL: + if (!sctx->shader.gs.cso) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); + break; + case MESA_SHADER_GEOMETRY: + if (sctx->chip_class >= GFX9) { + if (sctx->shader.tes.cso) { + si_clear_vs_key_inputs(sctx, key, &key->ge.part.gs.vs_prolog); + key->ge.part.gs.es = sctx->shader.tes.cso; + } else { + si_get_vs_key_inputs(sctx, key, &key->ge.part.gs.vs_prolog); + key->ge.part.gs.es = sctx->shader.vs.cso; + } + + /* Only NGG can eliminate GS outputs, because the code is shared with VS. 
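+ * (With NGG the GS does its own attribute exports, like a VS, so the + * same kill_outputs logic applies; a legacy GS writes all outputs to + * the GSVS ring and leaves the exports to the separate GS copy shader.)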
*/ + if (sctx->ngg) + si_get_vs_key_outputs(sctx, sel, key); + else + si_clear_vs_key_outputs(sctx, sel, key); + } + break; + case MESA_SHADER_FRAGMENT: + si_ps_key_update_primtype_shader_rasterizer_framebuffer(sctx); + break; + default: + assert(0); + } +} + +static void si_build_shader_variant(struct si_shader *shader, int thread_index, bool low_priority) +{ + struct si_shader_selector *sel = shader->selector; + struct si_screen *sscreen = sel->screen; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug; + + if (thread_index >= 0) { + if (low_priority) { + assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler_lowp)); + compiler = &sscreen->compiler_lowp[thread_index]; + } else { + assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + } + if (!debug->async) + debug = NULL; + } else { + assert(!low_priority); + compiler = shader->compiler_ctx_state.compiler; + } + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + if (unlikely(!si_create_shader_variant(sscreen, compiler, shader, debug))) { + PRINT_ERR("Failed to build shader variant (type=%u)\n", sel->info.stage); + shader->compilation_failed = true; + return; + } + + if (shader->compiler_ctx_state.is_debug_context) { + FILE *f = open_memstream(&shader->shader_log, &shader->shader_log_size); + if (f) { + si_shader_dump(sscreen, shader, NULL, f, false); + fclose(f); + } + } + + si_shader_init_pm4_state(sscreen, shader); +} + +static void si_build_shader_variant_low_priority(void *job, void *gdata, int thread_index) +{ + struct si_shader *shader = (struct si_shader *)job; + + assert(thread_index >= 0); + + si_build_shader_variant(shader, thread_index, true); +} + +/* This should be const, but C++ doesn't allow implicit zero-initialization with const. */ +static union si_shader_key zeroed; + +static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_selector *sel, + struct si_compiler_ctx_state *compiler_state, + const union si_shader_key *key) +{ + struct si_shader **mainp = si_get_main_shader_part(sel, key); + + if (!*mainp) { + struct si_shader *main_part = CALLOC_STRUCT(si_shader); + + if (!main_part) + return false; + + /* We can leave the fence as permanently signaled because the + * main part becomes visible globally only after it has been + * compiled. */ + util_queue_fence_init(&main_part->ready); + + main_part->selector = sel; + if (sel->info.stage <= MESA_SHADER_GEOMETRY) { + main_part->key.ge.as_es = key->ge.as_es; + main_part->key.ge.as_ls = key->ge.as_ls; + main_part->key.ge.as_ngg = key->ge.as_ngg; + } + main_part->is_monolithic = false; + + if (!si_compile_shader(sscreen, compiler_state->compiler, main_part, + &compiler_state->debug)) { + FREE(main_part); + return false; + } + *mainp = main_part; + } + return true; +} + +/* A helper to copy *key to *local_key and return local_key. */ +template<typename SHADER_KEY_TYPE> +static ALWAYS_INLINE const SHADER_KEY_TYPE * +use_local_key_copy(const SHADER_KEY_TYPE *key, SHADER_KEY_TYPE *local_key, unsigned key_size) +{ + if (key != local_key) + memcpy(local_key, key, key_size); + + return local_key; +} + +#define NO_INLINE_UNIFORMS false + +/** + * Select a shader variant according to the shader key. + * + * \param optimized_or_none If the key describes an optimized shader variant and + * the compilation isn't finished, don't select any + * shader and return an error.
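+ * + * \return 0 on success, -1 when the draw call should be skipped (either + * optimized_or_none applied or compilation failed) and -ENOMEM on + * allocation failure.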
+ * + * This uses a C++ template to compute the optimal memcmp size at compile time, which is important + * for getting inlined memcmp. The memcmp size depends on the shader key type and whether inlined + * uniforms are enabled. + */ +template<bool INLINE_UNIFORMS = false, typename SHADER_KEY_TYPE> +static int si_shader_select_with_key(struct si_context *sctx, struct si_shader_ctx_state *state, + const SHADER_KEY_TYPE *key, int thread_index, + bool optimized_or_none) +{ + struct si_screen *sscreen = sctx->screen; + struct si_shader_selector *sel = state->cso; + struct si_shader_selector *previous_stage_sel = NULL; + struct si_shader *current = state->current; + struct si_shader *iter, *shader = NULL; + const SHADER_KEY_TYPE *zeroed_key = (SHADER_KEY_TYPE*)&zeroed; + + /* "opt" must be the last field and "inlined_uniform_values" must be the last field inside opt. + * If there is padding, insert the padding manually before opt or inside opt. + */ + STATIC_ASSERT(offsetof(SHADER_KEY_TYPE, opt) + sizeof(key->opt) == sizeof(*key)); + STATIC_ASSERT(offsetof(SHADER_KEY_TYPE, opt.inlined_uniform_values) + + sizeof(key->opt.inlined_uniform_values) == sizeof(*key)); + + const unsigned key_size_no_uniforms = sizeof(*key) - sizeof(key->opt.inlined_uniform_values); + /* Don't compare inlined_uniform_values if uniform inlining is disabled. */ + const unsigned key_size = INLINE_UNIFORMS ? sizeof(*key) : key_size_no_uniforms; + const unsigned key_opt_size = + INLINE_UNIFORMS ? sizeof(key->opt) : + sizeof(key->opt) - sizeof(key->opt.inlined_uniform_values); + + /* si_shader_select_with_key must not modify 'key' because it would affect future shaders. + * If we need to modify it for this specific shader (eg: to disable optimizations), we + * use a copy. + */ + SHADER_KEY_TYPE local_key; + + if (unlikely(sscreen->debug_flags & DBG(NO_OPT_VARIANT))) { + /* Disable shader variant optimizations. */ + key = use_local_key_copy(key, &local_key, key_size); + memset(&local_key.opt, 0, key_opt_size); + } + +again: + /* Check if we don't need to change anything. + * This path is also used for most shaders that don't need multiple + * variants; it will cost just a computation of the key and this + * test. */ + if (likely(current && memcmp(&current->key, key, key_size) == 0)) { + if (unlikely(!util_queue_fence_is_signalled(&current->ready))) { + if (current->is_optimized) { + if (optimized_or_none) + return -1; + + key = use_local_key_copy(key, &local_key, key_size); + memset(&local_key.opt, 0, key_opt_size); + goto current_not_ready; + } + + util_queue_fence_wait(&current->ready); + } + + return current->compilation_failed ? -1 : 0; + } +current_not_ready: + + /* This must be done before the mutex is locked, because async GS + * compilation calls this function too, and therefore must enter + * the mutex first. + * + * Only wait if we are in a draw call. Don't wait if we are + * in a compiler thread. + */ + if (thread_index < 0) + util_queue_fence_wait(&sel->ready); + + simple_mtx_lock(&sel->mutex); + + int variant_count = 0; + const int max_inline_uniforms_variants = 5; + + /* Find the shader variant. */ + for (iter = sel->first_variant; iter; iter = iter->next_variant) { + const SHADER_KEY_TYPE *iter_key = (const SHADER_KEY_TYPE *)&iter->key; + + if (memcmp(iter_key, key, key_size_no_uniforms) == 0) { + /* Check the inlined uniform values separately, and count + * the number of variants based on them.
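+ * E.g. a uniform that has been inlined with six distinct values so far + * would have produced six variants; once variant_count exceeds + * max_inline_uniforms_variants, inlining is turned off for this shader + * and the non-inlined variant is used instead.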
+ */ + if (key->opt.inline_uniforms && + memcmp(iter_key->opt.inlined_uniform_values, + key->opt.inlined_uniform_values, + MAX_INLINABLE_UNIFORMS * 4) != 0) { + if (variant_count++ > max_inline_uniforms_variants) { + key = use_local_key_copy(key, &local_key, key_size); + /* Too many variants. Disable inlining for this shader. */ + local_key.opt.inline_uniforms = 0; + memset(local_key.opt.inlined_uniform_values, 0, MAX_INLINABLE_UNIFORMS * 4); + simple_mtx_unlock(&sel->mutex); + goto again; + } + continue; + } + + simple_mtx_unlock(&sel->mutex); + + if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { + /* If it's an optimized shader and its compilation has + * been started but isn't done, use the unoptimized + * shader so as not to cause a stall due to compilation. + */ + if (iter->is_optimized) { + if (optimized_or_none) + return -1; + + key = use_local_key_copy(key, &local_key, key_size); + memset(&local_key.opt, 0, key_opt_size); + goto again; + } + + util_queue_fence_wait(&iter->ready); + } + + if (iter->compilation_failed) { + return -1; /* skip the draw call */ + } + + state->current = iter; + return 0; + } + } + + /* Build a new shader. */ + shader = CALLOC_STRUCT(si_shader); + if (!shader) { + simple_mtx_unlock(&sel->mutex); + return -ENOMEM; + } + + util_queue_fence_init(&shader->ready); + + if (!sctx->compiler.passes) + si_init_compiler(sctx->screen, &sctx->compiler); + + shader->selector = sel; + *((SHADER_KEY_TYPE*)&shader->key) = *key; + shader->compiler_ctx_state.compiler = &sctx->compiler; + shader->compiler_ctx_state.debug = sctx->debug; + shader->compiler_ctx_state.is_debug_context = sctx->is_debug; + + /* If this is a merged shader, get the first shader's selector. */ + if (sscreen->info.chip_class >= GFX9) { + if (sel->info.stage == MESA_SHADER_TESS_CTRL) + previous_stage_sel = ((struct si_shader_key_ge*)key)->part.tcs.ls; + else if (sel->info.stage == MESA_SHADER_GEOMETRY) + previous_stage_sel = ((struct si_shader_key_ge*)key)->part.gs.es; + + /* We need to wait for the previous shader. */ + if (previous_stage_sel && thread_index < 0) + util_queue_fence_wait(&previous_stage_sel->ready); + } + + bool is_pure_monolithic = + sscreen->use_monolithic_shaders || memcmp(&key->mono, &zeroed_key->mono, sizeof(key->mono)) != 0; + + /* Compile the main shader part if it doesn't exist. This can happen + * if the initial guess was wrong. + */ + if (!is_pure_monolithic) { + bool ok = true; + + /* Make sure the main shader part is present. This is needed + * for shaders that can be compiled as VS, LS, or ES, and only + * one of them is compiled at creation. + * + * It is also needed for GS, which can be compiled as non-NGG + * and NGG. + * + * For merged shaders, check that the starting shader's main + * part is present. 
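+ * E.g. on GFX9 a merged TCS needs the LS main part of the bound VS + * (as_ls = 1) and a merged GS needs the ES main part of the bound VS or + * TES (as_es = 1); that is exactly what shader1_key encodes below.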
+ */ + if (previous_stage_sel) { + union si_shader_key shader1_key = zeroed; + + if (sel->info.stage == MESA_SHADER_TESS_CTRL) { + shader1_key.ge.as_ls = 1; + } else if (sel->info.stage == MESA_SHADER_GEOMETRY) { + shader1_key.ge.as_es = 1; + shader1_key.ge.as_ngg = ((struct si_shader_key_ge*)key)->as_ngg; /* for Wave32 vs Wave64 */ + } else { + assert(0); + } + + simple_mtx_lock(&previous_stage_sel->mutex); + ok = si_check_missing_main_part(sscreen, previous_stage_sel, &shader->compiler_ctx_state, + &shader1_key); + simple_mtx_unlock(&previous_stage_sel->mutex); + } + + if (ok) { + ok = si_check_missing_main_part(sscreen, sel, &shader->compiler_ctx_state, + (union si_shader_key*)key); + } + + if (!ok) { + FREE(shader); + simple_mtx_unlock(&sel->mutex); + return -ENOMEM; /* skip the draw call */ + } + } + + /* Keep the reference to the 1st shader of merged shaders, so that + * Gallium can't destroy it before we destroy the 2nd shader. + * + * Set sctx = NULL, because it's unused if we're not releasing + * the shader, and we don't have any sctx here. + */ + si_shader_selector_reference(NULL, &shader->previous_stage_sel, previous_stage_sel); + + /* Monolithic-only shaders don't make a distinction between optimized + * and unoptimized. */ + shader->is_monolithic = + is_pure_monolithic || memcmp(&key->opt, &zeroed_key->opt, key_opt_size) != 0; + + shader->is_optimized = !is_pure_monolithic && + memcmp(&key->opt, &zeroed_key->opt, key_opt_size) != 0; + + /* If it's an optimized shader, compile it asynchronously. */ + if (shader->is_optimized && thread_index < 0) { + /* Compile it asynchronously. */ + util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, shader, &shader->ready, + si_build_shader_variant_low_priority, NULL, 0); + + /* Add only after the ready fence was reset, to guard against a + * race with si_bind_XX_shader. */ + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + + /* Use the default (unoptimized) shader for now. */ + key = use_local_key_copy(key, &local_key, key_size); + memset(&local_key.opt, 0, key_opt_size); + simple_mtx_unlock(&sel->mutex); + + if (sscreen->options.sync_compile) + util_queue_fence_wait(&shader->ready); + + if (optimized_or_none) + return -1; + goto again; + } + + /* Reset the fence before adding to the variant list. */ + util_queue_fence_reset(&shader->ready); + + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + + simple_mtx_unlock(&sel->mutex); + + assert(!shader->is_optimized); + si_build_shader_variant(shader, thread_index, false); + + util_queue_fence_signal(&shader->ready); + + if (!shader->compilation_failed) + state->current = shader; + + return shader->compilation_failed ? 
-1 : 0; +} + +int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + + si_shader_selector_key(ctx, state->cso, &state->key); + + if (state->cso->info.stage == MESA_SHADER_FRAGMENT) { + if (state->key.ps.opt.inline_uniforms) + return si_shader_select_with_key<true>(sctx, state, &state->key.ps, -1, false); + else + return si_shader_select_with_key<false>(sctx, state, &state->key.ps, -1, false); + } else { + if (state->key.ge.opt.inline_uniforms) { + return si_shader_select_with_key<true>(sctx, state, &state->key.ge, -1, false); + } else { + return si_shader_select_with_key<false>(sctx, state, &state->key.ge, -1, false); + } + } +} + +static void si_parse_next_shader_property(const struct si_shader_info *info, bool streamout, + union si_shader_key *key) +{ + gl_shader_stage next_shader = info->base.next_stage; + + switch (info->stage) { + case MESA_SHADER_VERTEX: + switch (next_shader) { + case MESA_SHADER_GEOMETRY: + key->ge.as_es = 1; + break; + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + key->ge.as_ls = 1; + break; + default: + /* If POSITION isn't written, it can only be a HW VS + * if streamout is used. If streamout isn't used, + * assume that it's a HW LS. (the next shader is TCS) + * This heuristic is needed for separate shader objects. + */ + if (!info->writes_position && !streamout) + key->ge.as_ls = 1; + } + break; + + case MESA_SHADER_TESS_EVAL: + if (next_shader == MESA_SHADER_GEOMETRY || !info->writes_position) + key->ge.as_es = 1; + break; + + default:; + } +} + +/** + * Compile the main shader part or the monolithic shader as part of + * si_shader_selector initialization. Since it can be done asynchronously, + * there is no way to report compile failures to applications. + */ +static void si_init_shader_selector_async(void *job, void *gdata, int thread_index) +{ + struct si_shader_selector *sel = (struct si_shader_selector *)job; + struct si_screen *sscreen = sel->screen; + struct ac_llvm_compiler *compiler; + struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug; + + assert(!debug->debug_message || debug->async); + assert(thread_index >= 0); + assert(thread_index < (int)ARRAY_SIZE(sscreen->compiler)); + compiler = &sscreen->compiler[thread_index]; + + if (!compiler->passes) + si_init_compiler(sscreen, compiler); + + /* The GS copy shader is always pre-compiled. */ + if (sel->info.stage == MESA_SHADER_GEOMETRY && + (!sscreen->use_ngg || !sscreen->use_ngg_streamout || /* also for PRIMITIVES_GENERATED */ + sel->tess_turns_off_ngg)) { + sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); + if (!sel->gs_copy_shader) { + fprintf(stderr, "radeonsi: can't create GS copy shader\n"); + return; + } + + si_shader_vs(sscreen, sel->gs_copy_shader, sel); + } + + /* Serialize NIR to save memory. Monolithic shader variants + * have to deserialize NIR before compilation. + */ + if (sel->nir) { + struct blob blob; + size_t size; + + blob_init(&blob); + /* true = remove optional debugging data to increase + * the likelihood of getting more shader cache hits. + * It also drops variable names, so we'll save more memory. + */ + nir_serialize(&blob, sel->nir, true); + blob_finish_get_buffer(&blob, &sel->nir_binary, &size); + sel->nir_size = size; + } + + /* Compile the main shader part for use with a prolog and/or epilog. + * If this fails, the driver will try to compile a monolithic shader + * on demand.
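+ * (The prolog/epilog split is what makes this worthwhile: a VS prolog + * patches things like vertex fetch fix-ups and a PS epilog handles the + * color export format, so one compiled main part can serve many variants + * whose keys differ only in those small parts.)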
+ */ + if (!sscreen->use_monolithic_shaders) { + struct si_shader *shader = CALLOC_STRUCT(si_shader); + unsigned char ir_sha1_cache_key[20]; + + if (!shader) { + fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); + return; + } + + /* We can leave the fence signaled because use of the default + * main part is guarded by the selector's ready fence. */ + util_queue_fence_init(&shader->ready); + + shader->selector = sel; + shader->is_monolithic = false; + si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key); + + if (sel->info.stage <= MESA_SHADER_GEOMETRY && + sscreen->use_ngg && (!sel->so.num_outputs || sscreen->use_ngg_streamout) && + ((sel->info.stage == MESA_SHADER_VERTEX && !shader->key.ge.as_ls) || + sel->info.stage == MESA_SHADER_TESS_EVAL || sel->info.stage == MESA_SHADER_GEOMETRY)) + shader->key.ge.as_ngg = 1; + + if (sel->nir) { + if (sel->info.stage <= MESA_SHADER_GEOMETRY) + si_get_ir_cache_key(sel, shader->key.ge.as_ngg, shader->key.ge.as_es, ir_sha1_cache_key); + else + si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key); + } + + /* Try to load the shader from the shader cache. */ + simple_mtx_lock(&sscreen->shader_cache_mutex); + + if (si_shader_cache_load_shader(sscreen, ir_sha1_cache_key, shader)) { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + si_shader_dump_stats_for_shader_db(sscreen, shader, debug); + } else { + simple_mtx_unlock(&sscreen->shader_cache_mutex); + + /* Compile the shader if it hasn't been loaded from the cache. */ + if (!si_compile_shader(sscreen, compiler, shader, debug)) { + FREE(shader); + fprintf(stderr, "radeonsi: can't compile a main shader part\n"); + return; + } + + simple_mtx_lock(&sscreen->shader_cache_mutex); + si_shader_cache_insert_shader(sscreen, ir_sha1_cache_key, shader, true); + simple_mtx_unlock(&sscreen->shader_cache_mutex); + } + + *si_get_main_shader_part(sel, &shader->key) = shader; + + /* Unset "outputs_written" flags for outputs converted to + * DEFAULT_VAL, so that later inter-shader optimizations don't + * try to eliminate outputs that don't exist in the final + * shader. + * + * This is only done if non-monolithic shaders are enabled. + */ + if ((sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) && + !shader->key.ge.as_ls && !shader->key.ge.as_es) { + unsigned i; + + for (i = 0; i < sel->info.num_outputs; i++) { + unsigned semantic = sel->info.output_semantic[i]; + unsigned ps_input_cntl = shader->info.vs_output_ps_input_cntl[semantic]; + + /* OFFSET=0x20 means DEFAULT_VAL, which means VS doesn't export it. */ + if (G_028644_OFFSET(ps_input_cntl) != 0x20) + continue; + + unsigned id; + + /* Remove the output from the mask. */ + if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && + semantic != VARYING_SLOT_POS && + semantic != VARYING_SLOT_PSIZ && + semantic != VARYING_SLOT_CLIP_VERTEX && + semantic != VARYING_SLOT_EDGE) { + id = si_shader_io_get_unique_index(semantic, true); + sel->outputs_written_before_ps &= ~(1ull << id); + } + } + } + } + + /* Free NIR. We only keep serialized NIR after this point. 
*/ + if (sel->nir) { + ralloc_free(sel->nir); + sel->nir = NULL; + } +} + +void si_schedule_initial_compile(struct si_context *sctx, gl_shader_stage stage, + struct util_queue_fence *ready_fence, + struct si_compiler_ctx_state *compiler_ctx_state, void *job, + util_queue_execute_func execute) +{ + util_queue_fence_init(ready_fence); + + struct util_async_debug_callback async_debug; + bool debug = (sctx->debug.debug_message && !sctx->debug.async) || sctx->is_debug || + si_can_dump_shader(sctx->screen, stage); + + if (debug) { + u_async_debug_init(&async_debug); + compiler_ctx_state->debug = async_debug.base; + } + + util_queue_add_job(&sctx->screen->shader_compiler_queue, job, ready_fence, execute, NULL, 0); + + if (debug) { + util_queue_fence_wait(ready_fence); + u_async_debug_drain(&async_debug, &sctx->debug); + u_async_debug_cleanup(&async_debug); + } + + if (sctx->screen->options.sync_compile) + util_queue_fence_wait(ready_fence); +} + +/* Return descriptor slot usage masks from the given shader info. */ +void si_get_active_slot_masks(const struct si_shader_info *info, uint64_t *const_and_shader_buffers, + uint64_t *samplers_and_images) +{ + unsigned start, num_shaderbufs, num_constbufs, num_images, num_msaa_images, num_samplers; + + num_shaderbufs = info->base.num_ssbos; + num_constbufs = info->base.num_ubos; + /* two 8-byte images share one 16-byte slot */ + num_images = align(info->base.num_images, 2); + num_msaa_images = align(util_last_bit(info->base.msaa_images), 2); + num_samplers = BITSET_LAST_BIT(info->base.textures_used); + + /* The layout is: sb[last] ... sb[0], cb[0] ... cb[last] */ + start = si_get_shaderbuf_slot(num_shaderbufs - 1); + *const_and_shader_buffers = u_bit_consecutive64(start, num_shaderbufs + num_constbufs); + + /* The layout is: + * - fmask[last] ... fmask[0] go to [15-last .. 15] + * - image[last] ... image[0] go to [31-last .. 31] + * - sampler[0] ... sampler[last] go to [32 .. 32+last*2] + * + * FMASKs for images are placed separately, because MSAA images are rare, + * and so we can benefit from a better cache hit rate if we keep image + * descriptors together. 
+ */ + if (num_msaa_images) + num_images = SI_NUM_IMAGES + num_msaa_images; /* add FMASK descriptors */ + + start = si_get_image_slot(num_images - 1) / 2; + *samplers_and_images = u_bit_consecutive64(start, num_images / 2 + num_samplers); +} + +static void *si_create_shader_selector(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct si_screen *sscreen = (struct si_screen *)ctx->screen; + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = CALLOC_STRUCT(si_shader_selector); + int i; + + if (!sel) + return NULL; + + sel->screen = sscreen; + sel->compiler_ctx_state.debug = sctx->debug; + sel->compiler_ctx_state.is_debug_context = sctx->is_debug; + + sel->so = state->stream_output; + + if (state->type == PIPE_SHADER_IR_TGSI) { + sel->nir = tgsi_to_nir(state->tokens, ctx->screen, true); + } else { + assert(state->type == PIPE_SHADER_IR_NIR); + sel->nir = (nir_shader*)state->ir.nir; + } + + si_nir_scan_shader(sel->nir, &sel->info); + + const enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->info.stage); + sel->pipe_shader_type = type; + sel->const_and_shader_buf_descriptors_index = + si_const_and_shader_buffer_descriptors_idx(type); + sel->sampler_and_images_descriptors_index = + si_sampler_and_image_descriptors_idx(type); + + p_atomic_inc(&sscreen->num_shaders_created); + si_get_active_slot_masks(&sel->info, &sel->active_const_and_shader_buffers, + &sel->active_samplers_and_images); + + /* Record which streamout buffers are enabled. */ + for (unsigned i = 0; i < sel->so.num_outputs; i++) { + sel->enabled_streamout_buffer_mask |= (1 << sel->so.output[i].output_buffer) + << (sel->so.output[i].stream * 4); + } + + sel->num_vs_inputs = + sel->info.stage == MESA_SHADER_VERTEX && !sel->info.base.vs.blit_sgprs_amd + ? sel->info.num_inputs + : 0; + unsigned num_vbos_in_sgprs = si_num_vbos_in_user_sgprs_inline(sscreen->info.chip_class); + sel->num_vbos_in_user_sgprs = MIN2(sel->num_vs_inputs, num_vbos_in_sgprs); + + /* The prolog is a no-op if there are no inputs. */ + sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && + !sel->info.base.vs.blit_sgprs_amd; + + if (sel->info.stage == MESA_SHADER_VERTEX || + sel->info.stage == MESA_SHADER_TESS_CTRL || + sel->info.stage == MESA_SHADER_TESS_EVAL || + sel->info.stage == MESA_SHADER_GEOMETRY) { + if (sel->info.stage == MESA_SHADER_TESS_CTRL) { + /* Always reserve space for these. */ + sel->patch_outputs_written |= + (1ull << si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER)) | + (1ull << si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER)); + } + for (i = 0; i < sel->info.num_outputs; i++) { + unsigned semantic = sel->info.output_semantic[i]; + + if (semantic == VARYING_SLOT_TESS_LEVEL_INNER || + semantic == VARYING_SLOT_TESS_LEVEL_OUTER || + (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX)) { + sel->patch_outputs_written |= 1ull << si_shader_io_get_unique_index_patch(semantic); + } else if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) && + semantic != VARYING_SLOT_EDGE) { + sel->outputs_written |= 1ull << si_shader_io_get_unique_index(semantic, false); + + /* Ignore outputs that are not passed from VS to PS. 
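+ * (POS, PSIZ and CLIP_VERTEX are consumed by fixed-function hardware + * rather than read by the PS, hence the exclusions below.)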
*/ + if (semantic != VARYING_SLOT_POS && + semantic != VARYING_SLOT_PSIZ && + semantic != VARYING_SLOT_CLIP_VERTEX) { + sel->outputs_written_before_ps |= 1ull + << si_shader_io_get_unique_index(semantic, true); + } + } + } + } + + switch (sel->info.stage) { + case MESA_SHADER_GEOMETRY: + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + sel->rast_prim = (enum pipe_prim_type)sel->info.base.gs.output_primitive; + if (util_rast_prim_is_triangles(sel->rast_prim)) + sel->rast_prim = PIPE_PRIM_TRIANGLES; + + sel->gsvs_vertex_size = sel->info.num_outputs * 16; + sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->info.base.gs.vertices_out; + sel->gs_input_verts_per_prim = + u_vertices_per_prim((enum pipe_prim_type)sel->info.base.gs.input_primitive); + + /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tessellation, so + * we can't split workgroups. Disable NGG if any of the following conditions is true: + * - num_invocations * gs.vertices_out > 256 + * - LDS usage is too high + */ + sel->tess_turns_off_ngg = sscreen->info.chip_class >= GFX10 && + (sel->info.base.gs.invocations * sel->info.base.gs.vertices_out > 256 || + sel->info.base.gs.invocations * sel->info.base.gs.vertices_out * + (sel->info.num_outputs * 4 + 1) > 6500 /* max dw per GS primitive */); + break; + + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; + sel->lshs_vertex_stride = sel->esgs_itemsize; + + /* Add 1 dword to reduce LDS bank conflicts, so that each vertex + * will start on a different bank. (except for the maximum 32*16). + */ + if (sel->lshs_vertex_stride < 32 * 16) + sel->lshs_vertex_stride += 4; + + /* For the ESGS ring in LDS, add 1 dword to reduce LDS bank + * conflicts, i.e. each vertex will start at a different bank.
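+ * (Illustrative reasoning, assuming the usual 32 banks of 4 bytes: an + * itemsize that is a multiple of 32 dwords would place the same + * attribute of consecutive vertices in the same bank; the extra dword + * staggers them.)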
+       */
+      if (sctx->chip_class >= GFX9)
+         sel->esgs_itemsize += 4;
+
+      assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
+
+      sel->tcs_vgpr_only_inputs = ~sel->info.base.tess.tcs_cross_invocation_inputs_read &
+                                  ~sel->info.base.inputs_read_indirectly &
+                                  sel->info.base.inputs_read;
+
+      /* Only for TES: */
+      if (sel->info.stage == MESA_SHADER_TESS_EVAL) {
+         if (sel->info.base.tess.point_mode)
+            sel->rast_prim = PIPE_PRIM_POINTS;
+         else if (sel->info.base.tess.primitive_mode == GL_LINES)
+            sel->rast_prim = PIPE_PRIM_LINE_STRIP;
+         else
+            sel->rast_prim = PIPE_PRIM_TRIANGLES;
+      } else {
+         sel->rast_prim = PIPE_PRIM_TRIANGLES;
+      }
+      break;
+
+   case MESA_SHADER_FRAGMENT:
+      for (i = 0; i < sel->info.num_inputs; i++) {
+         unsigned semantic = sel->info.input[i].semantic;
+
+         if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
+             semantic != VARYING_SLOT_PNTC) {
+            sel->inputs_read |= 1ull << si_shader_io_get_unique_index(semantic, true);
+         }
+      }
+
+      for (i = 0; i < 8; i++)
+         if (sel->info.colors_written & (1 << i))
+            sel->colors_written_4bit |= 0xf << (4 * i);
+
+      for (i = 0; i < sel->info.num_inputs; i++) {
+         if (sel->info.input[i].semantic == VARYING_SLOT_COL0)
+            sel->color_attr_index[0] = i;
+         else if (sel->info.input[i].semantic == VARYING_SLOT_COL1)
+            sel->color_attr_index[1] = i;
+      }
+      break;
+   default:;
+   }
+
+   bool ngg_culling_allowed =
+      sscreen->info.chip_class >= GFX10 &&
+      sscreen->use_ngg_culling &&
+      (sel->info.stage == MESA_SHADER_VERTEX ||
+       sel->info.stage == MESA_SHADER_TESS_EVAL) &&
+      sel->info.writes_position &&
+      !sel->info.writes_viewport_index && /* cull only against viewport 0 */
+      !sel->info.base.writes_memory && !sel->so.num_outputs &&
+      (sel->info.stage != MESA_SHADER_VERTEX ||
+       (!sel->info.base.vs.blit_sgprs_amd &&
+        !sel->info.base.vs.window_space_position));
+
+   sel->ngg_cull_vert_threshold = UINT_MAX; /* disabled (changed below) */
+
+   if (ngg_culling_allowed) {
+      if (sel->info.stage == MESA_SHADER_VERTEX) {
+         if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL))
+            sel->ngg_cull_vert_threshold = 0; /* always enabled */
+         else
+            sel->ngg_cull_vert_threshold = 128;
+      } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) {
+         if (sel->rast_prim != PIPE_PRIM_POINTS)
+            sel->ngg_cull_vert_threshold = 0; /* always enabled */
+      }
+   }
+
+   sel->clipdist_mask = sel->info.writes_clipvertex ? SIX_BITS :
+                        u_bit_consecutive(0, sel->info.base.clip_distance_array_size);
+   sel->culldist_mask = u_bit_consecutive(0, sel->info.base.cull_distance_array_size) <<
+                        sel->info.base.clip_distance_array_size;
+
+   /* DB_SHADER_CONTROL */
+   sel->db_shader_control = S_02880C_Z_EXPORT_ENABLE(sel->info.writes_z) |
+                            S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(sel->info.writes_stencil) |
+                            S_02880C_MASK_EXPORT_ENABLE(sel->info.writes_samplemask) |
+                            S_02880C_KILL_ENABLE(sel->info.base.fs.uses_discard);
+
+   if (sel->info.stage == MESA_SHADER_FRAGMENT) {
+      switch (sel->info.base.fs.depth_layout) {
+      case FRAG_DEPTH_LAYOUT_GREATER:
+         sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+         break;
+      case FRAG_DEPTH_LAYOUT_LESS:
+         sel->db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+         break;
+      default:;
+      }
+
+      /* Z_ORDER, EXEC_ON_HIER_FAIL and EXEC_ON_NOOP should be set as follows:
+       *
+       *   | early Z/S | writes_mem | allow_ReZ? | Z_ORDER            | EXEC_ON_HIER_FAIL | EXEC_ON_NOOP
+       * --|-----------|------------|------------|--------------------|-------------------|-------------
+       * 1a| false     | false      | true       | EarlyZ_Then_ReZ    | 0                 | 0
+       * 1b| false     | false      | false      | EarlyZ_Then_LateZ  | 0                 | 0
+       * 2 | false     | true       | n/a        | LateZ              | 1                 | 0
+       * 3 | true      | false      | n/a        | EarlyZ_Then_LateZ  | 0                 | 0
+       * 4 | true      | true       | n/a        | EarlyZ_Then_LateZ  | 0                 | 1
+       *
+       * In cases 3 and 4, HW will force Z_ORDER to EarlyZ regardless of what's set in the register.
+       * In case 2, NOOP_CULL is a don't-care field. In cases 2, 3, and 4, ReZ doesn't make sense.
+       *
+       * Don't use ReZ without profiling!
+       *
+       * ReZ decreases performance by 15% in DiRT: Showdown on Ultra settings, which has pretty complex
+       * shaders.
+       */
+      if (sel->info.base.fs.early_fragment_tests) {
+         /* Cases 3, 4. */
+         sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1) |
+                                   S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
+                                   S_02880C_EXEC_ON_NOOP(sel->info.base.writes_memory);
+      } else if (sel->info.base.writes_memory) {
+         /* Case 2. */
+         sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z) | S_02880C_EXEC_ON_HIER_FAIL(1);
+      } else {
+         /* Case 1. */
+         sel->db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
+      }
+
+      if (sel->info.base.fs.post_depth_coverage)
+         sel->db_shader_control |= S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(1);
+   }
+
+   (void)simple_mtx_init(&sel->mutex, mtx_plain);
+
+   si_schedule_initial_compile(sctx, sel->info.stage, &sel->ready, &sel->compiler_ctx_state,
+                               sel, si_init_shader_selector_async);
+   return sel;
+}
+
+static void *si_create_shader(struct pipe_context *ctx, const struct pipe_shader_state *state)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+   bool cache_hit;
+   struct si_shader_selector *sel = (struct si_shader_selector *)util_live_shader_cache_get(
+      ctx, &sscreen->live_shader_cache, state, &cache_hit);
+
+   if (sel && cache_hit && sctx->debug.debug_message) {
+      if (sel->main_shader_part)
+         si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part, &sctx->debug);
+      if (sel->main_shader_part_ls)
+         si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ls, &sctx->debug);
+      if (sel->main_shader_part_es)
+         si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_es, &sctx->debug);
+      if (sel->main_shader_part_ngg)
+         si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg, &sctx->debug);
+      if (sel->main_shader_part_ngg_es)
+         si_shader_dump_stats_for_shader_db(sscreen, sel->main_shader_part_ngg_es, &sctx->debug);
+   }
+   return sel;
+}
+
+static void si_update_streamout_state(struct si_context *sctx)
+{
+   struct si_shader_selector *shader_with_so = si_get_vs(sctx)->cso;
+
+   if (!shader_with_so)
+      return;
+
+   sctx->streamout.enabled_stream_buffers_mask = shader_with_so->enabled_streamout_buffer_mask;
+   sctx->streamout.stride_in_dw = shader_with_so->so.stride;
+}
+
+static void si_update_clip_regs(struct si_context *sctx, struct si_shader_selector *old_hw_vs,
+                                struct si_shader *old_hw_vs_variant,
+                                struct si_shader_selector *next_hw_vs,
+                                struct si_shader *next_hw_vs_variant)
+{
+   if (next_hw_vs &&
+       (!old_hw_vs ||
+        (old_hw_vs->info.stage == MESA_SHADER_VERTEX && old_hw_vs->info.base.vs.window_space_position) !=
+        (next_hw_vs->info.stage == MESA_SHADER_VERTEX && next_hw_vs->info.base.vs.window_space_position) ||
+        old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
+        old_hw_vs->culldist_mask !=
next_hw_vs->culldist_mask || !old_hw_vs_variant || + !next_hw_vs_variant || + old_hw_vs_variant->pa_cl_vs_out_cntl != next_hw_vs_variant->pa_cl_vs_out_cntl)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs); +} + +static void si_update_rasterized_prim(struct si_context *sctx) +{ + enum pipe_prim_type rast_prim; + + if (sctx->shader.gs.cso) { + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + rast_prim = sctx->shader.gs.cso->rast_prim; + } else if (sctx->shader.tes.cso) { + /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */ + rast_prim = sctx->shader.tes.cso->rast_prim; + } else { + /* Determined by draw calls. */ + return; + } + + if (rast_prim != sctx->current_rast_prim) { + if (util_prim_is_points_or_lines(sctx->current_rast_prim) != + util_prim_is_points_or_lines(rast_prim)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband); + + sctx->current_rast_prim = rast_prim; + } +} + +static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel, + enum pipe_shader_type type) +{ + si_set_active_descriptors_for_shader(sctx, sel); + + sctx->uses_bindless_samplers = si_shader_uses_bindless_samplers(sctx->shader.vs.cso) || + si_shader_uses_bindless_samplers(sctx->shader.gs.cso) || + si_shader_uses_bindless_samplers(sctx->shader.ps.cso) || + si_shader_uses_bindless_samplers(sctx->shader.tcs.cso) || + si_shader_uses_bindless_samplers(sctx->shader.tes.cso); + sctx->uses_bindless_images = si_shader_uses_bindless_images(sctx->shader.vs.cso) || + si_shader_uses_bindless_images(sctx->shader.gs.cso) || + si_shader_uses_bindless_images(sctx->shader.ps.cso) || + si_shader_uses_bindless_images(sctx->shader.tcs.cso) || + si_shader_uses_bindless_images(sctx->shader.tes.cso); + + if (type == PIPE_SHADER_VERTEX || type == PIPE_SHADER_TESS_EVAL || type == PIPE_SHADER_GEOMETRY) + sctx->ngg_culling = 0; /* this will be enabled on the first draw if needed */ + + si_invalidate_inlinable_uniforms(sctx, type); + sctx->do_update_shaders = true; +} + +static void si_bind_vs_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs(sctx)->current; + struct si_shader_selector *sel = (struct si_shader_selector*)state; + + if (sctx->shader.vs.cso == sel) + return; + + sctx->shader.vs.cso = sel; + sctx->shader.vs.current = sel ? sel->first_variant : NULL; + sctx->num_vs_blit_sgprs = sel ? sel->info.base.vs.blit_sgprs_amd : 0; + sctx->vs_uses_draw_id = sel ? sel->info.uses_drawid : false; + sctx->fixed_func_tcs_shader.key.ge.mono.u.ff_tcs_inputs_to_copy = sel ? 
sel->outputs_written : 0; + + if (si_update_ngg(sctx)) + si_shader_change_notify(sctx); + + si_update_common_shader_state(sctx, sel, PIPE_SHADER_VERTEX); + si_select_draw_vbo(sctx); + si_update_vs_viewport_state(sctx); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs(sctx)->current); + si_update_rasterized_prim(sctx); + si_vs_key_update_inputs(sctx); +} + +static void si_update_tess_uses_prim_id(struct si_context *sctx) +{ + sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id = + (sctx->shader.tes.cso && sctx->shader.tes.cso->info.uses_primid) || + (sctx->shader.tcs.cso && sctx->shader.tcs.cso->info.uses_primid) || + (sctx->shader.gs.cso && sctx->shader.gs.cso->info.uses_primid) || + (sctx->shader.ps.cso && !sctx->shader.gs.cso && sctx->shader.ps.cso->info.uses_primid); +} + +bool si_update_ngg(struct si_context *sctx) +{ + if (!sctx->screen->use_ngg) { + assert(!sctx->ngg); + return false; + } + + bool new_ngg = true; + + if (sctx->shader.gs.cso && sctx->shader.tes.cso && sctx->shader.gs.cso->tess_turns_off_ngg) { + new_ngg = false; + } else if (!sctx->screen->use_ngg_streamout) { + struct si_shader_selector *last = si_get_vs(sctx)->cso; + + if ((last && last->so.num_outputs) || sctx->streamout.prims_gen_query_enabled) + new_ngg = false; + } + + if (new_ngg != sctx->ngg) { + /* Transitioning from NGG to legacy GS requires VGT_FLUSH on Navi10-14. + * VGT_FLUSH is also emitted at the beginning of IBs when legacy GS ring + * pointers are set. + */ + if (sctx->screen->info.has_vgt_flush_ngg_legacy_bug && !new_ngg) { + sctx->flags |= SI_CONTEXT_VGT_FLUSH; + if (sctx->chip_class == GFX10) { + /* Workaround for https://gitlab.freedesktop.org/mesa/mesa/-/issues/2941 */ + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + } + + sctx->ngg = new_ngg; + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + si_select_draw_vbo(sctx); + return true; + } + return false; +} + +static void si_bind_gs_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs(sctx)->current; + struct si_shader_selector *sel = (struct si_shader_selector*)state; + bool enable_changed = !!sctx->shader.gs.cso != !!sel; + bool ngg_changed; + + if (sctx->shader.gs.cso == sel) + return; + + sctx->shader.gs.cso = sel; + sctx->shader.gs.current = sel ? 
sel->first_variant : NULL; + sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL; + + si_update_common_shader_state(sctx, sel, PIPE_SHADER_GEOMETRY); + si_select_draw_vbo(sctx); + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + + ngg_changed = si_update_ngg(sctx); + if (ngg_changed || enable_changed) + si_shader_change_notify(sctx); + if (enable_changed) { + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + } + si_update_vs_viewport_state(sctx); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs(sctx)->current); + si_update_rasterized_prim(sctx); +} + +static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector*)state; + bool enable_changed = !!sctx->shader.tcs.cso != !!sel; + + if (sctx->shader.tcs.cso == sel) + return; + + sctx->shader.tcs.cso = sel; + sctx->shader.tcs.current = sel ? sel->first_variant : NULL; + sctx->shader.tcs.key.ge.part.tcs.epilog.invoc0_tess_factors_are_def = + sel ? sel->info.tessfactors_are_def_in_all_invocs : 0; + si_update_tess_uses_prim_id(sctx); + + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL); + + if (enable_changed) + sctx->last_tcs = NULL; /* invalidate derived tess state */ +} + +static void si_bind_tes_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_hw_vs = si_get_vs(sctx)->cso; + struct si_shader *old_hw_vs_variant = si_get_vs(sctx)->current; + struct si_shader_selector *sel = (struct si_shader_selector*)state; + bool enable_changed = !!sctx->shader.tes.cso != !!sel; + + if (sctx->shader.tes.cso == sel) + return; + + sctx->shader.tes.cso = sel; + sctx->shader.tes.current = sel ? sel->first_variant : NULL; + sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL; + si_update_tess_uses_prim_id(sctx); + + sctx->shader.tcs.key.ge.part.tcs.epilog.prim_mode = + sctx->fixed_func_tcs_shader.key.ge.part.tcs.epilog.prim_mode = + sel ? sel->info.base.tess.primitive_mode : 0; + + sctx->shader.tcs.key.ge.part.tcs.epilog.tes_reads_tess_factors = + sctx->fixed_func_tcs_shader.key.ge.part.tcs.epilog.tes_reads_tess_factors = + sel ? 
sel->info.reads_tess_factors : 0; + + si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL); + si_select_draw_vbo(sctx); + sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */ + + bool ngg_changed = si_update_ngg(sctx); + if (ngg_changed || enable_changed) + si_shader_change_notify(sctx); + if (enable_changed) + sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ + si_update_vs_viewport_state(sctx); + si_update_streamout_state(sctx); + si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso, + si_get_vs(sctx)->current); + si_update_rasterized_prim(sctx); +} + +void si_update_ps_kill_enable(struct si_context *sctx) +{ + if (!sctx->shader.ps.cso) + return; + + unsigned db_shader_control = sctx->shader.ps.cso->db_shader_control | + S_02880C_KILL_ENABLE(sctx->queued.named.dsa->alpha_func != PIPE_FUNC_ALWAYS); + + if (sctx->ps_db_shader_control != db_shader_control) { + sctx->ps_db_shader_control = db_shader_control; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + if (sctx->screen->dpbb_allowed) + si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state); + } +} + +void si_update_vrs_flat_shading(struct si_context *sctx) +{ + if (sctx->chip_class >= GFX10_3 && sctx->shader.ps.cso) { + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_info *info = &sctx->shader.ps.cso->info; + bool allow_flat_shading = info->allow_flat_shading; + + if (allow_flat_shading && + (rs->line_smooth || rs->poly_smooth || rs->poly_stipple_enable || + (!rs->flatshade && info->uses_interp_color))) + allow_flat_shading = false; + + if (sctx->allow_flat_shading != allow_flat_shading) { + sctx->allow_flat_shading = allow_flat_shading; + si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state); + } + } +} + +static void si_bind_ps_shader(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *old_sel = sctx->shader.ps.cso; + struct si_shader_selector *sel = (struct si_shader_selector*)state; + + /* skip if supplied shader is one already in use */ + if (old_sel == sel) + return; + + sctx->shader.ps.cso = sel; + sctx->shader.ps.current = sel ? 
sel->first_variant : NULL; + + si_update_common_shader_state(sctx, sel, PIPE_SHADER_FRAGMENT); + if (sel) { + if (sctx->ia_multi_vgt_param_key.u.uses_tess) + si_update_tess_uses_prim_id(sctx); + + if (!old_sel || old_sel->info.colors_written != sel->info.colors_written) + si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state); + + if (sctx->screen->has_out_of_order_rast && + (!old_sel || old_sel->info.base.writes_memory != sel->info.base.writes_memory || + old_sel->info.base.fs.early_fragment_tests != + sel->info.base.fs.early_fragment_tests)) + si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config); + } + si_update_ps_colorbuf0_slot(sctx); + + si_ps_key_update_framebuffer(sctx); + si_ps_key_update_framebuffer_blend(sctx); + si_ps_key_update_blend_rasterizer(sctx); + si_ps_key_update_rasterizer(sctx); + si_ps_key_update_dsa(sctx); + si_ps_key_update_sample_shading(sctx); + si_ps_key_update_framebuffer_rasterizer_sample_shading(sctx); + si_update_ps_inputs_read_or_disabled(sctx); + si_update_ps_kill_enable(sctx); + si_update_vrs_flat_shading(sctx); +} + +static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) +{ + if (shader->is_optimized) { + util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, &shader->ready); + } + + util_queue_fence_destroy(&shader->ready); + + /* If destroyed shaders were not unbound, the next compiled + * shader variant could get the same pointer address and so + * binding it to the same shader stage would be considered + * a no-op, causing random behavior. + */ + int state_index = -1; + + switch (shader->selector->info.stage) { + case MESA_SHADER_VERTEX: + if (shader->key.ge.as_ls) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(ls); + } else if (shader->key.ge.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.ge.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); + } + break; + case MESA_SHADER_TESS_CTRL: + state_index = SI_STATE_IDX(hs); + break; + case MESA_SHADER_TESS_EVAL: + if (shader->key.ge.as_es) { + if (sctx->chip_class <= GFX8) + state_index = SI_STATE_IDX(es); + } else if (shader->key.ge.as_ngg) { + state_index = SI_STATE_IDX(gs); + } else { + state_index = SI_STATE_IDX(vs); + } + break; + case MESA_SHADER_GEOMETRY: + if (shader->is_gs_copy_shader) + state_index = SI_STATE_IDX(vs); + else + state_index = SI_STATE_IDX(gs); + break; + case MESA_SHADER_FRAGMENT: + state_index = SI_STATE_IDX(ps); + break; + default:; + } + + si_shader_selector_reference(sctx, &shader->previous_stage_sel, NULL); + si_shader_destroy(shader); + si_pm4_free_state(sctx, &shader->pm4, state_index); +} + +static void si_destroy_shader_selector(struct pipe_context *ctx, void *cso) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)cso; + struct si_shader *p = sel->first_variant, *c; + enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->info.stage); + + util_queue_drop_job(&sctx->screen->shader_compiler_queue, &sel->ready); + + if (sctx->shaders[type].cso == sel) { + sctx->shaders[type].cso = NULL; + sctx->shaders[type].current = NULL; + } + + while (p) { + c = p->next_variant; + si_delete_shader(sctx, p); + p = c; + } + + if (sel->main_shader_part) + si_delete_shader(sctx, sel->main_shader_part); + if (sel->main_shader_part_ls) + si_delete_shader(sctx, sel->main_shader_part_ls); + if (sel->main_shader_part_es) + si_delete_shader(sctx, sel->main_shader_part_es); + 
if (sel->main_shader_part_ngg) + si_delete_shader(sctx, sel->main_shader_part_ngg); + if (sel->gs_copy_shader) + si_delete_shader(sctx, sel->gs_copy_shader); + + util_queue_fence_destroy(&sel->ready); + simple_mtx_destroy(&sel->mutex); + ralloc_free(sel->nir); + free(sel->nir_binary); + free(sel); +} + +static void si_delete_shader_selector(struct pipe_context *ctx, void *state) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_shader_selector *sel = (struct si_shader_selector *)state; + + si_shader_selector_reference(sctx, &sel, NULL); +} + +/** + * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that. + */ +static void si_cs_preamble_add_vgt_flush(struct si_context *sctx) +{ + /* We shouldn't get here if registers are shadowed. */ + assert(!sctx->shadowed_regs); + + if (sctx->cs_preamble_has_vgt_flush) + return; + + /* Done by Vulkan before VGT_FLUSH. */ + si_pm4_cmd_add(sctx->cs_preamble_state, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(sctx->cs_preamble_state, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + + /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ + si_pm4_cmd_add(sctx->cs_preamble_state, PKT3(PKT3_EVENT_WRITE, 0, 0)); + si_pm4_cmd_add(sctx->cs_preamble_state, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + sctx->cs_preamble_has_vgt_flush = true; +} + +/** + * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that. + */ +static void si_emit_vgt_flush(struct radeon_cmdbuf *cs) +{ + radeon_begin(cs); + + /* This is required before VGT_FLUSH. */ + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + + /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + radeon_end(); +} + +/* Initialize state related to ESGS / GSVS ring buffers */ +bool si_update_gs_ring_buffers(struct si_context *sctx) +{ + struct si_shader_selector *es = + sctx->shader.tes.cso ? sctx->shader.tes.cso : sctx->shader.vs.cso; + struct si_shader_selector *gs = sctx->shader.gs.cso; + struct si_pm4_state *pm4; + + /* Chip constants. */ + unsigned num_se = sctx->screen->info.max_se; + unsigned wave_size = 64; + unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ + /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16. + * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2). + */ + unsigned gs_vertex_reuse = (sctx->chip_class >= GFX8 ? 32 : 16) * num_se; + unsigned alignment = 256 * num_se; + /* The maximum size is 63.999 MB per SE. */ + unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; + + /* Calculate the minimum size. */ + unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * wave_size, alignment); + + /* These are recommended sizes, not minimum sizes. */ + unsigned esgs_ring_size = + max_gs_waves * 2 * wave_size * es->esgs_itemsize * gs->gs_input_verts_per_prim; + unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs->max_gsvs_emit_size; + + min_esgs_ring_size = align(min_esgs_ring_size, alignment); + esgs_ring_size = align(esgs_ring_size, alignment); + gsvs_ring_size = align(gsvs_ring_size, alignment); + + esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); + gsvs_ring_size = MIN2(gsvs_ring_size, max_size); + + /* Some rings don't have to be allocated if shaders don't use them. + * (e.g. 
no varyings between ES and GS or GS and VS)
+    *
+    * GFX9 doesn't have the ESGS ring.
+    */
+   bool update_esgs = sctx->chip_class <= GFX8 && esgs_ring_size &&
+                      (!sctx->esgs_ring || sctx->esgs_ring->width0 < esgs_ring_size);
+   bool update_gsvs =
+      gsvs_ring_size && (!sctx->gsvs_ring || sctx->gsvs_ring->width0 < gsvs_ring_size);
+
+   if (!update_esgs && !update_gsvs)
+      return true;
+
+   if (update_esgs) {
+      pipe_resource_reference(&sctx->esgs_ring, NULL);
+      sctx->esgs_ring =
+         pipe_aligned_buffer_create(sctx->b.screen,
+                                    SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
+                                    PIPE_USAGE_DEFAULT,
+                                    esgs_ring_size, sctx->screen->info.pte_fragment_size);
+      if (!sctx->esgs_ring)
+         return false;
+   }
+
+   if (update_gsvs) {
+      pipe_resource_reference(&sctx->gsvs_ring, NULL);
+      sctx->gsvs_ring =
+         pipe_aligned_buffer_create(sctx->b.screen,
+                                    SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL,
+                                    PIPE_USAGE_DEFAULT,
+                                    gsvs_ring_size, sctx->screen->info.pte_fragment_size);
+      if (!sctx->gsvs_ring)
+         return false;
+   }
+
+   /* Set ring bindings. */
+   if (sctx->esgs_ring) {
+      assert(sctx->chip_class <= GFX8);
+      si_set_ring_buffer(sctx, SI_ES_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, true,
+                         true, 4, 64, 0);
+      si_set_ring_buffer(sctx, SI_GS_RING_ESGS, sctx->esgs_ring, 0, sctx->esgs_ring->width0, false,
+                         false, 0, 0, 0);
+   }
+   if (sctx->gsvs_ring) {
+      si_set_ring_buffer(sctx, SI_RING_GSVS, sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, false,
+                         false, 0, 0, 0);
+   }
+
+   if (sctx->shadowed_regs) {
+      /* These registers will be shadowed, so set them only once. */
+      struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+
+      assert(sctx->chip_class >= GFX7);
+
+      si_emit_vgt_flush(cs);
+
+      radeon_begin(cs);
+
+      /* Set the GS registers. */
+      if (sctx->esgs_ring) {
+         assert(sctx->chip_class <= GFX8);
+         radeon_set_uconfig_reg(R_030900_VGT_ESGS_RING_SIZE,
+                                sctx->esgs_ring->width0 / 256);
+      }
+      if (sctx->gsvs_ring) {
+         radeon_set_uconfig_reg(R_030904_VGT_GSVS_RING_SIZE,
+                                sctx->gsvs_ring->width0 / 256);
+      }
+      radeon_end();
+      return true;
+   }
+
+   /* The codepath without register shadowing. */
+   /* Create the "cs_preamble_gs_rings" state. */
+   pm4 = CALLOC_STRUCT(si_pm4_state);
+   if (!pm4)
+      return false;
+
+   if (sctx->chip_class >= GFX7) {
+      if (sctx->esgs_ring) {
+         assert(sctx->chip_class <= GFX8);
+         si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+      }
+      if (sctx->gsvs_ring)
+         si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+   } else {
+      if (sctx->esgs_ring)
+         si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, sctx->esgs_ring->width0 / 256);
+      if (sctx->gsvs_ring)
+         si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, sctx->gsvs_ring->width0 / 256);
+   }
+
+   /* Set the state. */
+   if (sctx->cs_preamble_gs_rings)
+      si_pm4_free_state(sctx, sctx->cs_preamble_gs_rings, ~0);
+   sctx->cs_preamble_gs_rings = pm4;
+
+   si_cs_preamble_add_vgt_flush(sctx);
+
+   /* Flush the context to re-emit both cs_preamble states. */
+   sctx->initial_gfx_cs_size = 0; /* force flush */
+   si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
+
+   return true;
+}
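+
+/* Worked example of the ring sizing above (illustrative numbers, assumed
+ * rather than taken from this change): on a 4-SE chip, max_gs_waves is
+ * 32 * 4 = 128. With wave_size = 64, an esgs_itemsize of 64 bytes and a
+ * triangle GS input primitive (3 vertices per primitive), the recommended
+ * ESGS ring size is 128 * 2 * 64 * 64 * 3 = 3 MiB before alignment and
+ * clamping; the GSVS size is computed the same way from max_gsvs_emit_size.
+ */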
+
+static void si_shader_lock(struct si_shader *shader)
+{
+   simple_mtx_lock(&shader->selector->mutex);
+   if (shader->previous_stage_sel) {
+      assert(shader->previous_stage_sel != shader->selector);
+      simple_mtx_lock(&shader->previous_stage_sel->mutex);
+   }
+}
+
+static void si_shader_unlock(struct si_shader *shader)
+{
+   if (shader->previous_stage_sel)
+      simple_mtx_unlock(&shader->previous_stage_sel->mutex);
+   simple_mtx_unlock(&shader->selector->mutex);
+}
+
+/**
+ * @returns 1 if \p shader has been updated to use a new scratch buffer
+ *          0 if not
+ *          < 0 if there was a failure
+ */
+static int si_update_scratch_buffer(struct si_context *sctx, struct si_shader *shader)
+{
+   uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
+
+   if (!shader)
+      return 0;
+
+   /* This shader doesn't need a scratch buffer */
+   if (shader->config.scratch_bytes_per_wave == 0)
+      return 0;
+
+   /* Prevent race conditions when updating:
+    * - si_shader::scratch_bo
+    * - si_shader::binary::code
+    * - si_shader::previous_stage::binary::code.
+    */
+   si_shader_lock(shader);
+
+   /* This shader is already configured to use the current
+    * scratch buffer. */
+   if (shader->scratch_bo == sctx->scratch_buffer) {
+      si_shader_unlock(shader);
+      return 0;
+   }
+
+   assert(sctx->scratch_buffer);
+
+   /* Replace the shader bo with a new bo that has the relocs applied. */
+   if (!si_shader_binary_upload(sctx->screen, shader, scratch_va)) {
+      si_shader_unlock(shader);
+      return -1;
+   }
+
+   /* Update the shader state to use the new shader bo. */
+   si_shader_init_pm4_state(sctx->screen, shader);
+
+   si_resource_reference(&shader->scratch_bo, sctx->scratch_buffer);
+
+   si_shader_unlock(shader);
+   return 1;
+}
+
+static struct si_shader *si_get_tcs_current(struct si_context *sctx)
+{
+   if (!sctx->shader.tes.cso)
+      return NULL; /* tessellation disabled */
+
+   return sctx->shader.tcs.cso ? sctx->shader.tcs.current : sctx->fixed_func_tcs_shader.current;
+}
+
+static bool si_update_scratch_relocs(struct si_context *sctx)
+{
+   struct si_shader *tcs = si_get_tcs_current(sctx);
+   int r;
+
+   /* Update the shaders, so that they are using the latest scratch.
+    * The scratch buffer may have been changed since these shaders were
+    * last used, so we still need to try to update them, even if they
+    * require scratch buffers smaller than the current size.
+    */
+   r = si_update_scratch_buffer(sctx, sctx->shader.ps.current);
+   if (r < 0)
+      return false;
+   if (r == 1)
+      si_pm4_bind_state(sctx, ps, sctx->shader.ps.current);
+
+   r = si_update_scratch_buffer(sctx, sctx->shader.gs.current);
+   if (r < 0)
+      return false;
+   if (r == 1)
+      si_pm4_bind_state(sctx, gs, sctx->shader.gs.current);
+
+   r = si_update_scratch_buffer(sctx, tcs);
+   if (r < 0)
+      return false;
+   if (r == 1)
+      si_pm4_bind_state(sctx, hs, tcs);
+
+   /* VS can be bound as LS, ES, or VS. */
+   r = si_update_scratch_buffer(sctx, sctx->shader.vs.current);
+   if (r < 0)
+      return false;
+   if (r == 1) {
+      if (sctx->shader.vs.current->key.ge.as_ls)
+         si_pm4_bind_state(sctx, ls, sctx->shader.vs.current);
+      else if (sctx->shader.vs.current->key.ge.as_es)
+         si_pm4_bind_state(sctx, es, sctx->shader.vs.current);
+      else if (sctx->shader.vs.current->key.ge.as_ngg)
+         si_pm4_bind_state(sctx, gs, sctx->shader.vs.current);
+      else
+         si_pm4_bind_state(sctx, vs, sctx->shader.vs.current);
+   }
+
+   /* TES can be bound as ES or VS.
*/ + r = si_update_scratch_buffer(sctx, sctx->shader.tes.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->shader.tes.current->key.ge.as_es) + si_pm4_bind_state(sctx, es, sctx->shader.tes.current); + else if (sctx->shader.tes.current->key.ge.as_ngg) + si_pm4_bind_state(sctx, gs, sctx->shader.tes.current); + else + si_pm4_bind_state(sctx, vs, sctx->shader.tes.current); + } + + return true; +} + +bool si_update_spi_tmpring_size(struct si_context *sctx, unsigned bytes) +{ + /* SPI_TMPRING_SIZE.WAVESIZE must be constant for each scratch buffer. + * There are 2 cases to handle: + * + * - If the current needed size is less than the maximum seen size, + * use the maximum seen size, so that WAVESIZE remains the same. + * + * - If the current needed size is greater than the maximum seen size, + * the scratch buffer is reallocated, so we can increase WAVESIZE. + * + * Shaders that set SCRATCH_EN=0 don't allocate scratch space. + * Otherwise, the number of waves that can use scratch is + * SPI_TMPRING_SIZE.WAVES. + */ + sctx->max_seen_scratch_bytes_per_wave = MAX2(sctx->max_seen_scratch_bytes_per_wave, bytes); + + unsigned scratch_needed_size = sctx->max_seen_scratch_bytes_per_wave * sctx->scratch_waves; + unsigned spi_tmpring_size; + + if (scratch_needed_size > 0) { + if (!sctx->scratch_buffer || scratch_needed_size > sctx->scratch_buffer->b.b.width0) { + /* Create a bigger scratch buffer */ + si_resource_reference(&sctx->scratch_buffer, NULL); + + sctx->scratch_buffer = si_aligned_buffer_create( + &sctx->screen->b, + SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, + PIPE_USAGE_DEFAULT, scratch_needed_size, + sctx->screen->info.pte_fragment_size); + if (!sctx->scratch_buffer) + return false; + + si_context_add_resource_size(sctx, &sctx->scratch_buffer->b.b); + } + + if (!si_update_scratch_relocs(sctx)) + return false; + } + + /* The LLVM shader backend should be reporting aligned scratch_sizes. */ + assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && + "scratch size should already be aligned correctly."); + + spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | + S_0286E8_WAVESIZE(sctx->max_seen_scratch_bytes_per_wave >> 10); + if (spi_tmpring_size != sctx->spi_tmpring_size) { + sctx->spi_tmpring_size = spi_tmpring_size; + si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); + } + return true; +} + +void si_init_tess_factor_ring(struct si_context *sctx) +{ + assert(!sctx->tess_rings); + assert(((sctx->screen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); + + /* The address must be aligned to 2^19, because the shader only + * receives the high 13 bits. + */ + sctx->tess_rings = pipe_aligned_buffer_create( + sctx->b.screen, SI_RESOURCE_FLAG_32BIT | SI_RESOURCE_FLAG_DRIVER_INTERNAL, PIPE_USAGE_DEFAULT, + sctx->screen->tess_offchip_ring_size + sctx->screen->tess_factor_ring_size, 1 << 19); + if (!sctx->tess_rings) + return; + + if (sctx->screen->info.has_tmz_support) { + sctx->tess_rings_tmz = pipe_aligned_buffer_create( + sctx->b.screen, + PIPE_RESOURCE_FLAG_ENCRYPTED | SI_RESOURCE_FLAG_32BIT | SI_RESOURCE_FLAG_DRIVER_INTERNAL, + PIPE_USAGE_DEFAULT, + sctx->screen->tess_offchip_ring_size + sctx->screen->tess_factor_ring_size, 1 << 19); + } + + uint64_t factor_va = + si_resource(sctx->tess_rings)->gpu_address + sctx->screen->tess_offchip_ring_size; + + if (sctx->shadowed_regs) { + /* These registers will be shadowed, so set them only once. 
*/ + /* TODO: tmz + shadowed_regs support */ + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + + assert(sctx->chip_class >= GFX7); + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(sctx->tess_rings), + RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RINGS); + si_emit_vgt_flush(cs); + + /* Set tessellation registers. */ + radeon_begin(cs); + radeon_set_uconfig_reg(R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); + radeon_set_uconfig_reg(R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); + if (sctx->chip_class >= GFX10) { + radeon_set_uconfig_reg(R_030984_VGT_TF_MEMORY_BASE_HI_UMD, + S_030984_BASE_HI(factor_va >> 40)); + } else if (sctx->chip_class == GFX9) { + radeon_set_uconfig_reg(R_030944_VGT_TF_MEMORY_BASE_HI, + S_030944_BASE_HI(factor_va >> 40)); + } + radeon_set_uconfig_reg(R_03093C_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + radeon_end(); + return; + } + + /* The codepath without register shadowing. */ + si_cs_preamble_add_vgt_flush(sctx); + + /* Append these registers to the init config state. */ + if (sctx->chip_class >= GFX7) { + si_pm4_set_reg(sctx->cs_preamble_state, R_030938_VGT_TF_RING_SIZE, + S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4)); + si_pm4_set_reg(sctx->cs_preamble_state, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8); + if (sctx->chip_class >= GFX10) + si_pm4_set_reg(sctx->cs_preamble_state, R_030984_VGT_TF_MEMORY_BASE_HI_UMD, + S_030984_BASE_HI(factor_va >> 40)); + else if (sctx->chip_class == GFX9) + si_pm4_set_reg(sctx->cs_preamble_state, R_030944_VGT_TF_MEMORY_BASE_HI, + S_030944_BASE_HI(factor_va >> 40)); + si_pm4_set_reg(sctx->cs_preamble_state, R_03093C_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + } else { + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + + si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE, + S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4)); + si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8); + si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + sctx->cs_preamble_tess_rings = pm4; + + if (sctx->screen->info.has_tmz_support) { + pm4 = CALLOC_STRUCT(si_pm4_state); + uint64_t factor_va_tmz = + si_resource(sctx->tess_rings_tmz)->gpu_address + sctx->screen->tess_offchip_ring_size; + si_pm4_set_reg(pm4, R_008988_VGT_TF_RING_SIZE, + S_008988_SIZE(sctx->screen->tess_factor_ring_size / 4)); + si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va_tmz >> 8); + si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, + sctx->screen->vgt_hs_offchip_param); + sctx->cs_preamble_tess_rings_tmz = pm4; + } + } + + /* Flush the context to re-emit the cs_preamble state. + * This is done only once in a lifetime of a context. 
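+    * (Illustrative encoding example with assumed numbers, not from this
+    * change: a factor_va of 0x123456700 programs VGT_TF_MEMORY_BASE with
+    * 0x123456700 >> 8 = 0x01234567 and BASE_HI with 0x123456700 >> 40 = 0,
+    * while a 32 KiB tess factor ring gives SIZE = 32768 / 4 = 8192 dwords.
+    * Zeroing initial_gfx_cs_size below presumably defeats the empty-IB
+    * early-out, so the flush really resubmits and re-emits the preamble.)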
+ */ + sctx->initial_gfx_cs_size = 0; /* force flush */ + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); +} + +struct si_pm4_state *si_build_vgt_shader_config(struct si_screen *screen, union si_vgt_stages_key key) +{ + struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); + uint32_t stages = 0; + + if (key.u.tess) { + stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1); + + if (key.u.gs) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1); + else if (key.u.ngg) + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS); + else + stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS); + } else if (key.u.gs) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1); + } else if (key.u.ngg) { + stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL); + } + + if (key.u.ngg) { + stages |= S_028B54_PRIMGEN_EN(1) | + S_028B54_NGG_WAVE_ID_EN(key.u.streamout) | + S_028B54_PRIMGEN_PASSTHRU_EN(key.u.ngg_passthrough) | + S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key.u.ngg_passthrough && + screen->info.family >= CHIP_DIMGREY_CAVEFISH); + } else if (key.u.gs) + stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); + + if (screen->info.chip_class >= GFX9) + stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2); + + if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) { + stages |= S_028B54_HS_W32_EN(1) | + S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */ + S_028B54_VS_W32_EN(1); + } + + si_pm4_set_reg(pm4, R_028B54_VGT_SHADER_STAGES_EN, stages); + return pm4; +} + +static void si_emit_scratch_state(struct si_context *sctx) +{ + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + + radeon_begin(cs); + radeon_set_context_reg(R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size); + radeon_end(); + + if (sctx->scratch_buffer) { + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->scratch_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SCRATCH_BUFFER); + } +} + +void si_init_screen_live_shader_cache(struct si_screen *sscreen) +{ + util_live_shader_cache_init(&sscreen->live_shader_cache, si_create_shader_selector, + si_destroy_shader_selector); +} + +void si_init_shader_functions(struct si_context *sctx) +{ + sctx->atoms.s.scratch_state.emit = si_emit_scratch_state; + + sctx->b.create_vs_state = si_create_shader; + sctx->b.create_tcs_state = si_create_shader; + sctx->b.create_tes_state = si_create_shader; + sctx->b.create_gs_state = si_create_shader; + sctx->b.create_fs_state = si_create_shader; + + sctx->b.bind_vs_state = si_bind_vs_shader; + sctx->b.bind_tcs_state = si_bind_tcs_shader; + sctx->b.bind_tes_state = si_bind_tes_shader; + sctx->b.bind_gs_state = si_bind_gs_shader; + sctx->b.bind_fs_state = si_bind_ps_shader; + + sctx->b.delete_vs_state = si_delete_shader_selector; + sctx->b.delete_tcs_state = si_delete_shader_selector; + sctx->b.delete_tes_state = si_delete_shader_selector; + sctx->b.delete_gs_state = si_delete_shader_selector; + sctx->b.delete_fs_state = si_delete_shader_selector; +} diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_streamout.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_streamout.c index d9f968bddf..ba5bb233c3 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -199,7 +199,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ } si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf); - si_resource(targets[i]->buffer)->bind_history 
|= PIPE_BIND_STREAM_OUTPUT; + si_resource(targets[i]->buffer)->bind_history |= SI_BIND_STREAMOUT_BUFFER; } else { si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL); } @@ -240,14 +240,14 @@ static void gfx10_emit_streamout_begin(struct si_context *sctx) va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; } - radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); - radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | - S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target)); - radeon_emit(cs, va); - radeon_emit(cs, va >> 32); - radeon_emit(cs, 4 * i); /* destination in GDS */ - radeon_emit(cs, 0); - radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target)); + radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0)); + radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) | + S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target)); + radeon_emit(va); + radeon_emit(va >> 32); + radeon_emit(4 * i); /* destination in GDS */ + radeon_emit(0); + radeon_emit(S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target)); } radeon_end(); @@ -284,23 +284,22 @@ static void si_flush_vgt_streamout(struct si_context *sctx) /* The register is at different places on different ASICs. */ if (sctx->chip_class >= GFX7) { reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL; - radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0); + radeon_set_uconfig_reg(reg_strmout_cntl, 0); } else { reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL; - radeon_set_config_reg(cs, reg_strmout_cntl, 0); + radeon_set_config_reg(reg_strmout_cntl, 0); } - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0)); - radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(cs, - WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(cs, reg_strmout_cntl >> 2); /* register */ - radeon_emit(cs, 0); - radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ - radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ - radeon_emit(cs, 4); /* poll interval */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ + radeon_emit(reg_strmout_cntl >> 2); /* register */ + radeon_emit(0); + radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */ + radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */ + radeon_emit(4); /* poll interval */ radeon_end(); } @@ -324,33 +323,33 @@ static void si_emit_streamout_begin(struct si_context *sctx) /* AMD GCN binds streamout buffers as shader resources. * VGT only counts primitives and tells the shader * through SGPRs what to do. */ - radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); - radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ - radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ + radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2); + radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */ + radeon_emit(stride_in_dw[i]); /* VTX_STRIDE (in DW) */ if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) { uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; /* Append. 
*/ - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, va); /* src address lo */ - radeon_emit(cs, va >> 32); /* src address hi */ + radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */ + radeon_emit(0); /* unused */ + radeon_emit(0); /* unused */ + radeon_emit(va); /* src address lo */ + radeon_emit(va >> 32); /* src address hi */ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } else { /* Start from the beginning. */ - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | - STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ - radeon_emit(cs, 0); /* unused */ + radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */ + radeon_emit(0); /* unused */ + radeon_emit(0); /* unused */ + radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */ + radeon_emit(0); /* unused */ } } radeon_end(); @@ -379,13 +378,13 @@ void si_emit_streamout_end(struct si_context *sctx) continue; va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset; - radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); - radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | - STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ - radeon_emit(cs, va); /* dst address lo */ - radeon_emit(cs, va >> 32); /* dst address hi */ - radeon_emit(cs, 0); /* unused */ - radeon_emit(cs, 0); /* unused */ + radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0)); + radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */ + radeon_emit(va); /* dst address lo */ + radeon_emit(va >> 32); /* dst address hi */ + radeon_emit(0); /* unused */ + radeon_emit(0); /* unused */ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE); @@ -394,7 +393,7 @@ void si_emit_streamout_end(struct si_context *sctx) * primitives emitted) may be enabled even if there is not * buffer bound. This ensures that the primitives-emitted query * won't increment. 
*/ - radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); + radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0); t[i]->buf_filled_size_valid = true; } @@ -415,14 +414,13 @@ static void si_emit_streamout_enable(struct si_context *sctx) assert(!sctx->screen->use_ngg_streamout); radeon_begin(&sctx->gfx_cs); - radeon_set_context_reg_seq(&sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2); - radeon_emit(&sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | - S_028B94_RAST_STREAM(0) | - S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) | - S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) | - S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); - radeon_emit(&sctx->gfx_cs, - sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask); + radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2); + radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) | + S_028B94_RAST_STREAM(0) | + S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) | + S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) | + S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx))); + radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask); radeon_end(); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_viewport.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_viewport.c index 7dda99a181..3c753c618c 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -104,7 +104,7 @@ static void si_emit_cull_state(struct si_context *sctx) radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); radeon_begin(&sctx->gfx_cs); - radeon_set_sh_reg(&sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS, + radeon_set_sh_reg(R_00B220_SPI_SHADER_PGM_LO_GS, sctx->small_prim_cull_info_address >> 8); radeon_end(); @@ -221,15 +221,15 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0. */ if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) { - radeon_emit(cs, S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(cs, S_028254_BR_X(1) | S_028254_BR_Y(1)); + radeon_emit(S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(S_028254_BR_X(1) | S_028254_BR_Y(1)); radeon_end(); return; } - radeon_emit(cs, S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(cs, S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy)); + radeon_emit(S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy)); radeon_end(); } @@ -382,7 +382,7 @@ static void si_emit_scissors(struct si_context *ctx) struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0]; radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); + radeon_set_context_reg_seq(R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); radeon_end(); si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL); @@ -393,7 +393,7 @@ static void si_emit_scissors(struct si_context *ctx) * This is a hardware requirement. 
*/ radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2); + radeon_set_context_reg_seq(R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2); radeon_end(); for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { @@ -416,27 +416,10 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo si_get_scissor_from_viewport(ctx, &state[i], scissor); - unsigned w = scissor->maxx - scissor->minx; - unsigned h = scissor->maxy - scissor->miny; - unsigned max_extent = MAX2(w, h); - int max_corner = MAX2( MAX2(abs(scissor->maxx), abs(scissor->maxy)), MAX2(abs(scissor->minx), abs(scissor->miny))); - unsigned center_x = (scissor->maxx + scissor->minx) / 2; - unsigned center_y = (scissor->maxy + scissor->miny) / 2; - unsigned max_center = MAX2(center_x, center_y); - - /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose - * center start farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET. - * (for example, a 1x1 viewport in the lower right corner of - * 16Kx16K) Such viewports need a greater guardband, so they - * have to use a worse quantization mode. - */ - unsigned distance_off_center = MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET); - max_extent += distance_off_center; - /* Determine the best quantization mode (subpixel precision), * but also leave enough space for the guardband. * @@ -445,7 +428,7 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo * Always use 16_8 if primitive binning is possible to occur. */ if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) && ctx->screen->dpbb_allowed) - max_extent = 16384; /* Use QUANT_MODE == 16_8. */ + max_corner = 16384; /* Use QUANT_MODE == 16_8. */ /* Another constraint is that all coordinates in the viewport * are representable in fixed point with respect to the @@ -462,9 +445,9 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned start_slo * 4k x 4k of the render target. */ - if (max_extent <= 1024 && max_corner < (1 << 12)) /* 4K scanline area for guardband */ + if (max_corner <= 1024) /* 4K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; - else if (max_extent <= 4096 && max_corner < (1 << 14)) /* 16K scanline area for guardband */ + else if (max_corner <= 4096) /* 16K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; else /* 64K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; @@ -489,12 +472,12 @@ static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_st struct radeon_cmdbuf *cs = &ctx->gfx_cs; radeon_begin(cs); - radeon_emit(cs, fui(state->scale[0])); - radeon_emit(cs, fui(state->translate[0])); - radeon_emit(cs, fui(state->scale[1])); - radeon_emit(cs, fui(state->translate[1])); - radeon_emit(cs, fui(state->scale[2])); - radeon_emit(cs, fui(state->translate[2])); + radeon_emit(fui(state->scale[0])); + radeon_emit(fui(state->translate[0])); + radeon_emit(fui(state->scale[1])); + radeon_emit(fui(state->translate[1])); + radeon_emit(fui(state->scale[2])); + radeon_emit(fui(state->translate[2])); radeon_end(); } @@ -506,7 +489,7 @@ static void si_emit_viewports(struct si_context *ctx) /* The simple case: Only 1 viewport is active. 
*/ if (!ctx->vs_writes_viewport_index) { radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); + radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE, 6); radeon_end(); si_emit_one_viewport(ctx, &states[0]); @@ -517,7 +500,7 @@ static void si_emit_viewports(struct si_context *ctx) * This is a hardware requirement. */ radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6); + radeon_set_context_reg_seq(R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6); radeon_end(); for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) @@ -548,9 +531,9 @@ static void si_emit_depth_ranges(struct si_context *ctx) si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax); radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2); - radeon_emit(cs, fui(zmin)); - radeon_emit(cs, fui(zmax)); + radeon_set_context_reg_seq(R_0282D0_PA_SC_VPORT_ZMIN_0, 2); + radeon_emit(fui(zmin)); + radeon_emit(fui(zmax)); radeon_end(); return; } @@ -559,11 +542,11 @@ static void si_emit_depth_ranges(struct si_context *ctx) * This is a hardware requirement. */ radeon_begin(cs); - radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2); + radeon_set_context_reg_seq(R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2); for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) { si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax); - radeon_emit(cs, fui(zmin)); - radeon_emit(cs, fui(zmax)); + radeon_emit(fui(zmin)); + radeon_emit(fui(zmax)); } radeon_end(); } @@ -662,10 +645,10 @@ static void si_emit_window_rectangles(struct si_context *sctx) return; } - radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); + radeon_set_context_reg_seq(R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2); for (unsigned i = 0; i < num_rectangles; i++) { - radeon_emit(cs, S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); - radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); + radeon_emit(S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny)); + radeon_emit(S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy)); } radeon_end(); } diff --git a/mesa 3D driver/src/gallium/drivers/radeonsi/si_texture.c b/mesa 3D driver/src/gallium/drivers/radeonsi/si_texture.c index cf41f4842e..388743ccf6 100644 --- a/mesa 3D driver/src/gallium/drivers/radeonsi/si_texture.c +++ b/mesa 3D driver/src/gallium/drivers/radeonsi/si_texture.c @@ -232,6 +232,13 @@ static int si_init_surface(struct si_screen *sscreen, struct radeon_surf *surfac break; case GFX9: + /* DCC MSAA fails this on Raven: + * https://www.khronos.org/registry/webgl/sdk/tests/deqp/functional/gles3/fbomultisample.2_samples.html + * and this on Picasso: + * https://www.khronos.org/registry/webgl/sdk/tests/deqp/functional/gles3/fbomultisample.4_samples.html + */ + if (sscreen->info.family == CHIP_RAVEN && ptex->nr_storage_samples >= 2 && bpe < 4) + flags |= RADEON_SURF_DISABLE_DCC; break; case GFX10: @@ -447,8 +454,7 @@ static void si_reallocate_texture_inplace(struct si_context *sctx, struct si_tex tex->buffer.b.b.bind = templ.bind; radeon_bo_reference(sctx->screen->ws, &tex->buffer.buf, new_tex->buffer.buf); tex->buffer.gpu_address = new_tex->buffer.gpu_address; - tex->buffer.vram_usage_kb = new_tex->buffer.vram_usage_kb; - tex->buffer.gart_usage_kb = new_tex->buffer.gart_usage_kb; + tex->buffer.memory_usage_kb = new_tex->buffer.memory_usage_kb; tex->buffer.bo_size = 
new_tex->buffer.bo_size; tex->buffer.bo_alignment_log2 = new_tex->buffer.bo_alignment_log2; tex->buffer.domains = new_tex->buffer.domains; @@ -727,6 +733,8 @@ static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_contex modifier = tex->surface.modifier; } else { + tc_buffer_disable_cpu_storage(&res->b.b); + /* Buffer exports are for the OpenCL interop. */ /* Move a suballocated buffer into a non-suballocated allocation. */ if (sscreen->ws->buffer_is_suballocated(res->buf) || @@ -890,7 +898,7 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen, return NULL; } - tex = CALLOC_STRUCT(si_texture); + tex = CALLOC_STRUCT_CL(si_texture); if (!tex) goto error; @@ -984,8 +992,7 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen, resource->bo_alignment_log2 = plane0->buffer.bo_alignment_log2; resource->flags = plane0->buffer.flags; resource->domains = plane0->buffer.domains; - resource->vram_usage_kb = plane0->buffer.vram_usage_kb; - resource->gart_usage_kb = plane0->buffer.gart_usage_kb; + resource->memory_usage_kb = plane0->buffer.memory_usage_kb; radeon_bo_reference(sscreen->ws, &resource->buf, plane0->buffer.buf); resource->gpu_address = plane0->buffer.gpu_address; @@ -1001,10 +1008,7 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen, resource->bo_size = imported_buf->size; resource->bo_alignment_log2 = imported_buf->alignment_log2; resource->domains = sscreen->ws->buffer_get_initial_domain(resource->buf); - if (resource->domains & RADEON_DOMAIN_VRAM) - resource->vram_usage_kb = MAX2(1, resource->bo_size / 1024); - else if (resource->domains & RADEON_DOMAIN_GTT) - resource->gart_usage_kb = MAX2(1, resource->bo_size / 1024); + resource->memory_usage_kb = MAX2(1, resource->bo_size / 1024); if (sscreen->ws->buffer_get_flags) resource->flags = sscreen->ws->buffer_get_flags(resource->buf); } @@ -1127,7 +1131,7 @@ static struct si_texture *si_texture_create_object(struct pipe_screen *screen, return tex; error: - FREE(tex); + FREE_CL(tex); return NULL; } @@ -1382,6 +1386,18 @@ si_get_dmabuf_modifier_planes(struct pipe_screen *pscreen, uint64_t modifier, return planes; } +static bool +si_modifier_supports_resource(struct pipe_screen *screen, + uint64_t modifier, + const struct pipe_resource *templ) +{ + struct si_screen *sscreen = (struct si_screen *)screen; + uint32_t max_width, max_height; + + ac_modifier_max_extent(&sscreen->info, modifier, &max_width, &max_height); + return templ->width0 <= max_width && templ->height0 <= max_height; +} + static struct pipe_resource * si_texture_create_with_modifiers(struct pipe_screen *screen, const struct pipe_resource *templ, @@ -1411,7 +1427,7 @@ si_texture_create_with_modifiers(struct pipe_screen *screen, for (int i = 0; i < allowed_mod_count; ++i) { bool found = false; for (int j = 0; j < modifier_count && !found; ++j) - if (modifiers[j] == allowed_modifiers[i]) + if (modifiers[j] == allowed_modifiers[i] && si_modifier_supports_resource(screen, modifiers[j], templ)) found = true; if (found) { @@ -1565,12 +1581,14 @@ static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen, templ->last_level != 0) return NULL; - buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment); + buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, + sscreen->info.max_alignment, + templ->bind & PIPE_BIND_DRI_PRIME); if (!buf) return NULL; if (whandle->plane >= util_format_get_num_planes(whandle->format)) { - struct 
si_auxiliary_texture *tex = CALLOC_STRUCT(si_auxiliary_texture);
+      struct si_auxiliary_texture *tex = CALLOC_STRUCT_CL(si_auxiliary_texture);
       if (!tex)
          return NULL;
       tex->b.b = *templ;
@@ -1757,7 +1775,7 @@ static void *si_texture_transfer_map(struct pipe_context *ctx, struct pipe_resou
    /* Tiled textures need to be converted into a linear texture for CPU
     * access. The staging texture is always linear and is placed in GART.
     *
-    * Always use a staging texture for VRAM, so that we don't map it and
+    * dGPUs use a staging texture for VRAM, so that we don't map it and
     * don't relocate it to GTT.
     *
     * Reading from VRAM or GTT WC is slow, always use the staging
@@ -1767,7 +1785,7 @@ static void *si_texture_transfer_map(struct pipe_context *ctx, struct pipe_resou
    * is busy.
    */
   if (!tex->surface.is_linear || (tex->buffer.flags & RADEON_FLAG_ENCRYPTED) ||
-      (tex->buffer.domains & RADEON_DOMAIN_VRAM &&
+      (tex->buffer.domains & RADEON_DOMAIN_VRAM && sctx->screen->info.has_dedicated_vram &&
       !sctx->screen->info.smart_access_memory))
      use_staging_texture = true;
   else if (usage & PIPE_MAP_READ)
@@ -2113,7 +2131,7 @@ si_memobj_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle,
    if (!memobj)
       return NULL;
-   buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment);
+   buf = sscreen->ws->buffer_from_handle(sscreen->ws, whandle, sscreen->info.max_alignment, false);
    if (!buf) {
       free(memobj);
       return NULL;
diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe-fails.txt b/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe-fails.txt
index 87778af5bf..81848c7124 100644
--- a/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe-fails.txt
+++ b/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe-fails.txt
@@ -3,8 +3,6 @@ dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail
 dEQP-GLES2.functional.clipping.point.wide_point_clip,Fail
 dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_center,Fail
 dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner,Fail
-dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail
-dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail
 dEQP-GLES2.functional.polygon_offset.default_displacement_with_units,Fail
 dEQP-GLES2.functional.polygon_offset.fixed16_displacement_with_units,Fail
 dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail
@@ -19,8 +17,6 @@ dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_corner,Fail
 dEQP-GLES3.functional.clipping.point.wide_point_clip,Fail
 dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center,Fail
 dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner,Fail
-dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail
-dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail
 dEQP-GLES3.functional.draw.random.124,Fail
 dEQP-GLES3.functional.fbo.depth.depth_test_clamp.depth24_stencil8,Fail
 dEQP-GLES3.functional.fbo.depth.depth_test_clamp.depth32f_stencil8,Fail
diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe.toml b/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe.toml
index a5995e6fe6..77c38b0ef6 100644
--- a/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe.toml
+++ b/mesa 3D driver/src/gallium/drivers/softpipe/ci/deqp-softpipe.toml
@@ -8,6 +8,8 @@ deqp_args
= [ "--deqp-gl-config-name=rgba8888d24s8ms0", "--deqp-visibility=hidden" ] +version_check = "GL ES 3.1.*git" +renderer_check = "softpipe" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/drivers/softpipe/ci/gitlab-ci.yml index 51c5fa03ac..11531ffdb6 100644 --- a/mesa 3D driver/src/gallium/drivers/softpipe/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/gallium/drivers/softpipe/ci/gitlab-ci.yml @@ -13,7 +13,6 @@ softpipe-deqp: variables: - DEQP_VER: gles2 # for the renderer check DEQP_SUITE: softpipe parallel: 2 extends: .softpipe-deqp-test @@ -23,7 +22,7 @@ softpipe-asan-gles31: GPU_VERSION: softpipe-asan DEQP_FRACTION: 10 DEQP_VER: gles31 - TEST_LD_PRELOAD: libasan.so.6 + DEQP_RUNNER_OPTIONS: "--env LD_PRELOAD=libasan.so.6" extends: .softpipe-deqp-test needs: - debian/x86_test-gl diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/ci/softpipe-quick.txt b/mesa 3D driver/src/gallium/drivers/softpipe/ci/softpipe-quick.txt index 3ca307444c..e860e667c3 100644 --- a/mesa 3D driver/src/gallium/drivers/softpipe/ci/softpipe-quick.txt +++ b/mesa 3D driver/src/gallium/drivers/softpipe/ci/softpipe-quick.txt @@ -543,6 +543,7 @@ spec/arb_buffer_storage/bufferstorage-persistent_gles3 read coherent client-stor spec/arb_clear_texture/arb_clear_texture-multisample: skip spec/arb_color_buffer_float/gl_rgba8_snorm-render: fail spec/arb_color_buffer_float/gl_rgba8_snorm-render-fog: fail +spec/arb_compute_shader/compute-and-render-bug-109630: skip spec/arb_compute_shader/execution/min-dvec4-double-large-group-size: skip spec/arb_compute_variable_group_size/errors: skip spec/arb_compute_variable_group_size/execution/basic-local-size: skip @@ -843,6 +844,7 @@ spec/arb_sample_shading/samplemask 6: skip spec/arb_sample_shading/samplemask 6 all: skip spec/arb_sample_shading/samplemask 8: skip spec/arb_sample_shading/samplemask 8 all: skip +spec/arb_separate_shader_objects/execution/layout-location-named-block-with-array: skip spec/arb_separate_shader_objects/linker/pervertex-clipdistance-tcs-out-tes: skip spec/arb_separate_shader_objects/linker/pervertex-clipdistance-tes-out-gs: skip spec/arb_separate_shader_objects/linker/pervertex-clipdistance-vs-out-tcs: skip diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/sp_draw_arrays.c b/mesa 3D driver/src/gallium/drivers/softpipe/sp_draw_arrays.c index 6a5f34703a..f73131f95b 100644 --- a/mesa 3D driver/src/gallium/drivers/softpipe/sp_draw_arrays.c +++ b/mesa 3D driver/src/gallium/drivers/softpipe/sp_draw_arrays.c @@ -141,7 +141,7 @@ softpipe_draw_vbo(struct pipe_context *pipe, sp->active_statistics_queries > 0); /* draw! */ - draw_vbo(draw, info, drawid_offset, indirect, draws, num_draws); + draw_vbo(draw, info, drawid_offset, indirect, draws, num_draws, 0); /* unmap vertex/index buffers - will cause draw module to flush */ for (i = 0; i < sp->num_vertex_buffers; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/sp_setup.c b/mesa 3D driver/src/gallium/drivers/softpipe/sp_setup.c index 4f5721cba4..891e7e61e3 100644 --- a/mesa 3D driver/src/gallium/drivers/softpipe/sp_setup.c +++ b/mesa 3D driver/src/gallium/drivers/softpipe/sp_setup.c @@ -395,52 +395,6 @@ setup_sort_vertices(struct setup_context *setup, } -/* Apply cylindrical wrapping to v0, v1, v2 coordinates, if enabled. - * Input coordinates must be in [0, 1] range, otherwise results are undefined. - * Some combinations of coordinates produce invalid results, - * but this behaviour is acceptable. 
- */ -static void -tri_apply_cylindrical_wrap(float v0, - float v1, - float v2, - uint cylindrical_wrap, - float output[3]) -{ - if (cylindrical_wrap) { - float delta; - - delta = v1 - v0; - if (delta > 0.5f) { - v0 += 1.0f; - } - else if (delta < -0.5f) { - v1 += 1.0f; - } - - delta = v2 - v1; - if (delta > 0.5f) { - v1 += 1.0f; - } - else if (delta < -0.5f) { - v2 += 1.0f; - } - - delta = v0 - v2; - if (delta > 0.5f) { - v2 += 1.0f; - } - else if (delta < -0.5f) { - v0 += 1.0f; - } - } - - output[0] = v0; - output[1] = v1; - output[2] = v2; -} - - /** * Compute a0 for a constant-valued coefficient (GL_FLAT shading). * The value comes from vertex[slot][i]. @@ -620,21 +574,17 @@ setup_tri_coefficients(struct setup_context *setup) break; case SP_INTERP_LINEAR: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { - tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j], - setup->vmid[vertSlot][j], - setup->vmax[vertSlot][j], - fsInfo->input_cylindrical_wrap[fragSlot] & (1 << j), - v); + v[0] = setup->vmin[vertSlot][j]; + v[1] = setup->vmid[vertSlot][j]; + v[2] = setup->vmax[vertSlot][j]; tri_linear_coeff(setup, &setup->coef[fragSlot], j, v); } break; case SP_INTERP_PERSPECTIVE: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { - tri_apply_cylindrical_wrap(setup->vmin[vertSlot][j], - setup->vmid[vertSlot][j], - setup->vmax[vertSlot][j], - fsInfo->input_cylindrical_wrap[fragSlot] & (1 << j), - v); + v[0] = setup->vmin[vertSlot][j]; + v[1] = setup->vmid[vertSlot][j]; + v[2] = setup->vmax[vertSlot][j]; tri_persp_coeff(setup, &setup->coef[fragSlot], j, v); } break; @@ -876,32 +826,6 @@ sp_setup_tri(struct setup_context *setup, } - -/* Apply cylindrical wrapping to v0, v1 coordinates, if enabled. - * Input coordinates must be in [0, 1] range, otherwise results are undefined. - */ -static void -line_apply_cylindrical_wrap(float v0, - float v1, - uint cylindrical_wrap, - float output[2]) -{ - if (cylindrical_wrap) { - float delta; - - delta = v1 - v0; - if (delta > 0.5f) { - v0 += 1.0f; - } - else if (delta < -0.5f) { - v1 += 1.0f; - } - } - - output[0] = v0; - output[1] = v1; -} - - /** * Compute a0, dadx and dady for a linearly interpolated coefficient, * for a line.
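 *
 * (A short illustrative sketch, not the exact softpipe math: along the
 * line the attribute reduces to a 1-D lerp of the two endpoint values,
 *
 *    a(t) = v[0] + t * (v[1] - v[0]),  t in [0, 1] from vmin to vmax,
 *
 * with dadx/dady being that gradient resolved onto the x and y axes.)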
@@ -1006,19 +930,15 @@ setup_line_coefficients(struct setup_context *setup, break; case SP_INTERP_LINEAR: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { - line_apply_cylindrical_wrap(setup->vmin[vertSlot][j], - setup->vmax[vertSlot][j], - fsInfo->input_cylindrical_wrap[fragSlot] & (1 << j), - v); + v[0] = setup->vmin[vertSlot][j]; + v[1] = setup->vmax[vertSlot][j]; line_linear_coeff(setup, &setup->coef[fragSlot], j, v); } break; case SP_INTERP_PERSPECTIVE: for (j = 0; j < TGSI_NUM_CHANNELS; j++) { - line_apply_cylindrical_wrap(setup->vmin[vertSlot][j], - setup->vmax[vertSlot][j], - fsInfo->input_cylindrical_wrap[fragSlot] & (1 << j), - v); + v[0] = setup->vmin[vertSlot][j]; + v[1] = setup->vmax[vertSlot][j]; line_persp_coeff(setup, &setup->coef[fragSlot], j, v); } break; diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/sp_state.h b/mesa 3D driver/src/gallium/drivers/softpipe/sp_state.h index 7196cdff11..dcb89ef252 100644 --- a/mesa 3D driver/src/gallium/drivers/softpipe/sp_state.h +++ b/mesa 3D driver/src/gallium/drivers/softpipe/sp_state.h @@ -176,6 +176,7 @@ softpipe_set_sampler_views(struct pipe_context *pipe, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views); diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_derived.c b/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_derived.c index d120abf0f0..a5e97869f8 100644 --- a/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_derived.c +++ b/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_derived.c @@ -420,7 +420,7 @@ update_polygon_stipple_enable(struct softpipe_context *softpipe, unsigned prim) /* sampler view state */ softpipe_set_sampler_views(&softpipe->pipe, PIPE_SHADER_FRAGMENT, - unit, 1, 0, &softpipe->pstipple.sampler_view); + unit, 1, 0, false, &softpipe->pstipple.sampler_view); softpipe->dirty |= SP_NEW_SAMPLER; } diff --git a/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_sampler.c b/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_sampler.c index 16db953b16..c20de39a88 100644 --- a/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_sampler.c +++ b/mesa 3D driver/src/gallium/drivers/softpipe/sp_state_sampler.c @@ -101,6 +101,7 @@ softpipe_set_sampler_views(struct pipe_context *pipe, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct softpipe_context *softpipe = softpipe_context(pipe); @@ -117,7 +118,13 @@ softpipe_set_sampler_views(struct pipe_context *pipe, struct sp_sampler_view *sp_sviewdst = &softpipe->tgsi.sampler[shader]->sp_sview[start + i]; struct pipe_sampler_view **pview = &softpipe->sampler_views[shader][start + i]; - pipe_sampler_view_reference(pview, views[i]); + + if (take_ownership) { + pipe_sampler_view_reference(pview, NULL); + *pview = views[i]; + } else { + pipe_sampler_view_reference(pview, views[i]); + } sp_tex_tile_cache_set_sampler_view(softpipe->tex_cache[shader][start + i], views[i]); /* diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_context.h b/mesa 3D driver/src/gallium/drivers/svga/svga_context.h index 578e1794a4..d5ef4c3455 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_context.h +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_context.h @@ -655,6 +655,7 @@ struct svga_context boolean render_condition; boolean disable_rasterizer; /* Set to disable rasterization */ + uint8_t patch_vertices; struct { struct svga_tcs_shader *passthrough_tcs; diff --git a/mesa 3D
driver/src/gallium/drivers/svga/svga_draw_elements.c b/mesa 3D driver/src/gallium/drivers/svga/svga_draw_elements.c index fa165da267..225edf06c9 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_draw_elements.c +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_draw_elements.c @@ -284,7 +284,7 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl, gen_prim, index_offset, count, info->start_instance, info->instance_count, - info->vertices_per_patch); + hwtnl->svga->patch_vertices); pipe_resource_reference(&index_buffer, NULL); } else { @@ -313,7 +313,7 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl, gen_nr, info->start_instance, info->instance_count, - info->vertices_per_patch); + hwtnl->svga->patch_vertices); } if (gen_buf) { diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_mksstats.h b/mesa 3D driver/src/gallium/drivers/svga/svga_mksstats.h index a1c157301b..699ec620dd 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_mksstats.h +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_mksstats.h @@ -30,14 +30,14 @@ #ifdef VMX86_STATS #define SVGA_STATS_COUNT_INC(_sws, _stat) \ - _sws->stats_inc(_stat); + _sws->stats_inc(_sws, _stat); #define SVGA_STATS_TIME_PUSH(_sws, _stat) \ struct svga_winsys_stats_timeframe timeFrame; \ - _sws->stats_time_push(_stat, &timeFrame); + _sws->stats_time_push(_sws, _stat, &timeFrame); #define SVGA_STATS_TIME_POP(_sws) \ - _sws->stats_time_pop(); + _sws->stats_time_pop(_sws); #else diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_draw.c b/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_draw.c index f1c6276fc7..745fdad64a 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_draw.c +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_draw.c @@ -102,7 +102,7 @@ retry_draw_auto(struct svga_context *svga, unsigned hw_count; range.primType = svga_translate_prim(info->mode, 12, &hw_count, - info->vertices_per_patch); + svga->patch_vertices); range.primitiveCount = 0; range.indexArray.surfaceId = SVGA3D_INVALID_ID; range.indexArray.offset = 0; @@ -151,7 +151,7 @@ retry_draw_indirect(struct svga_context *svga, unsigned hw_count; range.primType = svga_translate_prim(info->mode, 12, &hw_count, - info->vertices_per_patch); + svga->patch_vertices); range.primitiveCount = 0; /* specified in indirect buffer */ range.indexArray.surfaceId = SVGA3D_INVALID_ID; range.indexArray.offset = 0; @@ -269,8 +269,8 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, svga->dirty |= SVGA_NEW_VS_CONSTS; } - if (svga->curr.vertices_per_patch != info->vertices_per_patch) { - svga->curr.vertices_per_patch = info->vertices_per_patch; + if (svga->curr.vertices_per_patch != svga->patch_vertices) { + svga->curr.vertices_per_patch = svga->patch_vertices; /* If input patch size changes, we need to notify the TCS * code to reevaluate the shader variant since the @@ -369,7 +369,7 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, else { ret = retry_draw_arrays(svga, info->mode, draws[0].start, count, info->start_instance, info->instance_count, - info->vertices_per_patch); + svga->patch_vertices); } } diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_sampler.c b/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_sampler.c index eb23b7b725..ad1040c9d6 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_sampler.c +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_sampler.c @@ -415,6 +415,7 @@ svga_set_sampler_views(struct pipe_context *pipe, unsigned
start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct svga_context *svga = svga_context(pipe); @@ -427,8 +428,13 @@ svga_set_sampler_views(struct pipe_context *pipe, assert(start + num <= ARRAY_SIZE(svga->curr.sampler_views[shader])); /* Pre-VGPU10 only supports FS textures */ - if (!svga_have_vgpu10(svga) && shader != PIPE_SHADER_FRAGMENT) + if (!svga_have_vgpu10(svga) && shader != PIPE_SHADER_FRAGMENT) { + for (unsigned i = 0; i < num; i++) { + struct pipe_sampler_view *view = views[i]; + pipe_sampler_view_reference(&view, NULL); + } return; + } SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_SETSAMPLERVIEWS); @@ -448,10 +454,15 @@ svga_set_sampler_views(struct pipe_context *pipe, for (i = 0; i < num; i++) { enum pipe_texture_target target; - if (svga->curr.sampler_views[shader][start + i] != views[i]) { + any_change |= svga->curr.sampler_views[shader][start + i] != views[i]; + + if (take_ownership) { + pipe_sampler_view_reference(&svga->curr.sampler_views[shader][start + i], + NULL); + svga->curr.sampler_views[shader][start + i] = views[i]; + } else if (svga->curr.sampler_views[shader][start + i] != views[i]) { pipe_sampler_view_reference(&svga->curr.sampler_views[shader][start + i], views[i]); - any_change = TRUE; } if (!views[i]) diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_ts.c b/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_ts.c index 12a3bf486b..e0318ae7b1 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_ts.c +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_pipe_ts.c @@ -47,6 +47,15 @@ svga_set_tess_state(struct pipe_context *pipe, } +static void +svga_set_patch_vertices(struct pipe_context *pipe, uint8_t patch_vertices) +{ + struct svga_context *svga = svga_context(pipe); + + svga->patch_vertices = patch_vertices; +} + + static void * svga_create_tcs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) @@ -210,6 +219,7 @@ void svga_init_ts_functions(struct svga_context *svga) { svga->pipe.set_tess_state = svga_set_tess_state; + svga->pipe.set_patch_vertices = svga_set_patch_vertices; svga->pipe.create_tcs_state = svga_create_tcs_state; svga->pipe.bind_tcs_state = svga_bind_tcs_state; svga->pipe.delete_tcs_state = svga_delete_tcs_state; diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_surface.c b/mesa 3D driver/src/gallium/drivers/svga/svga_surface.c index 3eebf6eca4..09a6afb68d 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_surface.c +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_surface.c @@ -277,14 +277,14 @@ svga_texture_view_surface(struct svga_context *svga, /** * A helper function to create a surface view. - * The view boolean flag specifies whether svga_texture_view_surface() - * will be called to create a cloned surface and resource for the view. + * The clone_resource boolean flag specifies whether to clone the resource + * for the surface view. 
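+ *
+ * A minimal usage sketch, assuming the signature below: pass FALSE to
+ * share the texture's existing resource, TRUE to clone it for a backed
+ * view:
+ *
+ *   surf = svga_create_surface_view(pipe, pt, surf_tmpl, FALSE);
+ *   surf = svga_create_surface_view(pipe, pt, surf_tmpl, TRUE);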
*/ static struct pipe_surface * svga_create_surface_view(struct pipe_context *pipe, struct pipe_resource *pt, const struct pipe_surface *surf_tmpl, - boolean view) + boolean clone_resource) { struct svga_context *svga = svga_context(pipe); struct svga_texture *tex = svga_texture(pt); @@ -357,7 +357,7 @@ svga_create_surface_view(struct pipe_context *pipe, assert(format != SVGA3D_FORMAT_INVALID); - if (view) { + if (clone_resource) { SVGA_DBG(DEBUG_VIEWS, "New backed surface view: resource %p, level %u layer %u z %u, %p\n", pt, surf_tmpl->u.tex.level, layer, zslice, s); @@ -462,10 +462,11 @@ svga_create_surface(struct pipe_context *pipe, /** - * Clone the surface view and its associated resource. + * Create an alternate surface view and clone the resource if specified */ static struct svga_surface * -create_backed_surface_view(struct svga_context *svga, struct svga_surface *s) +create_backed_surface_view(struct svga_context *svga, struct svga_surface *s, + boolean clone_resource) { struct svga_texture *tex = svga_texture(s->base.texture); @@ -478,7 +479,7 @@ create_backed_surface_view(struct svga_context *svga, struct svga_surface *s) backed_view = svga_create_surface_view(&svga->pipe, &tex->b, &s->base, - TRUE); + clone_resource); if (!backed_view) goto done; @@ -486,7 +487,8 @@ create_backed_surface_view(struct svga_context *svga, struct svga_surface *s) SVGA_STATS_TIME_POP(svga_sws(svga)); } - else if (s->backed->age < tex->age) { + else if (s->backed->handle != tex->handle && + s->backed->age < tex->age) { /* * There is already an existing backing surface, but we still need to * sync the backing resource if the original resource has been modified @@ -519,6 +521,8 @@ create_backed_surface_view(struct svga_context *svga, struct svga_surface *s) svga_mark_surface_dirty(&s->backed->base); s->backed->age = tex->age; + assert(s->backed->base.context == &svga->pipe); + done: return s->backed; } @@ -552,7 +556,7 @@ svga_validate_surface_view(struct svga_context *svga, struct svga_surface *s) SVGA_DBG(DEBUG_VIEWS, "same resource used in shaderResource and renderTarget 0x%x\n", s->handle); - s = create_backed_surface_view(svga, s); + s = create_backed_surface_view(svga, s, TRUE); if (s) svga->state.hw_draw.has_backed_views = TRUE; @@ -567,9 +571,10 @@ svga_validate_surface_view(struct svga_context *svga, struct svga_surface *s) * view was created for another context. 
*/ if (s && s->base.context != &svga->pipe) { - struct pipe_surface *surf; - surf = svga_create_surface_view(&svga->pipe, s->base.texture, &s->base, FALSE); - s = svga_surface(surf); + s = create_backed_surface_view(svga, s, FALSE); + + if (s) + svga->state.hw_draw.has_backed_views = TRUE; } if (s && s->view_id == SVGA3D_INVALID_ID) { diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_swtnl_draw.c b/mesa 3D driver/src/gallium/drivers/svga/svga_swtnl_draw.c index 11d9724c7e..da2d5f4612 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_swtnl_draw.c +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_swtnl_draw.c @@ -115,7 +115,8 @@ svga_swtnl_draw_vbo(struct svga_context *svga, svga->curr.constbufs[PIPE_SHADER_VERTEX][i].buffer->width0); } - draw_vbo(draw, info, drawid_offset, indirect, draw_one, 1); + draw_vbo(draw, info, drawid_offset, indirect, draw_one, 1, + svga->patch_vertices); draw_flush(svga->swtnl.draw); diff --git a/mesa 3D driver/src/gallium/drivers/svga/svga_winsys.h b/mesa 3D driver/src/gallium/drivers/svga/svga_winsys.h index 535c5543b1..7b3f439af4 100644 --- a/mesa 3D driver/src/gallium/drivers/svga/svga_winsys.h +++ b/mesa 3D driver/src/gallium/drivers/svga/svga_winsys.h @@ -99,6 +99,9 @@ struct svga_winsys_stats_timeframe { uint64 startTime; uint64 adjustedStartTime; struct svga_winsys_stats_timeframe *enclosing; + + struct svga_winsys_screen *sws; + int32 slot; }; enum svga_stats_count { @@ -762,19 +765,19 @@ struct svga_winsys_screen * Increment a statistic counter */ void - (*stats_inc)(enum svga_stats_count); + (*stats_inc)(struct svga_winsys_screen *, enum svga_stats_count); /** * Push a time frame onto the stack */ void - (*stats_time_push)(enum svga_stats_time, struct svga_winsys_stats_timeframe *); + (*stats_time_push)(struct svga_winsys_screen *, enum svga_stats_time, struct svga_winsys_stats_timeframe *); /** * Pop a time frame. 
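 *
 * A minimal sketch of the reworked interface, assuming an sws pointer is
 * in scope; every hook now receives the svga_winsys_screen explicitly:
 *
 *   struct svga_winsys_stats_timeframe tf;
 *   sws->stats_time_push(sws, SVGA_STATS_TIME_SETSAMPLERVIEWS, &tf);
 *   ... timed work ...
 *   sws->stats_time_pop(sws);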
*/ void - (*stats_time_pop)(); + (*stats_time_pop)(struct svga_winsys_screen *); /** * Send a host log message diff --git a/mesa 3D driver/src/gallium/drivers/swr/swr_context.h b/mesa 3D driver/src/gallium/drivers/swr/swr_context.h index b0681d8387..11578764c2 100644 --- a/mesa 3D driver/src/gallium/drivers/swr/swr_context.h +++ b/mesa 3D driver/src/gallium/drivers/swr/swr_context.h @@ -201,6 +201,7 @@ struct swr_context { SWR_TILE_INTERFACE tileApi; uint32_t max_draws_in_flight; + uint8_t patch_vertices; }; static INLINE struct swr_context * diff --git a/mesa 3D driver/src/gallium/drivers/swr/swr_draw.cpp b/mesa 3D driver/src/gallium/drivers/swr/swr_draw.cpp index 91e43e0e2a..4b42a8e039 100644 --- a/mesa 3D driver/src/gallium/drivers/swr/swr_draw.cpp +++ b/mesa 3D driver/src/gallium/drivers/swr/swr_draw.cpp @@ -90,7 +90,7 @@ swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, // trick copied from softpipe to modify const struct *info memcpy(&resolved_info, (void*)info, sizeof(struct pipe_draw_info)); resolved_draw.start = draws[0].start; - resolved_draw.count = ctx->so_primCounter * resolved_info.vertices_per_patch; + resolved_draw.count = ctx->so_primCounter * ctx->patch_vertices; resolved_info.max_index = resolved_draw.count - 1; info = &resolved_info; indirect = NULL; @@ -252,7 +252,7 @@ swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, if (info->index_size) ctx->api.pfnSwrDrawIndexedInstanced(ctx->swrContext, - swr_convert_prim_topology(info->mode, info->vertices_per_patch), + swr_convert_prim_topology(info->mode, ctx->patch_vertices), draws[0].count, info->instance_count, draws[0].start, @@ -260,7 +260,7 @@ swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info, info->start_instance); else ctx->api.pfnSwrDrawInstanced(ctx->swrContext, - swr_convert_prim_topology(info->mode, info->vertices_per_patch), + swr_convert_prim_topology(info->mode, ctx->patch_vertices), draws[0].count, info->instance_count, draws[0].start, diff --git a/mesa 3D driver/src/gallium/drivers/swr/swr_state.cpp b/mesa 3D driver/src/gallium/drivers/swr/swr_state.cpp index 6a56e22741..5f1464e6d0 100644 --- a/mesa 3D driver/src/gallium/drivers/swr/swr_state.cpp +++ b/mesa 3D driver/src/gallium/drivers/swr/swr_state.cpp @@ -301,6 +301,7 @@ swr_set_sampler_views(struct pipe_context *pipe, unsigned start, unsigned num, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct swr_context *ctx = swr_context(pipe); @@ -314,8 +315,14 @@ swr_set_sampler_views(struct pipe_context *pipe, /* set the new sampler views */ ctx->num_sampler_views[shader] = num; for (i = 0; i < num; i++) { - pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], - views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], + NULL); + ctx->sampler_views[shader][start + i] = views[i]; + } else { + pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], + views[i]); + } } for (; i < num + unbind_num_trailing_slots; i++) { pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], @@ -1651,7 +1658,7 @@ swr_update_derived(struct pipe_context *pipe, SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW)) { if (ctx->tcs) { - ctx->tcs->vertices_per_patch = p_draw_info->vertices_per_patch; + ctx->tcs->vertices_per_patch = ctx->patch_vertices; swr_jit_tcs_key key; swr_generate_tcs_key(key, ctx, ctx->tcs); @@ -2155,6 +2162,14 @@ swr_set_so_targets(struct pipe_context *pipe, swr->dirty |= 
SWR_NEW_SO; } +static void +swr_set_patch_vertices(struct pipe_context *pipe, uint8_t patch_vertices) +{ + struct swr_context *swr = swr_context(pipe); + + swr->patch_vertices = patch_vertices; +} + void swr_state_init(struct pipe_context *pipe) @@ -2223,4 +2238,6 @@ swr_state_init(struct pipe_context *pipe) pipe->create_stream_output_target = swr_create_so_target; pipe->stream_output_target_destroy = swr_destroy_so_target; pipe->set_stream_output_targets = swr_set_so_targets; + + pipe->set_patch_vertices = swr_set_patch_vertices; } diff --git a/mesa 3D driver/src/gallium/drivers/tegra/tegra_context.c b/mesa 3D driver/src/gallium/drivers/tegra/tegra_context.c index b7dc73bb1b..a07c740dc0 100644 --- a/mesa 3D driver/src/gallium/drivers/tegra/tegra_context.c +++ b/mesa 3D driver/src/gallium/drivers/tegra/tegra_context.c @@ -562,6 +562,7 @@ static void tegra_set_sampler_views(struct pipe_context *pcontext, unsigned shader, unsigned start_slot, unsigned num_views, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **pviews) { struct pipe_sampler_view *views[PIPE_MAX_SHADER_SAMPLER_VIEWS]; @@ -573,7 +574,7 @@ tegra_set_sampler_views(struct pipe_context *pcontext, unsigned shader, context->gpu->set_sampler_views(context->gpu, shader, start_slot, num_views, unbind_num_trailing_slots, - views); + take_ownership, views); } static void diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_blit.c b/mesa 3D driver/src/gallium/drivers/v3d/v3d_blit.c index 192e8d75bb..793f2d2e53 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_blit.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_blit.c @@ -182,7 +182,7 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info) PIPE_MASK_R, PIPE_TEX_FILTER_NEAREST, info->scissor_enable ? &info->scissor : NULL, - info->alpha_blend); + info->alpha_blend, false); pipe_surface_reference(&dst_surf, NULL); pipe_sampler_view_reference(&src_view, NULL); @@ -823,7 +823,7 @@ v3d_sand8_blit(struct pipe_context *pctx, struct pipe_blit_info *info) /* Unbind the textures, to make sure we don't try to recurse into the * shadow blit. 
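 *
 * (A sketch of the take_ownership contract this series threads through
 *  set_sampler_views, matching the driver hunks above: when true, the
 *  callee adopts the caller's reference instead of taking a new one,
 *
 *    if (take_ownership) {
 *       pipe_sampler_view_reference(&dst, NULL);
 *       dst = views[i];
 *    } else {
 *       pipe_sampler_view_reference(&dst, views[i]);
 *    }
 *
 *  while an unbind such as the call below passes false and NULL views.)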
*/ - pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 0, 0, NULL); + pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 0, 0, false, NULL); pctx->bind_sampler_states(pctx, PIPE_SHADER_FRAGMENT, 0, 0, NULL); util_blitter_custom_shader(v3d->blitter, dst_surf, diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.c b/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.c index f931447111..50a0e602e3 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.c @@ -32,7 +32,6 @@ #include "util/u_blitter.h" #include "util/u_upload_mgr.h" #include "util/u_prim.h" -#include "indices/u_primconvert.h" #include "pipe/p_screen.h" #include "v3d_screen.h" @@ -282,9 +281,6 @@ v3d_context_destroy(struct pipe_context *pctx) if (v3d->blitter) util_blitter_destroy(v3d->blitter); - if (v3d->primconvert) - util_primconvert_destroy(v3d->primconvert); - if (v3d->uploader) u_upload_destroy(v3d->uploader); if (v3d->state_uploader) @@ -394,11 +390,6 @@ v3d_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) goto fail; v3d->blitter->use_index_buffer = true; - v3d->primconvert = util_primconvert_create(pctx, - (1 << PIPE_PRIM_QUADS) - 1); - if (!v3d->primconvert) - goto fail; - V3D_DEBUG |= saved_shaderdb_flag; v3d->sample_mask = (1 << V3D_MAX_SAMPLES) - 1; diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.h b/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.h index 5bd3e27fd6..964289c015 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.h +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_context.h @@ -500,8 +500,6 @@ struct v3d_context { /** bitfield of V3D_DIRTY_* */ uint64_t dirty; - struct primconvert_context *primconvert; - uint32_t next_uncompiled_program_id; uint64_t next_compiled_program_id; diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_job.c b/mesa 3D driver/src/gallium/drivers/v3d/v3d_job.c index 6cd8899912..0e64624165 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_job.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_job.c @@ -426,12 +426,16 @@ v3d_get_job_for_fbo(struct v3d_context *v3d) static void v3d_clif_dump(struct v3d_context *v3d, struct v3d_job *job) { - if (!(V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CLIF))) + if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL | + V3D_DEBUG_CL_NO_BIN | + V3D_DEBUG_CLIF)))) return; struct clif_dump *clif = clif_dump_init(&v3d->screen->devinfo, stderr, - V3D_DEBUG & V3D_DEBUG_CL); + V3D_DEBUG & (V3D_DEBUG_CL | + V3D_DEBUG_CL_NO_BIN), + V3D_DEBUG & V3D_DEBUG_CL_NO_BIN); set_foreach(job->bos, entry) { struct v3d_bo *bo = (void *)entry->key; @@ -534,7 +538,7 @@ v3d_job_submit(struct v3d_context *v3d, struct v3d_job *job) v3d_clif_dump(v3d, job); - if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) { + if (!(unlikely(V3D_DEBUG & V3D_DEBUG_NORAST))) { int ret; ret = v3d_ioctl(v3d->fd, DRM_IOCTL_V3D_SUBMIT_CL, &job->submit); diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_program.c b/mesa 3D driver/src/gallium/drivers/v3d/v3d_program.c index db376cbb3c..3b2f9830b9 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_program.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_program.c @@ -297,7 +297,7 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx, } else { assert(type == PIPE_SHADER_IR_TGSI); - if (V3D_DEBUG & V3D_DEBUG_TGSI) { + if (unlikely(V3D_DEBUG & V3D_DEBUG_TGSI)) { fprintf(stderr, "prog %d TGSI:\n", so->program_id); tgsi_dump(ir, 0); @@ -328,8 +328,8 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx, 
so->base.type = PIPE_SHADER_IR_NIR; so->base.ir.nir = s; - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(s->info.stage))) { + if (unlikely(V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage(s->info.stage)))) { fprintf(stderr, "%s prog %d NIR:\n", gl_shader_stage_name(s->info.stage), so->program_id); @@ -337,7 +337,7 @@ v3d_uncompiled_shader_create(struct pipe_context *pctx, fprintf(stderr, "\n"); } - if (V3D_DEBUG & V3D_DEBUG_PRECOMPILE) + if (unlikely(V3D_DEBUG & V3D_DEBUG_PRECOMPILE)) v3d_shader_precompile(v3d, so); return so; @@ -600,9 +600,8 @@ v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode) if (key->is_points) { key->point_sprite_mask = v3d->rasterizer->base.sprite_coord_enable; - key->point_coord_upper_left = - (v3d->rasterizer->base.sprite_coord_mode == - PIPE_SPRITE_COORD_UPPER_LEFT); + /* this is handled by lower_wpos_pntc */ + key->point_coord_upper_left = false; } struct v3d_compiled_shader *old_fs = v3d->prog.fs; diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_resource.c b/mesa 3D driver/src/gallium/drivers/v3d/v3d_resource.c index d48479460d..0885a2cb80 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_resource.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_resource.c @@ -41,7 +41,7 @@ static void v3d_debug_resource_layout(struct v3d_resource *rsc, const char *caller) { - if (!(V3D_DEBUG & V3D_DEBUG_SURFACE)) + if (!(unlikely(V3D_DEBUG & V3D_DEBUG_SURFACE))) return; struct pipe_resource *prsc = &rsc->base; diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.c b/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.c index 033a6619f0..a3abf5bca8 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.c @@ -123,6 +123,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: case PIPE_CAP_VERTEX_SHADER_SATURATE: case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: + case PIPE_CAP_EMULATE_NONFIXED_PRIMITIVE_RESTART: + case PIPE_CAP_PRIMITIVE_RESTART: case PIPE_CAP_OCCLUSION_QUERY: case PIPE_CAP_POINT_SPRITE: case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: @@ -136,6 +138,7 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: return 1; case PIPE_CAP_TEXTURE_QUERY_LOD: @@ -275,6 +278,10 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_GS_INVOCATIONS: return 32; + case PIPE_CAP_SUPPORTED_PRIM_MODES: + case PIPE_CAP_SUPPORTED_PRIM_MODES_WITH_RESTART: + return screen->prim_types; + default: return u_pipe_screen_get_param_defaults(pscreen, param); } @@ -639,7 +646,8 @@ v3d_screen_is_format_supported(struct pipe_screen *pscreen, } static const nir_shader_compiler_options v3d_nir_options = { - .lower_add_sat = true, + .lower_uadd_sat = true, + .lower_iadd_sat = true, .lower_all_io_to_temps = true, .lower_extract_byte = true, .lower_extract_word = true, @@ -832,6 +840,19 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config, pscreen->get_driver_query_info = v3d_get_driver_query_info; } + /* Generate the bitmask of supported draw primitives. 
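+ *
+ * A sketch of how the mask is meant to be consumed through the new
+ * PIPE_CAP_SUPPORTED_PRIM_MODES query (pscreen assumed in scope):
+ *
+ *   uint32_t prims = pscreen->get_param(pscreen,
+ *                                       PIPE_CAP_SUPPORTED_PRIM_MODES);
+ *   if (!(prims & BITFIELD_BIT(info->mode)))
+ *      ... fall back in shared code rather than per-driver primconvert ...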
*/ + screen->prim_types = BITFIELD_BIT(PIPE_PRIM_POINTS) | + BITFIELD_BIT(PIPE_PRIM_LINES) | + BITFIELD_BIT(PIPE_PRIM_LINE_LOOP) | + BITFIELD_BIT(PIPE_PRIM_LINE_STRIP) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLES) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLE_STRIP) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLE_FAN) | + BITFIELD_BIT(PIPE_PRIM_LINES_ADJACENCY) | + BITFIELD_BIT(PIPE_PRIM_LINE_STRIP_ADJACENCY) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLES_ADJACENCY) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY); + return pscreen; fail: diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.h b/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.h index 3469731a2d..9bf2a06548 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.h +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3d_screen.h @@ -76,6 +76,7 @@ struct v3d_screen { uint32_t bo_size; uint32_t bo_count; + uint32_t prim_types; bool has_csd; bool has_cache_flush; diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3dx_draw.c b/mesa 3D driver/src/gallium/drivers/v3d/v3dx_draw.c index ba94df748f..815e5eff86 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3dx_draw.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3dx_draw.c @@ -28,7 +28,6 @@ #include "util/u_pack_color.h" #include "util/u_prim_restart.h" #include "util/u_upload_mgr.h" -#include "indices/u_primconvert.h" #include "v3d_context.h" #include "v3d_resource.h" @@ -972,14 +971,6 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, } } - if (info->mode >= PIPE_PRIM_QUADS && info->mode <= PIPE_PRIM_POLYGON) { - util_primconvert_save_rasterizer_state(v3d->primconvert, &v3d->rasterizer->base); - util_primconvert_draw_vbo(v3d->primconvert, info, drawid_offset, indirect, draws, num_draws); - perf_debug("Fallback conversion for %d %s vertices\n", - draws[0].count, u_prim_name(info->mode)); - return; - } - /* Before setting up the draw, flush anything writing to the resources * that we read from or reading from resources we write to. 
*/ @@ -1293,7 +1284,7 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, v3d_flush(pctx); } - if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH) + if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)) v3d_flush(pctx); } @@ -1455,7 +1446,7 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) v3d->last_perfmon = v3d->active_perfmon; - if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) { + if (!(unlikely(V3D_DEBUG & V3D_DEBUG_NORAST))) { int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_CSD, &submit); static bool warned = false; diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3dx_format_table.c b/mesa 3D driver/src/gallium/drivers/v3d/v3dx_format_table.c index d2dab33839..89b4449187 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3dx_format_table.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3dx_format_table.c @@ -70,6 +70,7 @@ static const struct v3d_format format_table[] = { FORMAT(R8G8B8A8_SNORM, NO, RGBA8_SNORM, SWIZ_XYZW, 16, 0), FORMAT(R8G8B8X8_SNORM, NO, RGBA8_SNORM, SWIZ_XYZ1, 16, 0), FORMAT(R10G10B10A2_UNORM, RGB10_A2, RGB10_A2, SWIZ_XYZW, 16, 0), + FORMAT(R10G10B10X2_UNORM, RGB10_A2, RGB10_A2, SWIZ_XYZ1, 16, 0), FORMAT(R10G10B10A2_UINT, RGB10_A2UI, RGB10_A2UI, SWIZ_XYZW, 16, 0), FORMAT(A4B4G4R4_UNORM, ABGR4444, RGBA4, SWIZ_XYZW, 16, 0), diff --git a/mesa 3D driver/src/gallium/drivers/v3d/v3dx_state.c b/mesa 3D driver/src/gallium/drivers/v3d/v3dx_state.c index 4c553988b8..3315421b9b 100644 --- a/mesa 3D driver/src/gallium/drivers/v3d/v3dx_state.c +++ b/mesa 3D driver/src/gallium/drivers/v3d/v3dx_state.c @@ -520,6 +520,8 @@ translate_wrap(uint32_t pipe_wrap) return V3D_WRAP_MODE_MIRROR; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return V3D_WRAP_MODE_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return V3D_WRAP_MODE_MIRROR_ONCE; default: unreachable("Unknown wrap mode"); } @@ -1148,6 +1150,7 @@ v3d_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct v3d_context *v3d = v3d_context(pctx); @@ -1160,7 +1163,12 @@ v3d_set_sampler_views(struct pipe_context *pctx, for (i = 0; i < nr; i++) { if (views[i]) new_nr = i + 1; - pipe_sampler_view_reference(&stage_tex->textures[i], views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&stage_tex->textures[i], NULL); + stage_tex->textures[i] = views[i]; + } else { + pipe_sampler_view_reference(&stage_tex->textures[i], views[i]); + } } for (; i < stage_tex->num_textures; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_blit.c b/mesa 3D driver/src/gallium/drivers/vc4/vc4_blit.c index 5e50559743..d7f756f729 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_blit.c +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_blit.c @@ -381,7 +381,7 @@ vc4_yuv_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) /* Unbind the textures, to make sure we don't try to recurse into the * shadow blit. 
*/ - pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 0, 0, NULL); + pctx->set_sampler_views(pctx, PIPE_SHADER_FRAGMENT, 0, 0, 0, false, NULL); pctx->bind_sampler_states(pctx, PIPE_SHADER_FRAGMENT, 0, 0, NULL); util_blitter_custom_shader(vc4->blitter, dst_surf, diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_cl_dump.c b/mesa 3D driver/src/gallium/drivers/vc4/vc4_cl_dump.c index a6ae0cf80d..eea26205d0 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -42,7 +42,7 @@ vc4_dump_cl(void *cl, uint32_t size, bool is_render) }; struct v3d_spec *spec = v3d_spec_load(&devinfo); - struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true); + struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true, false); uint32_t offset = 0, hw_offset = 0; uint8_t *p = cl; diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.c b/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.c index 94969dcb13..75d82f3e3a 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.c +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.c @@ -30,7 +30,6 @@ #include "util/u_memory.h" #include "util/u_blitter.h" #include "util/u_upload_mgr.h" -#include "indices/u_primconvert.h" #include "pipe/p_screen.h" #include "vc4_screen.h" @@ -124,9 +123,6 @@ vc4_context_destroy(struct pipe_context *pctx) if (vc4->blitter) util_blitter_destroy(vc4->blitter); - if (vc4->primconvert) - util_primconvert_destroy(vc4->primconvert); - if (vc4->uploader) u_upload_destroy(vc4->uploader); @@ -206,11 +202,6 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!vc4->blitter) goto fail; - vc4->primconvert = util_primconvert_create(pctx, - (1 << PIPE_PRIM_QUADS) - 1); - if (!vc4->primconvert) - goto fail; - vc4_debug |= saved_shaderdb_flag; vc4->sample_mask = (1 << VC4_MAX_SAMPLES) - 1; diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.h b/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.h index 6fb54d56b9..f471353718 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.h +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_context.h @@ -328,8 +328,6 @@ struct vc4_context { /** bitfield of VC4_DIRTY_* */ uint32_t dirty; - struct primconvert_context *primconvert; - struct hash_table *fs_cache, *vs_cache; struct set *fs_inputs_set; uint32_t next_uncompiled_program_id; diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_draw.c b/mesa 3D driver/src/gallium/drivers/vc4/vc4_draw.c index d01a1c2727..ebaddae762 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_draw.c +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_draw.c @@ -29,7 +29,6 @@ #include "util/u_pack_color.h" #include "util/u_split_draw.h" #include "util/u_upload_mgr.h" -#include "indices/u_primconvert.h" #include "vc4_context.h" #include "vc4_resource.h" @@ -304,29 +303,12 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; struct vc4_context *vc4 = vc4_context(pctx); - struct pipe_draw_info local_info; if (!indirect && !info->primitive_restart && !u_trim_pipe_prim(info->mode, (unsigned*)&draws[0].count)) return; - if (info->mode >= PIPE_PRIM_QUADS) { - if (info->mode == PIPE_PRIM_QUADS && - draws[0].count == 4 && - !vc4->rasterizer->base.flatshade) { - local_info = *info; - local_info.mode = PIPE_PRIM_TRIANGLE_FAN; - info = &local_info; - } else { - util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base); - util_primconvert_draw_vbo(vc4->primconvert, info, 
drawid_offset, indirect, draws, num_draws); - perf_debug("Fallback conversion for %d %s vertices\n", - draws[0].count, u_prim_name(info->mode)); - return; - } - } - /* Before setting up the draw, do any fixup blits necessary. */ vc4_predraw_check_textures(pctx, &vc4->verttex); vc4_predraw_check_textures(pctx, &vc4->fragtex); diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.c b/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.c index 5cfce4da2d..40d49cc6b0 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.c +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.c @@ -204,6 +204,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXRECT: return 0; + case PIPE_CAP_SUPPORTED_PRIM_MODES: + return screen->prim_types; + default: return u_pipe_screen_get_param_defaults(pscreen, param); } @@ -606,6 +609,16 @@ vc4_screen_create(int fd, struct renderonly *ro) pscreen->get_driver_query_info = vc4_get_driver_query_info; } + /* Generate the bitmask of supported draw primitives. */ + screen->prim_types = BITFIELD_BIT(PIPE_PRIM_POINTS) | + BITFIELD_BIT(PIPE_PRIM_LINES) | + BITFIELD_BIT(PIPE_PRIM_LINE_LOOP) | + BITFIELD_BIT(PIPE_PRIM_LINE_STRIP) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLES) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLE_STRIP) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLE_FAN); + + return pscreen; fail: diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.h b/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.h index 2a5a160354..06342f717f 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.h +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_screen.h @@ -92,6 +92,7 @@ struct vc4_screen { uint32_t bo_size; uint32_t bo_count; + uint32_t prim_types; bool has_control_flow; bool has_etc1; bool has_threaded_fs; diff --git a/mesa 3D driver/src/gallium/drivers/vc4/vc4_state.c b/mesa 3D driver/src/gallium/drivers/vc4/vc4_state.c index 6416a19527..ad068494b3 100644 --- a/mesa 3D driver/src/gallium/drivers/vc4/vc4_state.c +++ b/mesa 3D driver/src/gallium/drivers/vc4/vc4_state.c @@ -652,6 +652,7 @@ vc4_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct vc4_context *vc4 = vc4_context(pctx); @@ -664,7 +665,12 @@ vc4_set_sampler_views(struct pipe_context *pctx, for (i = 0; i < nr; i++) { if (views[i]) new_nr = i + 1; - pipe_sampler_view_reference(&stage_tex->textures[i], views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&stage_tex->textures[i], NULL); + stage_tex->textures[i] = views[i]; + } else { + pipe_sampler_view_reference(&stage_tex->textures[i], views[i]); + } } for (; i < stage_tex->num_textures; i++) { diff --git a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl-fails.txt b/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl-fails.txt index e315117d25..794e0ce8aa 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl-fails.txt @@ -3,22 +3,11 @@ dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_center,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail 
-dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail -dEQP-GLES2.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_center,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_corner,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner,Fail -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_dst_x,Fail @@ -29,17 +18,6 @@ dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_src_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_src_dst_y,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_src_x,Fail -dEQP-GLES3.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES3.functional.polygon_offset.fixed24_displacement_with_units,Fail -dEQP-GLES3.functional.polygon_offset.float32_displacement_with_units,Fail -dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide,Fail -dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail dEQP-GLES31.functional.draw_buffers_indexed.random.max_required_draw_buffers.4,Fail dEQP-GLES31.functional.draw_buffers_indexed.random.max_implementation_draw_buffers.8,Fail dEQP-GLES31.functional.image_load_store.buffer.image_size.writeonly_12,Fail diff --git a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl.toml b/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl.toml index aa06b482b1..f65be42cc1 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl.toml +++ b/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gl.toml @@ -8,6 +8,8 @@ deqp_args = [ "--deqp-gl-config-name=rgba8888d24s8ms0", "--deqp-visibility=hidden" ] +version_check = "GL ES 3.2.*git" +renderer_check = "virgl" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles-fails.txt b/mesa 3D 
driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles-fails.txt index 002a1cdbd0..29db955239 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles-fails.txt @@ -3,22 +3,12 @@ dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_center,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail dEQP-GLES2.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_center,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_corner,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner,Fail -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_dst_x,Fail @@ -32,14 +22,6 @@ dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_min_reverse_src_x,Fail dEQP-GLES3.functional.polygon_offset.default_displacement_with_units,Fail dEQP-GLES3.functional.polygon_offset.fixed24_displacement_with_units,Fail dEQP-GLES3.functional.polygon_offset.float32_displacement_with_units,Fail -dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide,Fail -dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail dEQP-GLES31.functional.draw_buffers_indexed.random.max_required_draw_buffers.4,Fail dEQP-GLES31.functional.draw_buffers_indexed.random.max_required_draw_buffers.9,Fail dEQP-GLES31.functional.image_load_store.buffer.image_size.readonly_12,Fail diff --git a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles.toml b/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles.toml index 4a012affa3..a332f152ba 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles.toml +++ b/mesa 3D 
driver/src/gallium/drivers/virgl/ci/deqp-virgl-gles.toml @@ -8,6 +8,8 @@ deqp_args = [ "--deqp-gl-config-name=rgba8888d24s8ms0", "--deqp-visibility=hidden" ] +version_check = "GL ES 3.1.*git" +renderer_check = "virgl" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/gallium/drivers/virgl/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/drivers/virgl/ci/gitlab-ci.yml index 3380c6e857..65ca31ff5d 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/gallium/drivers/virgl/ci/gitlab-ci.yml @@ -9,13 +9,11 @@ virgl-on-gl: variables: - DEQP_VER: gles2 # For renderer check DEQP_SUITE: virgl-gl GPU_VERSION: virgl-gl # Speed rendering a bit, as crosvm is processing requests serially LP_NUM_THREADS: 8 - DEQP_PARALLEL: 3 - DEQP_EXPECTED_RENDERER: virgl + FDO_CI_CONCURRENT: 3 CROSVM_TEST_SCRIPT: "/install/deqp-runner.sh" parallel: 6 tags: @@ -45,8 +43,6 @@ virgl-traces: PIGLIT_REPLAY_DESCRIPTION_FILE: "${CI_PROJECT_DIR}/install/traces-virgl.yml" PIGLIT_REPLAY_DEVICE_NAME: "gl-virgl" PIGLIT_RESULTS: "virgl-replay" - MESA_GLES_VERSION_OVERRIDE: "3.1" - MESA_GLSL_VERSION_OVERRIDE: "310" CROSVM_TEST_SCRIPT: "/install/piglit/run.sh" tags: - kvm diff --git a/mesa 3D driver/src/gallium/drivers/virgl/ci/traces-virgl.yml b/mesa 3D driver/src/gallium/drivers/virgl/ci/traces-virgl.yml index 1857ec0d53..b0cbf98b56 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/ci/traces-virgl.yml +++ b/mesa 3D driver/src/gallium/drivers/virgl/ci/traces-virgl.yml @@ -5,11 +5,11 @@ traces: - path: glmark2/desktop:windows=4:effect=blur:blur-radius=5:passes=1:separable=true.trace expectations: - device: gl-virgl - checksum: 26080879ac8eb63c2c5da3de5fc732f4 + checksum: 2fc8433c4a38b796173bda2bcfb924cc - path: glmark2/jellyfish.trace expectations: - device: gl-virgl - checksum: 91dbe94735a132aeb192ae5c618ddc06 + checksum: 48a2ad1162bf92301cedbe53edf52a6b - path: glxgears/glxgears-2.trace expectations: - device: gl-virgl @@ -17,23 +17,23 @@ traces: - path: gputest/furmark.trace expectations: - device: gl-virgl - checksum: 87ffd45be95f2d55f82325c86ce32f20 + checksum: d5682aaa762a4849f0cae1692623bdcb - path: gputest/pixmark-piano.trace expectations: - device: gl-virgl - checksum: 8293e59b818715ddf1c23e9f60b17851 + checksum: 1bcded27a6ba04fe0f76ff997b98dbc3 - path: gputest/triangle.trace expectations: - device: gl-virgl - checksum: 848436d1a2448bdc76fd6220382d8606 + checksum: 7812de00011a3a059892e36cea19c696 - path: humus/Portals.trace expectations: - device: gl-virgl - checksum: 6d78eed6749f01cc5625dec0ad129c66 + checksum: b697edce7776f1afe294a7e80dfc013e - path: 0ad/0ad.trace expectations: - device: gl-virgl - checksum: 03145ecd597dca6aaf5e113b99687278 + checksum: 5e5bd83446d2554bf25761576d9b1af6 - path: glmark2/buffer:update-fraction=0.5:update-dispersion=0.9:columns=200:update-method=map:interleave=false.trace expectations: - device: gl-virgl @@ -73,7 +73,7 @@ traces: - path: glmark2/desktop:windows=4:effect=shadow.trace expectations: - device: gl-virgl - checksum: f5312a6270ed394b5fcbe66d590f4e49 + checksum: d4b3e8338327859a029c7267c9916524 - path: glmark2/effect2d:kernel=0,1,0;1,-4,1;0,1,0;.trace expectations: - device: gl-virgl @@ -121,7 +121,7 @@ traces: - path: glmark2/refract.trace expectations: - device: gl-virgl - checksum: b77f24a0651ef93e1c4253042ad9c9fa + checksum: b1332df324d0fc1db22b362231d3ed01 - path: glmark2/shading:shading=blinn-phong-inf.trace expectations: - device: gl-virgl @@ -145,11 +145,11 @@ traces: - path: 
glmark2/texture:texture-filter=linear.trace expectations: - device: gl-virgl - checksum: 8a91e4a0d3bb6c797b81ea36e7dac837 + checksum: 914fd8dddb23751d9a187a979d881abb - path: glmark2/texture:texture-filter=mipmap.trace expectations: - device: gl-virgl - checksum: b29190a2f339f0fafb3a20100f58e79e + checksum: ea1939f3c4e8dd9cdbc26d41f9dc891a - path: glmark2/texture:texture-filter=nearest.trace expectations: - device: gl-virgl @@ -166,87 +166,80 @@ traces: - path: gputest/pixmark-julia-fp32.trace expectations: - device: gl-virgl - checksum: dd78a9f15834d0def6c07be648240d85 -# Crash -# - path: gputest/pixmark-julia-fp64.trace -# expectations: -# - device: gl-virgl -# checksum: 0 + checksum: 8b3584b1dd8f1d1bb63205564bd78e4e + - path: gputest/pixmark-julia-fp64.trace + expectations: + - device: gl-virgl + checksum: 73ccaff82ea764057fb0f93f0024cf84 - path: gputest/pixmark-volplosion.trace expectations: - device: gl-virgl - checksum: 592f35e418490990ea88e19c90cf1205 + checksum: 9bedb84d81528e1b4087522de9f70383 - path: gputest/plot3d.trace expectations: - device: gl-virgl - checksum: 0ebf993380210d9e5138f1057ddaa7ab -# Crash -# - path: gputest/tessmark.trace -# expectations: -# - device: gl-virgl -# checksum: 5d04b8d71517238b9bc8a527574e884b + checksum: a1af286874f7060171cb3ca2e765c448 +# Times out +# - path: gputest/tessmark.trace +# expectations: +# - device: gl-virgl +# checksum: 5d04b8d71517238b9bc8a527574e884b - path: humus/AmbientAperture.trace expectations: - device: gl-virgl - checksum: d5635a3660405651a572b5efc6107827 + checksum: b33fb8ee73b0c50b14822e170f15ab8a - path: humus/CelShading.trace expectations: - device: gl-virgl - checksum: 7d56b6ec4a1cf30406bceb9fe3e4d5c8 + checksum: 3629cba72bde53e4275a8365175fde83 - path: humus/DynamicBranching3.trace expectations: - # speckling present on the ground that isn't there on other drivers. 
- device: gl-virgl - checksum: 45f7f54972d196c173763f77082393bb + checksum: 0236b28aa8b26fa60172d71bb040f2e9 - path: humus/HDR.trace expectations: - device: gl-virgl - checksum: c04b8feeb0d41763db0c636373d3ba4e + checksum: eab0801aadeae87ce31aa0d4ff55e8f8 - path: humus/RaytracedShadows.trace expectations: - device: gl-virgl - checksum: a0d3b811a34033465441776d7115462c + checksum: 298e49b697e9141294ecbc6283729d86 - path: humus/VolumetricFogging2.trace expectations: - device: gl-virgl - checksum: 6260e5d1e7e6fc050d48e6a1790c6117 + checksum: 382891c83f2afe4fcbdd2bfd241c1339 -# Crash -# - path: itoral-gl-terrain-demo/demo.trace -# expectations: -# - device: gl-virgl -# checksum: 9571117bf4eab6fe29b12f6c3d42d865 + - path: itoral-gl-terrain-demo/demo.trace + expectations: + - device: gl-virgl + checksum: fe6124227b7f8e4e96ffbbd48c713c42 - path: neverball/neverball.trace expectations: - device: gl-virgl - checksum: 4ad035fd572792648c82903e0c3b87ed -# Crash -# - path: pathfinder/canvas_moire.trace -# expectations: -# - device: gl-virgl -# checksum: 0e32ca8fc815a7250f38a07faeafb21b -# Crash -# - path: pathfinder/canvas_text_v2.trace -# expectations: -# - device: gl-virgl -# checksum: 74129b650bd3ca806ff2dd46813ba6e6 -# Crash -# - path: pathfinder/demo.trace -# expectations: -# - device: gl-virgl -# checksum: f6661ed4de9e0a444c6338ebd0cd3768 + checksum: cc11743f008ccd76adf72695a423436a + - path: pathfinder/canvas_moire.trace + expectations: + - device: gl-virgl + checksum: 2cb5be6a6f62e417f1a89c89180e5728 + - path: pathfinder/canvas_text_v2.trace + expectations: + - device: gl-virgl + checksum: a1446d0c42a78771240fca6f3b1e10d8 + - path: pathfinder/demo.trace + expectations: + - device: gl-virgl + checksum: 0702a66c415cfc13d5bae8bec08402cf - path: supertuxkart/supertuxkart-mansion-egl-gles.trace expectations: - device: gl-virgl - checksum: 0e93531dfb9d18941904ff1f49581ea0 -# Sometimes crashes -# - path: xonotic/xonotic-keybench-high.trace -# expectations: -# - device: gl-virgl -# checksum: e8a52c8f558a0085eb45fcba0f6c59e2 + checksum: 156c26de2cefe1973b1593e6b22f7edb + - path: xonotic/xonotic-keybench-high.trace + expectations: + - device: gl-virgl + checksum: f3b184bf8858a6ebccd09e7ca032197e - path: valve/counterstrike.trace expectations: - device: gl-virgl - checksum: 7f3ae17190d74da032d9a463c738404a + checksum: 3bc0e0e39cb3c29f6d76ff07f1f02860 # Piglit times out when trying to run these two # - path: valve/counterstrike-source.trace # expectations: @@ -259,7 +252,7 @@ traces: - path: valve/portal-2.trace expectations: - device: gl-virgl - checksum: adf249c3d2ee204ef6583641f70816db + checksum: 9f7fecf8df89e105a4d2b4a61468b427 # Piglit crashes when trying to run this one # - path: supertuxkart/supertuxkart-antediluvian-abyss.rdc # expectations: diff --git a/mesa 3D driver/src/gallium/drivers/virgl/meson.build b/mesa 3D driver/src/gallium/drivers/virgl/meson.build index 6526b3b6e3..b3adb12071 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/meson.build +++ b/mesa 3D driver/src/gallium/drivers/virgl/meson.build @@ -37,7 +37,7 @@ libvirgl = static_library( [ files_libvirgl ], gnu_symbol_visibility : 'hidden', include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_virtio], - dependencies : [dep_libdrm, idep_mesautil, idep_xmlconfig], + dependencies : [dep_libdrm, idep_mesautil, idep_xmlconfig, idep_nir], ) driver_virgl = declare_dependency( diff --git a/mesa 3D driver/src/gallium/drivers/virgl/tests/meson.build b/mesa 3D 
driver/src/gallium/drivers/virgl/tests/meson.build index 595e63675a..d047bdc94e 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/tests/meson.build +++ b/mesa 3D driver/src/gallium/drivers/virgl/tests/meson.build @@ -28,4 +28,5 @@ test( link_with : [libvirgl, libgallium], ), suite : ['virgl'], + protocol : gtest_test_protocol, ) diff --git a/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.c b/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.c index dcb0ec62ae..8970c4f37d 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.c +++ b/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.c @@ -24,10 +24,12 @@ #include #include "pipe/p_shader_tokens.h" +#include "compiler/nir/nir.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" +#include "nir/nir_to_tgsi.h" #include "util/u_draw.h" #include "util/u_inlines.h" #include "util/u_memory.h" @@ -41,8 +43,6 @@ #include "tgsi/tgsi_text.h" #include "indices/u_primconvert.h" -#include "pipebuffer/pb_buffer.h" - #include "virgl_encode.h" #include "virgl_context.h" #include "virtio-gpu/virgl_protocol.h" @@ -680,10 +680,19 @@ static void *virgl_shader_encoder(struct pipe_context *ctx, { struct virgl_context *vctx = virgl_context(ctx); uint32_t handle; + const struct tgsi_token *tokens; + const struct tgsi_token *ntt_tokens = NULL; struct tgsi_token *new_tokens; int ret; - new_tokens = virgl_tgsi_transform((struct virgl_screen *)vctx->base.screen, shader->tokens); + if (shader->type == PIPE_SHADER_IR_NIR) { + nir_shader *s = nir_shader_clone(NULL, shader->ir.nir); + ntt_tokens = tokens = nir_to_tgsi(s, vctx->base.screen); /* takes ownership */ + } else { + tokens = shader->tokens; + } + + new_tokens = virgl_tgsi_transform((struct virgl_screen *)vctx->base.screen, tokens); if (!new_tokens) return NULL; @@ -693,9 +702,11 @@ static void *virgl_shader_encoder(struct pipe_context *ctx, &shader->stream_output, 0, new_tokens); if (ret) { + FREE((void *)ntt_tokens); return NULL; } + FREE((void *)ntt_tokens); FREE(new_tokens); return (void *)(unsigned long)handle; @@ -936,8 +947,8 @@ static void virgl_submit_cmd(struct virgl_winsys *vws, } } -static void virgl_flush_eq(struct virgl_context *ctx, void *closure, - struct pipe_fence_handle **fence) +void virgl_flush_eq(struct virgl_context *ctx, void *closure, + struct pipe_fence_handle **fence) { struct virgl_screen *rs = virgl_screen(ctx->base.screen); @@ -1015,6 +1026,7 @@ static void virgl_set_sampler_views(struct pipe_context *ctx, unsigned start_slot, unsigned num_views, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct virgl_context *vctx = virgl_context(ctx); @@ -1028,7 +1040,12 @@ static void virgl_set_sampler_views(struct pipe_context *ctx, struct virgl_resource *res = virgl_resource(views[i]->texture); res->bind_history |= PIPE_BIND_SAMPLER_VIEW; - pipe_sampler_view_reference(&binding->views[idx], views[i]); + if (take_ownership) { + pipe_sampler_view_reference(&binding->views[idx], NULL); + binding->views[idx] = views[i]; + } else { + pipe_sampler_view_reference(&binding->views[idx], views[i]); + } binding->view_enabled_mask |= 1 << idx; } else { pipe_sampler_view_reference(&binding->views[idx], NULL); @@ -1041,7 +1058,7 @@ static void virgl_set_sampler_views(struct pipe_context *ctx, if (unbind_num_trailing_slots) { virgl_set_sampler_views(ctx, shader_type, start_slot + num_views, - unbind_num_trailing_slots, 0, NULL); + unbind_num_trailing_slots, 0, false, 
NULL); } } @@ -1157,6 +1174,13 @@ static void virgl_set_tess_state(struct pipe_context *ctx, virgl_encode_set_tess_state(vctx, default_outer_level, default_inner_level); } +static void virgl_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices) +{ + struct virgl_context *vctx = virgl_context(ctx); + + vctx->patch_vertices = patch_vertices; +} + static void virgl_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, @@ -1340,19 +1364,30 @@ static void *virgl_create_compute_state(struct pipe_context *ctx, { struct virgl_context *vctx = virgl_context(ctx); uint32_t handle; - const struct tgsi_token *new_tokens = state->prog; + const struct tgsi_token *ntt_tokens = NULL; + const struct tgsi_token *tokens; struct pipe_stream_output_info so_info = {}; int ret; + if (state->ir_type == PIPE_SHADER_IR_NIR) { + nir_shader *s = nir_shader_clone(NULL, state->prog); + ntt_tokens = tokens = nir_to_tgsi(s, vctx->base.screen); /* takes ownership */ + } else { + tokens = state->prog; + } + handle = virgl_object_assign_handle(); ret = virgl_encode_shader_state(vctx, handle, PIPE_SHADER_COMPUTE, &so_info, state->req_local_mem, - new_tokens); + tokens); if (ret) { + FREE((void *)ntt_tokens); return NULL; } + FREE((void *)ntt_tokens); + return (void *)(unsigned long)handle; } @@ -1540,6 +1575,7 @@ struct pipe_context *virgl_context_create(struct pipe_screen *pscreen, vctx->base.set_constant_buffer = virgl_set_constant_buffer; vctx->base.set_tess_state = virgl_set_tess_state; + vctx->base.set_patch_vertices = virgl_set_patch_vertices; vctx->base.create_vs_state = virgl_create_vs_state; vctx->base.create_tcs_state = virgl_create_tcs_state; vctx->base.create_tes_state = virgl_create_tes_state; diff --git a/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.h b/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.h index 9acd698bc7..455af6240d 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.h +++ b/mesa 3D driver/src/gallium/drivers/virgl/virgl_context.h @@ -85,6 +85,7 @@ struct virgl_context { struct virgl_staging_mgr staging; bool encoded_transfers; bool supports_staging; + uint8_t patch_vertices; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; unsigned num_vertex_buffers; @@ -137,4 +138,6 @@ void virgl_rebind_resource(struct virgl_context *vctx, struct pipe_resource *res); +void virgl_flush_eq(struct virgl_context *ctx, void *closure, + struct pipe_fence_handle **fence); #endif diff --git a/mesa 3D driver/src/gallium/drivers/virgl/virgl_encode.c b/mesa 3D driver/src/gallium/drivers/virgl/virgl_encode.c index 27595b6817..519b3018ef 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/virgl_encode.c +++ b/mesa 3D driver/src/gallium/drivers/virgl/virgl_encode.c @@ -764,7 +764,7 @@ int virgl_encoder_draw_vbo(struct virgl_context *ctx, else virgl_encoder_write_dword(ctx->cbuf, 0); if (length >= VIRGL_DRAW_VBO_SIZE_TESS) { - virgl_encoder_write_dword(ctx->cbuf, info->vertices_per_patch); /* vertices per patch */ + virgl_encoder_write_dword(ctx->cbuf, ctx->patch_vertices); /* vertices per patch */ virgl_encoder_write_dword(ctx->cbuf, drawid_offset); /* drawid */ } if (length == VIRGL_DRAW_VBO_SIZE_INDIRECT) { diff --git a/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.c b/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.c index 56cfdc6e52..a832f98461 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.c +++ b/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.c @@ -31,6 +31,7 @@ #include 
"util/xmlconfig.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" +#include "nir/nir_to_tgsi.h" #include "tgsi/tgsi_exec.h" @@ -45,10 +46,12 @@ int virgl_debug = 0; static const struct debug_named_value virgl_debug_options[] = { { "verbose", VIRGL_DEBUG_VERBOSE, NULL }, { "tgsi", VIRGL_DEBUG_TGSI, NULL }, + { "nir", VIRGL_DEBUG_NIR, NULL }, { "noemubgra", VIRGL_DEBUG_NO_EMULATE_BGRA, "Disable tweak to emulate BGRA as RGBA on GLES hosts"}, { "nobgraswz", VIRGL_DEBUG_NO_BGRA_DEST_SWIZZLE,"Disable tweak to swizzle emulated BGRA on GLES hosts" }, { "sync", VIRGL_DEBUG_SYNC, "Sync after every flush" }, { "xfer", VIRGL_DEBUG_XFER, "Do not optimize for transfers" }, + { "nocoherent", VIRGL_DEBUG_NO_COHERENT, "Disable coherent memory"}, DEBUG_NAMED_VALUE_END }; DEBUG_GET_ONCE_FLAGS_OPTION(virgl_debug, "VIRGL_DEBUG", virgl_debug_options, 0) @@ -170,6 +173,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: return MIN2(vscreen->caps.caps.v1.glsl_level, 140); case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + return 1; case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: return 0; case PIPE_CAP_COMPUTE: @@ -188,6 +192,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + case PIPE_CAP_NIR_IMAGES_AS_DEREF: return 0; case PIPE_CAP_QUERY_TIMESTAMP: return 1; @@ -300,7 +305,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: return (vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_ARB_BUFFER_STORAGE) && (vscreen->caps.caps.v2.host_feature_check_version >= 4) && - vscreen->vws->supports_coherent; + vscreen->vws->supports_coherent && !vscreen->no_coherent; case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: @@ -423,8 +428,10 @@ virgl_get_shader_param(struct pipe_screen *screen, return vscreen->caps.caps.v2.max_shader_image_frag_compute; else return vscreen->caps.caps.v2.max_shader_image_other_stages; + case PIPE_SHADER_CAP_PREFERRED_IR: + return (virgl_debug & VIRGL_DEBUG_NIR) ? PIPE_SHADER_IR_NIR : PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_SUPPORTED_IRS: - return (1 << PIPE_SHADER_IR_TGSI); + return (1 << PIPE_SHADER_IR_TGSI) | ((virgl_debug & VIRGL_DEBUG_NIR) ? 
(1 << PIPE_SHADER_IR_NIR) : 0); case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: return vscreen->caps.caps.v2.max_atomic_counters[shader]; case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: @@ -800,6 +807,10 @@ static bool virgl_fence_finish(struct pipe_screen *screen, { struct virgl_screen *vscreen = virgl_screen(screen); struct virgl_winsys *vws = vscreen->vws; + struct virgl_context *vctx = virgl_context(ctx); + + if (vctx && timeout) + virgl_flush_eq(vctx, NULL, NULL); return vws->fence_wait(vws, fence, timeout); } @@ -900,13 +911,23 @@ static void virgl_disk_cache_create(struct virgl_screen *screen) { const struct build_id_note *note = build_id_find_nhdr_for_addr(virgl_disk_cache_create); - assert(note && build_id_length(note) == 20); /* sha1 */ + unsigned build_id_len = build_id_length(note); + assert(note && build_id_len == 20); /* sha1 */ const uint8_t *id_sha1 = build_id_data(note); assert(id_sha1); + struct mesa_sha1 sha1_ctx; + _mesa_sha1_init(&sha1_ctx); + _mesa_sha1_update(&sha1_ctx, id_sha1, build_id_len); + + uint32_t shader_debug_flags = virgl_debug & VIRGL_DEBUG_NIR; + _mesa_sha1_update(&sha1_ctx, &shader_debug_flags, sizeof(shader_debug_flags)); + + uint8_t sha1[20]; + _mesa_sha1_final(&sha1_ctx, sha1); char timestamp[41]; - _mesa_sha1_format(timestamp, id_sha1); + _mesa_sha1_format(timestamp, sha1); screen->disk_cache = disk_cache_create("virgl", timestamp, 0); } @@ -954,6 +975,7 @@ virgl_create_screen(struct virgl_winsys *vws, const struct pipe_screen_config *c } screen->tweak_gles_emulate_bgra &= !(virgl_debug & VIRGL_DEBUG_NO_EMULATE_BGRA); screen->tweak_gles_apply_bgra_dest_swizzle &= !(virgl_debug & VIRGL_DEBUG_NO_BGRA_DEST_SWIZZLE); + screen->no_coherent = virgl_debug & VIRGL_DEBUG_NO_COHERENT; screen->vws = vws; screen->base.get_name = virgl_get_name; @@ -962,6 +984,7 @@ virgl_create_screen(struct virgl_winsys *vws, const struct pipe_screen_config *c screen->base.get_shader_param = virgl_get_shader_param; screen->base.get_compute_param = virgl_get_compute_param; screen->base.get_paramf = virgl_get_paramf; + screen->base.get_compiler_options = nir_to_tgsi_get_compiler_options; screen->base.is_format_supported = virgl_is_format_supported; screen->base.destroy = virgl_destroy_screen; screen->base.context_create = virgl_context_create; diff --git a/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.h b/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.h index 22275cbea1..c44509f3ff 100644 --- a/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.h +++ b/mesa 3D driver/src/gallium/drivers/virgl/virgl_screen.h @@ -35,6 +35,8 @@ enum virgl_debug_flags { VIRGL_DEBUG_NO_BGRA_DEST_SWIZZLE = 1 << 3, VIRGL_DEBUG_SYNC = 1 << 4, VIRGL_DEBUG_XFER = 1 << 5, + VIRGL_DEBUG_NO_COHERENT = 1 << 6, + VIRGL_DEBUG_NIR = 1 << 7, }; extern int virgl_debug; @@ -56,6 +58,7 @@ struct virgl_screen { uint32_t sub_ctx_id; bool tweak_gles_emulate_bgra; bool tweak_gles_apply_bgra_dest_swizzle; + bool no_coherent; int32_t tweak_gles_tf3_value; struct disk_cache *disk_cache; diff --git a/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp-fails.txt b/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp-fails.txt index 3a8736cc74..13aeb75e87 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp-fails.txt @@ -1,19 +1,11 @@ dEQP-GLES2.functional.clipping.point.wide_point_clip,Fail dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_center,Fail 
dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_three.clip_pos_x_and_neg_y_neg_z_and_pos_y_pos_z,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail -dEQP-GLES2.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail -dEQP-GLES2.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES3.functional.polygon_offset.default_displacement_with_units,Fail -dEQP-GLES3.functional.polygon_offset.float32_displacement_with_units,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_center,Fail dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_corner,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center,Fail dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner,Fail -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_neg_y_neg_z_and_neg_x_neg_y_pos_z,Fail -dEQP-GLES3.functional.clipping.triangle_vertex.clip_two.clip_pos_y_pos_z_and_neg_x_neg_y_neg_z,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_dst_x,Fail dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_dst_x,Fail @@ -28,14 +20,6 @@ dEQP-GLES3.functional.multisample.fbo_4_samples.proportionality_sample_coverage, dEQP-GLES3.functional.multisample.fbo_4_samples.sample_coverage_invert,Fail dEQP-GLES3.functional.multisample.fbo_max_samples.proportionality_sample_coverage,Fail dEQP-GLES3.functional.multisample.fbo_max_samples.sample_coverage_invert,Fail -dEQP-GLES3.functional.rasterization.fbo.rbo_singlesample.interpolation.lines_wide,Fail -dEQP-GLES3.functional.rasterization.fbo.texture_2d.interpolation.lines_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES3.functional.rasterization.interpolation.projected.lines_wide,Fail KHR-GL32.transform_feedback.capture_geometry_separate_test,Fail KHR-GL32.transform_feedback.capture_vertex_interleaved_test,Fail KHR-GL32.transform_feedback.capture_vertex_separate_test,Fail @@ -47,25 +31,3 @@ KHR-GL32.transform_feedback.query_vertex_interleaved_test,Fail KHR-GL32.transform_feedback.query_vertex_separate_test,Fail dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_center,Fail dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.basic.lines_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_loop_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.line_strip_wide,Fail -dEQP-GLES2.functional.rasterization.interpolation.projected.lines_wide,Fail -KHR-GL32.packed_pixels.pbo_rectangle.r16i,Fail -KHR-GL32.packed_pixels.pbo_rectangle.r16ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.r32i,Fail -KHR-GL32.packed_pixels.pbo_rectangle.r32ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.r8ui,Fail 
-KHR-GL32.packed_pixels.pbo_rectangle.rg16i,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rg16ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rg32i,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rg32ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rg8ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rgb10_a2ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rgba16i,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rgba16ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rgba32i,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rgba32ui,Fail -KHR-GL32.packed_pixels.pbo_rectangle.rgba8ui,Fail diff --git a/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp.toml b/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp.toml index 8c902ef073..c31b323ba7 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp.toml +++ b/mesa 3D driver/src/gallium/drivers/zink/ci/deqp-zink-lvp.toml @@ -9,6 +9,8 @@ deqp_args = [ "--deqp-visibility=hidden" ] timeout = 180.0 +version_check = "GL ES 3.2.*git" +renderer_check = "zink.*llvmpipe" [[deqp]] deqp = "/deqp/modules/gles3/deqp-gles3" diff --git a/mesa 3D driver/src/gallium/drivers/zink/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/drivers/zink/ci/gitlab-ci.yml index a4132cd75d..83c966cee2 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/ci/gitlab-ci.yml +++ b/mesa 3D driver/src/gallium/drivers/zink/ci/gitlab-ci.yml @@ -30,6 +30,12 @@ zink-piglit-no_timelines: script: - xvfb-run --server-args='-noreset' sh -c "ZINK_NO_TIMELINES=1 GALLIUM_DRIVER=zink VK_DRIVER=lvp install/piglit/piglit-runner.sh" +zink-piglit-lazy: + extends: + - .zink-piglit-quick_gl + script: + - xvfb-run --server-args='-noreset' sh -c "ZINK_DESCRIPTORS=lazy GALLIUM_DRIVER=zink VK_DRIVER=lvp install/piglit/piglit-runner.sh" + zink-lvp-deqp: extends: - .test-gl @@ -38,7 +44,5 @@ zink-lvp-deqp: variables: GALLIUM_DRIVER: "zink" # move here due to bad xvfb-run interactions VK_DRIVER: lvp # Don't move to the top level, piglit runs do funny stuff with VK_DRIVER set - DEQP_EXPECTED_RENDERER: "zink.*llvmpipe" - DEQP_VER: gles2 DEQP_SUITE: zink-lvp parallel: 2 diff --git a/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-fails.txt b/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-fails.txt index 11b8b33399..057e15bc34 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-fails.txt +++ b/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-fails.txt @@ -1,11 +1,9 @@ glx@glx-copy-sub-buffer,Fail glx@glx-copy-sub-buffer samples=2,Fail glx@glx-copy-sub-buffer samples=4,Fail -glx@glx-multi-window-single-context,Fail glx@glx-multithread-texture,Fail glx@glx-swap-copy,Fail glx@glx-swap-pixmap-bad,Fail -glx@glx-tfp,Crash glx@glx-visuals-depth,Crash glx@glx-visuals-depth -pixmap,Crash glx@glx-visuals-stencil,Crash @@ -22,7 +20,6 @@ glx@glx_ext_import_context@imported context has same context id,Fail glx@glx_ext_import_context@make current- multi process,Fail glx@glx_ext_import_context@make current- single process,Fail glx@glx_ext_import_context@query context info,Fail -shaders@glsl-bug-110796,Fail shaders@glsl-fs-pointcoord,Fail shaders@point-vertex-id divisor,Fail shaders@point-vertex-id gl_instanceid,Fail @@ -79,7 +76,6 @@ spec@!opengl 3.2@gl-3.2-adj-prims pv-first,Fail spec@!opengl es 2.0@glsl-fs-pointcoord,Fail spec@!opengl es 3.0@gles-3.0-transform-feedback-uniform-buffer-object,Fail spec@arb_depth_texture@depth-tex-modes,Fail -spec@arb_framebuffer_object@fbo-blit-scaled-linear,Fail spec@arb_framebuffer_object@fbo-gl_pointcoord,Fail 
spec@arb_get_program_binary@restore-sso-program,Fail spec@arb_gpu_shader_fp64@execution@arb_gpu_shader_fp64-tf-separate,Fail @@ -131,7 +127,6 @@ spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY16F_ARB,Fail spec@arb_texture_float@fbo-blending-formats@GL_INTENSITY32F_ARB,Fail spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE16F_ARB,Fail spec@arb_texture_float@fbo-blending-formats@GL_LUMINANCE32F_ARB,Fail -spec@arb_texture_float@fbo-blending-formats@GL_RGB16F,Fail spec@arb_texture_float@fbo-blending-formats@GL_RGB32F,Fail spec@arb_texture_rg@multisample-fast-clear gl_arb_texture_rg-int,Fail spec@arb_texture_view@rendering-formats,Fail @@ -452,8 +447,6 @@ spec@ext_framebuffer_object@fbo-blending-formats@GL_INTENSITY8,Fail spec@ext_framebuffer_object@fbo-blending-formats@GL_LUMINANCE12,Fail spec@ext_framebuffer_object@fbo-blending-formats@GL_LUMINANCE16,Fail spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB10,Fail -spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB12,Fail -spec@ext_framebuffer_object@fbo-blending-formats@GL_RGB16,Fail spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export,Fail spec@ext_image_dma_buf_import@ext_image_dma_buf_import-export-tex,Fail spec@ext_image_dma_buf_import@ext_image_dma_buf_import-intel_external_sampler_only,Fail @@ -492,7 +485,6 @@ spec@ext_texture_snorm@fbo-blending-formats@GL_INTENSITY_SNORM,Fail spec@ext_texture_snorm@fbo-blending-formats@GL_LUMINANCE16_SNORM,Fail spec@ext_texture_snorm@fbo-blending-formats@GL_LUMINANCE8_SNORM,Fail spec@ext_texture_snorm@fbo-blending-formats@GL_LUMINANCE_SNORM,Fail -spec@ext_texture_snorm@fbo-blending-formats@GL_RGB16_SNORM,Fail spec@ext_texture_swizzle@depth_texture_mode_and_swizzle,Fail spec@ext_transform_feedback2@counting with pause,Fail spec@ext_transform_feedback@generatemipmap prims_generated,Fail @@ -548,6 +540,10 @@ spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageLoad,Fail spec@oes_egl_image_external_essl3@oes_egl_image_external_essl3@oes_egl_image_external_essl3_imageStore,Fail spec@oes_texture_view@rendering-formats,Fail +spec@oes_texture_view@rendering-formats@clear GL_RGB16 as GL_RGB16F,Fail +spec@oes_texture_view@rendering-formats@clear GL_RGB16I as GL_RGB16F,Fail +spec@oes_texture_view@rendering-formats@clear GL_RGB16UI as GL_RGB16F,Fail +spec@oes_texture_view@rendering-formats@clear GL_RGB16_SNORM as GL_RGB16F,Fail spec@oes_texture_view@rendering-formats@clear GL_R16 as GL_R16F,Fail spec@oes_texture_view@rendering-formats@clear GL_R16 as GL_R16I,Fail spec@oes_texture_view@rendering-formats@clear GL_R16 as GL_R16UI,Fail diff --git a/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-flakes.txt b/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-flakes.txt index 104c2c0cc9..076954365a 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-flakes.txt +++ b/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-flakes.txt @@ -1,4 +1,2 @@ spec@khr_debug@push-pop-group_gl.* -# I can't reproduce these crashes locally -# even after running them in loops for 4+ hours, so disable for now -spec@arb_shader_texture_lod@execution@tex-miplevel-selection* +glx@glx-multi-window-single-context diff --git a/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-skips.txt b/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-skips.txt index c15f4acd69..eccec4ae50 100644 --- a/mesa 3D 
driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-skips.txt +++ b/mesa 3D driver/src/gallium/drivers/zink/ci/piglit-zink-lvp-skips.txt @@ -25,3 +25,7 @@ glx@glx-multi-context-single-window # This one takes too long, but passes. There's other tests that don't # try all the combinations, so that's probably enough. spec@arb_compute_shader@local-id-explosion + +# I can't reproduce these crashes locally +# even after running them in loops for 4+ hours, so disable for now +.*tex-miplevel-selection.* diff --git a/mesa 3D driver/src/gallium/drivers/zink/meson.build b/mesa 3D driver/src/gallium/drivers/zink/meson.build index 87501df180..9dafc4f969 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/meson.build +++ b/mesa 3D driver/src/gallium/drivers/zink/meson.build @@ -92,8 +92,11 @@ libzink = static_library( 'zink', [files_libzink, zink_device_info, zink_instance, zink_nir_algebraic_c, vk_dispatch_table], gnu_symbol_visibility : 'hidden', - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_vulkan_wsi, inc_vulkan_util, inc_zink_vk], - dependencies: [dep_vulkan, idep_nir_headers, idep_mesautil, idep_vulkan_util_headers], + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_vulkan_util, inc_zink_vk], + dependencies: [ + dep_vulkan, idep_nir_headers, idep_mesautil, idep_vulkan_util_headers, + idep_vulkan_wsi_headers, dep_libdrm + ], c_args: zink_c_args, ) diff --git a/mesa 3D driver/src/gallium/drivers/zink/nir_lower_dynamic_bo_access.c b/mesa 3D driver/src/gallium/drivers/zink/nir_lower_dynamic_bo_access.c index 157b67bb2c..cc38565b15 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/nir_lower_dynamic_bo_access.c +++ b/mesa 3D driver/src/gallium/drivers/zink/nir_lower_dynamic_bo_access.c @@ -49,7 +49,7 @@ recursive_generate_bo_ssa_def(nir_builder *b, nir_intrinsic_instr *instr, nir_ss new_instr->src[0] = nir_src_for_ssa(nir_imm_int(b, start)); for (unsigned i = 0; i < nir_intrinsic_infos[instr->intrinsic].num_srcs; i++) { if (i) - nir_src_copy(&new_instr->src[i], &instr->src[i], &new_instr->instr); + nir_src_copy(&new_instr->src[i], &instr->src[i]); } if (instr->intrinsic != nir_intrinsic_load_ubo_vec4) { nir_intrinsic_set_align(new_instr, nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr)); diff --git a/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c b/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c index 4679b5875c..22d4525cfb 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c +++ b/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c @@ -49,9 +49,10 @@ struct ntv_context { gl_shader_stage stage; const struct zink_so_info *so_info; - SpvId ubos[128]; + SpvId ubos[PIPE_MAX_CONSTANT_BUFFERS][3]; //8, 16, 32 + nir_variable *ubo_vars[PIPE_MAX_CONSTANT_BUFFERS]; - SpvId ssbos[PIPE_MAX_SHADER_BUFFERS]; + SpvId ssbos[PIPE_MAX_SHADER_BUFFERS][3]; //8, 16, 32 nir_variable *ssbo_vars[PIPE_MAX_SHADER_BUFFERS]; SpvId image_types[PIPE_MAX_SAMPLERS]; SpvId images[PIPE_MAX_SAMPLERS]; @@ -240,14 +241,14 @@ emit_float_const(struct ntv_context *ctx, int bit_size, double value) static SpvId emit_uint_const(struct ntv_context *ctx, int bit_size, uint64_t value) { - assert(bit_size == 16 || bit_size == 32 || bit_size == 64); + assert(bit_size == 8 || bit_size == 16 || bit_size == 32 || bit_size == 64); return spirv_builder_const_uint(&ctx->builder, bit_size, value); } static SpvId emit_int_const(struct ntv_context 
*ctx, int bit_size, int64_t value) { - assert(bit_size == 16 || bit_size == 32 || bit_size == 64); + assert(bit_size == 8 || bit_size == 16 || bit_size == 32 || bit_size == 64); return spirv_builder_const_int(&ctx->builder, bit_size, value); } @@ -268,7 +269,7 @@ get_fvec_type(struct ntv_context *ctx, unsigned bit_size, unsigned num_component static SpvId get_ivec_type(struct ntv_context *ctx, unsigned bit_size, unsigned num_components) { - assert(bit_size == 16 || bit_size == 32 || bit_size == 64); + assert(bit_size == 8 || bit_size == 16 || bit_size == 32 || bit_size == 64); SpvId int_type = spirv_builder_type_int(&ctx->builder, bit_size); if (num_components > 1) @@ -282,7 +283,7 @@ get_ivec_type(struct ntv_context *ctx, unsigned bit_size, unsigned num_component static SpvId get_uvec_type(struct ntv_context *ctx, unsigned bit_size, unsigned num_components) { - assert(bit_size == 16 || bit_size == 32 || bit_size == 64); + assert(bit_size == 8 || bit_size == 16 || bit_size == 32 || bit_size == 64); SpvId uint_type = spirv_builder_type_uint(&ctx->builder, bit_size); if (num_components > 1) @@ -304,6 +305,7 @@ get_storage_class(struct nir_variable *var) case nir_var_shader_out: return SpvStorageClassOutput; case nir_var_uniform: + case nir_var_image: return SpvStorageClassUniformConstant; default: unreachable("Unsupported nir_variable_mode"); @@ -673,6 +675,8 @@ type_to_dim(enum glsl_sampler_dim gdim, bool *is_ms) case GLSL_SAMPLER_DIM_MS: *is_ms = true; return SpvDim2D; + case GLSL_SAMPLER_DIM_SUBPASS: + return SpvDimSubpassData; default: fprintf(stderr, "unknown sampler type %d\n", gdim); break; @@ -797,15 +801,16 @@ get_image_format(struct ntv_context *ctx, enum pipe_format format) return ret; } -static void -emit_image(struct ntv_context *ctx, struct nir_variable *var) +static SpvId +get_bare_image_type(struct ntv_context *ctx, struct nir_variable *var, bool is_sampler) { const struct glsl_type *type = glsl_without_array(var->type); bool is_ms; - bool is_sampler = glsl_type_is_sampler(type); - if (!is_sampler && !var->data.image.format) { + if (var->data.fb_fetch_output) { + spirv_builder_emit_cap(&ctx->builder, SpvCapabilityInputAttachment); + } else if (!is_sampler && !var->data.image.format) { if (!(var->data.access & ACCESS_NON_WRITEABLE)) spirv_builder_emit_cap(&ctx->builder, SpvCapabilityStorageImageWriteWithoutFormat); if (!(var->data.access & ACCESS_NON_READABLE)) @@ -818,12 +823,29 @@ emit_image(struct ntv_context *ctx, struct nir_variable *var) spirv_builder_emit_cap(&ctx->builder, SpvCapabilityImageCubeArray); SpvId result_type = get_glsl_basetype(ctx, glsl_get_sampler_result_type(type)); - SpvId image_type = spirv_builder_type_image(&ctx->builder, result_type, + return spirv_builder_type_image(&ctx->builder, result_type, dimension, false, arrayed, is_ms, is_sampler ? 1 : 2, get_image_format(ctx, var->data.image.format)); +} +static SpvId +get_image_type(struct ntv_context *ctx, struct nir_variable *var, bool is_sampler) +{ + SpvId image_type = get_bare_image_type(ctx, var, is_sampler); + return is_sampler ? spirv_builder_type_sampled_image(&ctx->builder, image_type) : image_type; +} + +static SpvId +emit_image(struct ntv_context *ctx, struct nir_variable *var, bool bindless) +{ + if (var->data.bindless) + return 0; + const struct glsl_type *type = glsl_without_array(var->type); + + bool is_sampler = glsl_type_is_sampler(type); + SpvId image_type = get_bare_image_type(ctx, var, is_sampler); SpvId var_type = is_sampler ? 
spirv_builder_type_sampled_image(&ctx->builder, image_type) : image_type; int index = var->data.driver_location; @@ -831,7 +853,7 @@ emit_image(struct ntv_context *ctx, struct nir_variable *var) assert(!is_sampler || !ctx->sampler_types[index]); assert(is_sampler || !ctx->image_types[index]); - if (glsl_type_is_array(var->type)) { + if (!bindless && glsl_type_is_array(var->type)) { var_type = spirv_builder_type_array(&ctx->builder, var_type, emit_uint_const(ctx, 32, glsl_get_aoa_size(var->type))); spirv_builder_emit_array_stride(&ctx->builder, var_type, sizeof(void*)); @@ -847,6 +869,13 @@ emit_image(struct ntv_context *ctx, struct nir_variable *var) if (var->name) spirv_builder_emit_name(&ctx->builder, var_id, var->name); + if (var->data.fb_fetch_output) + spirv_builder_emit_input_attachment_index(&ctx->builder, var_id, var->data.index); + + if (bindless) + return var_id; + + _mesa_hash_table_insert(ctx->vars, var, (void *)(intptr_t)var_id); if (is_sampler) { ctx->sampler_types[index] = image_type; ctx->samplers[index] = var_id; @@ -854,7 +883,6 @@ emit_image(struct ntv_context *ctx, struct nir_variable *var) } else { ctx->image_types[index] = image_type; ctx->images[index] = var_id; - _mesa_hash_table_insert(ctx->vars, var, (void *)(intptr_t)var_id); uint32_t *key = ralloc_size(ctx->mem_ctx, sizeof(uint32_t)); *key = var_id; _mesa_hash_table_insert(ctx->image_vars, key, var); @@ -867,46 +895,52 @@ emit_image(struct ntv_context *ctx, struct nir_variable *var) spirv_builder_emit_descriptor_set(&ctx->builder, var_id, var->data.descriptor_set); spirv_builder_emit_binding(&ctx->builder, var_id, var->data.binding); + return var_id; } static SpvId -get_sized_uint_array_type(struct ntv_context *ctx, unsigned array_size) +get_sized_uint_array_type(struct ntv_context *ctx, unsigned array_size, unsigned bitsize) { SpvId array_length = emit_uint_const(ctx, 32, array_size); - SpvId array_type = spirv_builder_type_array(&ctx->builder, get_uvec_type(ctx, 32, 1), + SpvId array_type = spirv_builder_type_array(&ctx->builder, get_uvec_type(ctx, bitsize, 1), array_length); - spirv_builder_emit_array_stride(&ctx->builder, array_type, 4); + spirv_builder_emit_array_stride(&ctx->builder, array_type, bitsize / 8); return array_type; } static SpvId -get_bo_array_type(struct ntv_context *ctx, struct nir_variable *var) +get_bo_array_type(struct ntv_context *ctx, struct nir_variable *var, unsigned bitsize) { + assert(bitsize); SpvId array_type; - SpvId uint_type = spirv_builder_type_uint(&ctx->builder, 32); - if (glsl_type_is_unsized_array(var->type)) { - array_type = spirv_builder_type_runtime_array(&ctx->builder, uint_type); - spirv_builder_emit_array_stride(&ctx->builder, array_type, 4); - } else { - uint32_t array_size = glsl_get_length(glsl_get_struct_field(var->interface_type, 0)); - array_type = get_sized_uint_array_type(ctx, array_size); + const struct glsl_type *type = var->type; + if (!glsl_type_is_unsized_array(type)) { + type = glsl_get_struct_field(var->interface_type, 0); + if (!glsl_type_is_unsized_array(type)) { + uint32_t array_size = glsl_get_length(type) * (bitsize / 4); + assert(array_size); + return get_sized_uint_array_type(ctx, array_size, bitsize); + } } + SpvId uint_type = spirv_builder_type_uint(&ctx->builder, bitsize); + array_type = spirv_builder_type_runtime_array(&ctx->builder, uint_type); + spirv_builder_emit_array_stride(&ctx->builder, array_type, bitsize / 8); return array_type; } static SpvId -get_bo_struct_type(struct ntv_context *ctx, struct nir_variable *var) 
+get_bo_struct_type(struct ntv_context *ctx, struct nir_variable *var, unsigned bitsize) { - SpvId array_type = get_bo_array_type(ctx, var); + SpvId array_type = get_bo_array_type(ctx, var, bitsize); bool ssbo = var->data.mode == nir_var_mem_ssbo; // wrap UBO-array in a struct SpvId runtime_array = 0; - if (ssbo) { + if (ssbo && glsl_get_length(var->interface_type) > 1) { const struct glsl_type *last_member = glsl_get_struct_field(var->interface_type, glsl_get_length(var->interface_type) - 1); if (glsl_type_is_unsized_array(last_member)) { bool is_64bit = glsl_type_is_64bit(glsl_without_array(last_member)); - runtime_array = spirv_builder_type_runtime_array(&ctx->builder, get_uvec_type(ctx, is_64bit ? 64 : 32, 1)); + runtime_array = spirv_builder_type_runtime_array(&ctx->builder, get_uvec_type(ctx, is_64bit ? 64 : bitsize, 1)); spirv_builder_emit_array_stride(&ctx->builder, runtime_array, glsl_get_explicit_stride(last_member)); } } @@ -933,11 +967,14 @@ get_bo_struct_type(struct ntv_context *ctx, struct nir_variable *var) } static void -emit_bo(struct ntv_context *ctx, struct nir_variable *var) +emit_bo(struct ntv_context *ctx, struct nir_variable *var, unsigned force_bitsize) { bool ssbo = var->data.mode == nir_var_mem_ssbo; + unsigned bitsize = force_bitsize ? force_bitsize : 32; + unsigned idx = bitsize >> 4; + assert(idx < ARRAY_SIZE(ctx->ssbos[0])); - SpvId pointer_type = get_bo_struct_type(ctx, var); + SpvId pointer_type = get_bo_struct_type(ctx, var, bitsize); SpvId var_id = spirv_builder_emit_var(&ctx->builder, pointer_type, ssbo ? SpvStorageClassStorageBuffer : SpvStorageClassUniform); @@ -945,12 +982,13 @@ emit_bo(struct ntv_context *ctx, struct nir_variable *var) spirv_builder_emit_name(&ctx->builder, var_id, var->name); if (ssbo) { - assert(!ctx->ssbos[var->data.driver_location]); - ctx->ssbos[var->data.driver_location] = var_id; + assert(!ctx->ssbos[var->data.driver_location][idx]); + ctx->ssbos[var->data.driver_location][idx] = var_id; ctx->ssbo_vars[var->data.driver_location] = var; } else { - assert(!ctx->ubos[var->data.driver_location]); - ctx->ubos[var->data.driver_location] = var_id; + assert(!ctx->ubos[var->data.driver_location][idx]); + ctx->ubos[var->data.driver_location][idx] = var_id; + ctx->ubo_vars[var->data.driver_location] = var; } if (ctx->spirv_1_4_interfaces) { assert(ctx->num_entry_ifaces < ARRAY_SIZE(ctx->entry_ifaces)); @@ -965,12 +1003,13 @@ static void emit_uniform(struct ntv_context *ctx, struct nir_variable *var) { if (var->data.mode == nir_var_mem_ubo || var->data.mode == nir_var_mem_ssbo) - emit_bo(ctx, var); + emit_bo(ctx, var, 0); else { - assert(var->data.mode == nir_var_uniform); + assert(var->data.mode == nir_var_uniform || + var->data.mode == nir_var_image); const struct glsl_type *type = glsl_without_array(var->type); if (glsl_type_is_sampler(type) || glsl_type_is_image(type)) - emit_image(ctx, var); + emit_image(ctx, var, false); } } @@ -979,7 +1018,7 @@ get_vec_from_bit_size(struct ntv_context *ctx, uint32_t bit_size, uint32_t num_c { if (bit_size == 1) return get_bvec_type(ctx, num_components); - if (bit_size == 16 || bit_size == 32 || bit_size == 64) + if (bit_size == 8 || bit_size == 16 || bit_size == 32 || bit_size == 64) return get_uvec_type(ctx, bit_size, num_components); unreachable("unhandled register bit size"); return 0; @@ -1051,7 +1090,7 @@ get_alu_src_raw(struct ntv_context *ctx, nir_alu_instr *alu, unsigned src) return def; int bit_size = nir_src_bit_size(alu->src[src].src); - assert(bit_size == 1 || bit_size == 16 || bit_size 
== 32 || bit_size == 64); + assert(bit_size == 1 || bit_size == 8 || bit_size == 16 || bit_size == 32 || bit_size == 64); SpvId raw_type = bit_size == 1 ? spirv_builder_type_bool(&ctx->builder) : spirv_builder_type_uint(&ctx->builder, bit_size); @@ -1459,7 +1498,7 @@ static SpvId get_ivec_constant(struct ntv_context *ctx, unsigned bit_size, unsigned num_components, int64_t value) { - assert(bit_size == 16 || bit_size == 32 || bit_size == 64); + assert(bit_size == 8 || bit_size == 16 || bit_size == 32 || bit_size == 64); SpvId result = emit_int_const(ctx, bit_size, value); if (num_components == 1) @@ -1614,6 +1653,7 @@ emit_alu(struct ntv_context *ctx, nir_alu_instr *alu) UNOP(nir_op_u2f32, SpvOpConvertUToF) UNOP(nir_op_i2i16, SpvOpSConvert) UNOP(nir_op_i2i32, SpvOpSConvert) + UNOP(nir_op_u2u8, SpvOpUConvert) UNOP(nir_op_u2u16, SpvOpUConvert) UNOP(nir_op_u2u32, SpvOpUConvert) UNOP(nir_op_f2f16, SpvOpFConvert) @@ -1792,6 +1832,12 @@ emit_alu(struct ntv_context *ctx, nir_alu_instr *alu) result = emit_select(ctx, dest_type, src[0], src[1], src[2]); break; + case nir_op_pack_half_2x16_split: { + SpvId fvec = spirv_builder_emit_composite_construct(&ctx->builder, get_fvec_type(ctx, 32, 2), + src, 2); + result = emit_builtin_unop(ctx, GLSLstd450PackHalf2x16, dest_type, fvec); + break; + } case nir_op_vec2: case nir_op_vec3: case nir_op_vec4: { @@ -1865,13 +1911,24 @@ emit_load_const(struct ntv_context *ctx, nir_load_const_instr *load_const) static void emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr) { - ASSERTED nir_const_value *const_block_index = nir_src_as_const_value(intr->src[0]); + nir_const_value *const_block_index = nir_src_as_const_value(intr->src[0]); bool ssbo = intr->intrinsic == nir_intrinsic_load_ssbo; assert(const_block_index); // no dynamic indexing for now - SpvId bo = ssbo ? ctx->ssbos[const_block_index->u32] : ctx->ubos[const_block_index->u32]; + unsigned idx = 0; unsigned bit_size = nir_dest_bit_size(intr->dest); - SpvId uint_type = get_uvec_type(ctx, 32, 1); + idx = MIN2(bit_size, 32) >> 4; + if (ssbo) { + assert(idx < ARRAY_SIZE(ctx->ssbos[0])); + if (!ctx->ssbos[const_block_index->u32][idx]) + emit_bo(ctx, ctx->ssbo_vars[const_block_index->u32], nir_dest_bit_size(intr->dest)); + } else { + assert(idx < ARRAY_SIZE(ctx->ubos[0])); + if (!ctx->ubos[const_block_index->u32][idx]) + emit_bo(ctx, ctx->ubo_vars[const_block_index->u32], nir_dest_bit_size(intr->dest)); + } + SpvId bo = ssbo ? 
ctx->ssbos[const_block_index->u32][idx] : ctx->ubos[const_block_index->u32][idx]; + SpvId uint_type = get_uvec_type(ctx, MIN2(bit_size, 32), 1); SpvId one = emit_uint_const(ctx, 32, 1); /* number of components being loaded */ @@ -1885,7 +1942,7 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr) /* destination type for the load */ SpvId type = get_dest_uvec_type(ctx, &intr->dest); /* an id of an array member in bytes */ - SpvId uint_size = emit_uint_const(ctx, 32, sizeof(uint32_t)); + SpvId uint_size = emit_uint_const(ctx, 32, MIN2(bit_size, 32) / 8); /* we grab a single array member at a time, so it's a pointer to a uint */ SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, @@ -1962,6 +2019,101 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr) store_dest(ctx, &intr->dest, result, nir_type_uint); } +static void +emit_store_ssbo(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + /* TODO: would be great to refactor this in with emit_load_bo() */ + + nir_const_value *const_block_index = nir_src_as_const_value(intr->src[1]); + assert(const_block_index); + + unsigned idx = MIN2(nir_src_bit_size(intr->src[0]), 32) >> 4; + assert(idx < ARRAY_SIZE(ctx->ssbos[0])); + if (!ctx->ssbos[const_block_index->u32][idx]) + emit_bo(ctx, ctx->ssbo_vars[const_block_index->u32], nir_src_bit_size(intr->src[0])); + SpvId bo = ctx->ssbos[const_block_index->u32][idx]; + + unsigned bit_size = nir_src_bit_size(intr->src[0]); + SpvId uint_type = get_uvec_type(ctx, 32, 1); + SpvId one = emit_uint_const(ctx, 32, 1); + + /* number of components being stored */ + unsigned wrmask = nir_intrinsic_write_mask(intr); + unsigned num_components = util_bitcount(wrmask); + + /* we need to grab 2x32 to fill the 64bit value */ + bool is_64bit = bit_size == 64; + + /* an id of an array member in bytes */ + SpvId uint_size = emit_uint_const(ctx, 32, MIN2(bit_size, 32) / 8); + /* we grab a single array member at a time, so it's a pointer to a uint */ + SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, + SpvStorageClassStorageBuffer, + get_uvec_type(ctx, MIN2(bit_size, 32), 1)); + + /* our generated uniform has a memory layout like + * + * struct { + * uint base[array_size]; + * }; + * + * where 'array_size' is set as though every member of the ubo takes up a vec4, + * even if it's only a vec2 or a float. + * + * first, access 'base' + */ + SpvId member = emit_uint_const(ctx, 32, 0); + /* this is the offset (in bytes) that we're accessing: + * it may be a const value or it may be dynamic in the shader + */ + SpvId offset = get_src(ctx, &intr->src[2]); + /* calculate byte offset */ + SpvId vec_offset = emit_binop(ctx, SpvOpUDiv, uint_type, offset, uint_size); + + SpvId value = get_src(ctx, &intr->src[0]); + /* OpAccessChain takes an array of indices that drill into a hierarchy based on the type: + * index 0 is accessing 'base' + * index 1 is accessing 'base[index 1]' + * index 2 is accessing 'base[index 1][index 2]' + * + * we must perform the access this way in case src[1] is dynamic because there's + * no other spirv method for using an id to access a member of a composite, as + * (composite|vector)_extract both take literals + */ + unsigned write_count = 0; + SpvId src_base_type = get_uvec_type(ctx, bit_size, 1); + for (unsigned i = 0; write_count < num_components; i++) { + if (wrmask & (1 << i)) { + SpvId component = nir_src_num_components(intr->src[0]) > 1 ? 
+ spirv_builder_emit_composite_extract(&ctx->builder, src_base_type, value, &i, 1) : + value; + SpvId component_split; + if (is_64bit) + component_split = emit_bitcast(ctx, get_uvec_type(ctx, 32, 2), component); + for (unsigned j = 0; j < 1 + !!is_64bit; j++) { + if (j) + vec_offset = emit_binop(ctx, SpvOpIAdd, uint_type, vec_offset, one); + SpvId indices[] = { member, vec_offset }; + SpvId ptr = spirv_builder_emit_access_chain(&ctx->builder, pointer_type, + bo, indices, + ARRAY_SIZE(indices)); + if (is_64bit) + component = spirv_builder_emit_composite_extract(&ctx->builder, uint_type, component_split, &j, 1); + if (nir_intrinsic_access(intr) & ACCESS_COHERENT) + spirv_builder_emit_atomic_store(&ctx->builder, ptr, SpvScopeWorkgroup, 0, component); + else + spirv_builder_emit_store(&ctx->builder, ptr, component); + } + write_count++; + } else if (is_64bit) + /* we're doing 32bit stores here, so we need to increment correctly here */ + vec_offset = emit_binop(ctx, SpvOpIAdd, uint_type, vec_offset, one); + + /* increment to the next vec4 member index for the next store */ + vec_offset = emit_binop(ctx, SpvOpIAdd, uint_type, vec_offset, one); + } +} + static void emit_discard(struct ntv_context *ctx, nir_intrinsic_instr *intr) { @@ -1977,8 +2129,16 @@ emit_load_deref(struct ntv_context *ctx, nir_intrinsic_instr *intr) { SpvId ptr = get_src(ctx, intr->src); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + SpvId type; + if (glsl_type_is_image(deref->type)) { + nir_variable *var = nir_deref_instr_get_variable(deref); + type = get_image_type(ctx, var, glsl_type_is_sampler(glsl_without_array(var->type))); + } else { + type = get_glsl_type(ctx, deref->type); + } SpvId result = spirv_builder_emit_load(&ctx->builder, - get_glsl_type(ctx, nir_src_as_deref(intr->src[0])->type), + type, ptr); unsigned num_components = nir_dest_num_components(intr->dest); unsigned bit_size = nir_dest_bit_size(intr->dest); @@ -2313,7 +2473,12 @@ emit_ssbo_atomic_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) nir_const_value *const_block_index = nir_src_as_const_value(intr->src[0]); assert(const_block_index); // no dynamic indexing for now - ssbo = ctx->ssbos[const_block_index->u32]; + unsigned bit_size = MIN2(nir_src_bit_size(intr->src[0]), 32); + unsigned idx = bit_size >> 4; + assert(idx < ARRAY_SIZE(ctx->ssbos[0])); + if (!ctx->ssbos[const_block_index->u32][idx]) + emit_bo(ctx, ctx->ssbo_vars[const_block_index->u32], nir_dest_bit_size(intr->dest)); + ssbo = ctx->ssbos[const_block_index->u32][idx]; param = get_src(ctx, &intr->src[2]); SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, @@ -2321,7 +2486,7 @@ emit_ssbo_atomic_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) dest_type); SpvId uint_type = get_uvec_type(ctx, 32, 1); /* an id of the array stride in bytes */ - SpvId uint_size = emit_uint_const(ctx, 32, sizeof(uint32_t)); + SpvId uint_size = emit_uint_const(ctx, 32, bit_size / 8); SpvId member = emit_uint_const(ctx, 32, 0); SpvId offset = get_src(ctx, &intr->src[1]); SpvId vec_offset = emit_binop(ctx, SpvOpUDiv, uint_type, offset, uint_size); @@ -2359,6 +2524,32 @@ emit_shared_atomic_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) handle_atomic_op(ctx, intr, ptr, param, param2); } +static void +emit_get_ssbo_size(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + SpvId uint_type = get_uvec_type(ctx, 32, 1); + nir_const_value *const_block_index = nir_src_as_const_value(intr->src[0]); + assert(const_block_index); // no dynamic indexing for now 
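+   /* Worked example of the size calculation below, as a comment sketch;
+    * the SSBO layout here is hypothetical and only for illustration:
+    *
+    *    layout(std430) buffer { vec4 header; float data[]; };
+    *
+    * 'data' sits at byte offset 16 with a stride of 4. If OpArrayLength
+    * reports 10 elements, this function stores 10 * 4 + 16 = 56 bytes as
+    * the total buffer size, and NIR's lowering later recovers
+    * (56 - 16) / 4 = 10 elements, so the two conversions cancel out.
+    */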
+ nir_variable *var = ctx->ssbo_vars[const_block_index->u32]; + SpvId result = spirv_builder_emit_binop(&ctx->builder, SpvOpArrayLength, uint_type, + ctx->ssbos[const_block_index->u32][2], 1); + /* this is going to be converted by nir to: + + length = (buffer_size - offset) / stride + + * so we need to un-convert it to avoid having the calculation performed twice + */ + unsigned last_member_idx = glsl_get_length(var->interface_type) - 1; + const struct glsl_type *last_member = glsl_get_struct_field(var->interface_type, last_member_idx); + /* multiply by stride */ + result = emit_binop(ctx, SpvOpIMul, uint_type, result, emit_uint_const(ctx, 32, glsl_get_explicit_stride(last_member))); + /* get total ssbo size by adding offset */ + result = emit_binop(ctx, SpvOpIAdd, uint_type, result, + emit_uint_const(ctx, 32, + glsl_get_struct_field_offset(var->interface_type, last_member_idx))); + store_dest(ctx, &intr->dest, result, nir_type_uint); +} + static inline nir_variable * get_var_from_image(struct ntv_context *ctx, SpvId var_id) { @@ -2390,13 +2581,80 @@ get_image_coords(struct ntv_context *ctx, const struct glsl_type *type, nir_src } static void -emit_image_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) +emit_image_deref_store(struct ntv_context *ctx, nir_intrinsic_instr *intr) { SpvId img_var = get_src(ctx, &intr->src[0]); - SpvId sample = get_src(ctx, &intr->src[2]); - SpvId param = get_src(ctx, &intr->src[3]); - nir_variable *var = get_var_from_image(ctx, img_var); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = deref->deref_type == nir_deref_type_var ? deref->var : get_var_from_image(ctx, img_var); + SpvId img_type = var->data.bindless ? get_bare_image_type(ctx, var, false) : ctx->image_types[var->data.driver_location]; const struct glsl_type *type = glsl_without_array(var->type); + SpvId base_type = get_glsl_basetype(ctx, glsl_get_sampler_result_type(type)); + SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); + SpvId coord = get_image_coords(ctx, type, &intr->src[1]); + SpvId texel = get_src(ctx, &intr->src[3]); + SpvId sample = glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_MS ? get_src(ctx, &intr->src[2]) : 0; + assert(nir_src_bit_size(intr->src[3]) == glsl_base_type_bit_size(glsl_get_sampler_result_type(type))); + /* texel type must match image type */ + texel = emit_bitcast(ctx, + spirv_builder_type_vector(&ctx->builder, base_type, 4), + texel); + spirv_builder_emit_image_write(&ctx->builder, img, coord, texel, 0, sample, 0); +} + +static void +emit_image_deref_load(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + SpvId img_var = get_src(ctx, &intr->src[0]); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = deref->deref_type == nir_deref_type_var ? deref->var : get_var_from_image(ctx, img_var); + SpvId img_type = var->data.bindless ? get_bare_image_type(ctx, var, false) : ctx->image_types[var->data.driver_location]; + const struct glsl_type *type = glsl_without_array(var->type); + SpvId base_type = get_glsl_basetype(ctx, glsl_get_sampler_result_type(type)); + SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); + SpvId coord = get_image_coords(ctx, type, &intr->src[1]); + SpvId sample = glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_MS ? 
get_src(ctx, &intr->src[2]) : 0; + SpvId result = spirv_builder_emit_image_read(&ctx->builder, + spirv_builder_type_vector(&ctx->builder, base_type, nir_dest_num_components(intr->dest)), + img, coord, 0, sample, 0); + store_dest(ctx, &intr->dest, result, nir_type_float); +} + +static void +emit_image_deref_size(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + SpvId img_var = get_src(ctx, &intr->src[0]); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = deref->deref_type == nir_deref_type_var ? deref->var : get_var_from_image(ctx, img_var); + SpvId img_type = var->data.bindless ? get_bare_image_type(ctx, var, false) : ctx->image_types[var->data.driver_location]; + const struct glsl_type *type = glsl_without_array(var->type); + SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); + SpvId result = spirv_builder_emit_image_query_size(&ctx->builder, get_uvec_type(ctx, 32, glsl_get_sampler_coordinate_components(type)), img, 0); + store_dest(ctx, &intr->dest, result, nir_type_uint); +} + +static void +emit_image_deref_samples(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + SpvId img_var = get_src(ctx, &intr->src[0]); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = deref->deref_type == nir_deref_type_var ? deref->var : get_var_from_image(ctx, img_var); + SpvId img_type = var->data.bindless ? get_bare_image_type(ctx, var, false) : ctx->image_types[var->data.driver_location]; + SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); + SpvId result = spirv_builder_emit_unop(&ctx->builder, SpvOpImageQuerySamples, get_dest_type(ctx, &intr->dest, nir_type_uint), img); + store_dest(ctx, &intr->dest, result, nir_type_uint); +} + +static void +emit_image_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + SpvId param = get_src(ctx, &intr->src[3]); + SpvId img_var = get_src(ctx, &intr->src[0]); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = deref->deref_type == nir_deref_type_var ? deref->var : get_var_from_image(ctx, img_var); + const struct glsl_type *type = glsl_without_array(var->type); + bool is_ms; + type_to_dim(glsl_get_sampler_dim(type), &is_ms); + SpvId sample = is_ms ? 
get_src(ctx, &intr->src[2]) : emit_uint_const(ctx, 32, 0); SpvId coord = get_image_coords(ctx, type, &intr->src[1]); SpvId base_type = get_glsl_basetype(ctx, glsl_get_sampler_result_type(type)); SpvId texel = spirv_builder_emit_image_texel_pointer(&ctx->builder, base_type, img_var, coord, sample); @@ -2407,6 +2665,50 @@ emit_image_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) handle_atomic_op(ctx, intr, texel, param, param2); } +static void +emit_ballot(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + spirv_builder_emit_cap(&ctx->builder, SpvCapabilitySubgroupBallotKHR); + spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_ballot"); + SpvId type = get_dest_uvec_type(ctx, &intr->dest); + SpvId result = emit_unop(ctx, SpvOpSubgroupBallotKHR, type, get_src(ctx, &intr->src[0])); + store_dest(ctx, &intr->dest, result, nir_type_uint); +} + +static void +emit_read_first_invocation(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + spirv_builder_emit_cap(&ctx->builder, SpvCapabilitySubgroupBallotKHR); + spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_ballot"); + SpvId type = get_dest_type(ctx, &intr->dest, nir_type_uint); + SpvId result = emit_unop(ctx, SpvOpSubgroupFirstInvocationKHR, type, get_src(ctx, &intr->src[0])); + store_dest(ctx, &intr->dest, result, nir_type_uint); +} + +static void +emit_read_invocation(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + spirv_builder_emit_cap(&ctx->builder, SpvCapabilitySubgroupBallotKHR); + spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_ballot"); + SpvId type = get_dest_type(ctx, &intr->dest, nir_type_uint); + SpvId result = emit_binop(ctx, SpvOpSubgroupReadInvocationKHR, type, + get_src(ctx, &intr->src[0]), + get_src(ctx, &intr->src[1])); + store_dest(ctx, &intr->dest, result, nir_type_uint); +} + +static void +emit_shader_clock(struct ntv_context *ctx, nir_intrinsic_instr *intr) +{ + spirv_builder_emit_cap(&ctx->builder, SpvCapabilityShaderClockKHR); + spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_clock"); + + SpvScope scope = get_scope(nir_intrinsic_memory_scope(intr)); + SpvId type = get_dest_type(ctx, &intr->dest, nir_type_uint); + SpvId result = spirv_builder_emit_unop_const(&ctx->builder, SpvOpReadClockKHR, type, scope); + store_dest(ctx, &intr->dest, result, nir_type_uint); +} + static void emit_vote(struct ntv_context *ctx, nir_intrinsic_instr *intr) { @@ -2439,94 +2741,9 @@ emit_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) emit_load_bo(ctx, intr); break; - /* TODO: would be great to refactor this in with emit_load_bo() */ - case nir_intrinsic_store_ssbo: { - nir_const_value *const_block_index = nir_src_as_const_value(intr->src[1]); - assert(const_block_index); - - SpvId bo = ctx->ssbos[const_block_index->u32]; - - unsigned bit_size = nir_src_bit_size(intr->src[0]); - SpvId uint_type = get_uvec_type(ctx, 32, 1); - SpvId one = emit_uint_const(ctx, 32, 1); - - /* number of components being stored */ - unsigned wrmask = nir_intrinsic_write_mask(intr); - unsigned num_components = util_bitcount(wrmask); - - /* we need to grab 2x32 to fill the 64bit value */ - bool is_64bit = bit_size == 64; - - /* an id of an array member in bytes */ - SpvId uint_size = emit_uint_const(ctx, 32, sizeof(uint32_t)); - /* we grab a single array member at a time, so it's a pointer to a uint */ - SpvId pointer_type = spirv_builder_type_pointer(&ctx->builder, - SpvStorageClassStorageBuffer, - uint_type); - - /* our generated uniform has a memory layout like - * - * struct 
{ - * uint base[array_size]; - * }; - * - * where 'array_size' is set as though every member of the ubo takes up a vec4, - * even if it's only a vec2 or a float. - * - * first, access 'base' - */ - SpvId member = emit_uint_const(ctx, 32, 0); - /* this is the offset (in bytes) that we're accessing: - * it may be a const value or it may be dynamic in the shader - */ - SpvId offset = get_src(ctx, &intr->src[2]); - /* calculate byte offset */ - SpvId vec_offset = emit_binop(ctx, SpvOpUDiv, uint_type, offset, uint_size); - - SpvId value = get_src(ctx, &intr->src[0]); - /* OpAccessChain takes an array of indices that drill into a hierarchy based on the type: - * index 0 is accessing 'base' - * index 1 is accessing 'base[index 1]' - * index 2 is accessing 'base[index 1][index 2]' - * - * we must perform the access this way in case src[1] is dynamic because there's - * no other spirv method for using an id to access a member of a composite, as - * (composite|vector)_extract both take literals - */ - unsigned write_count = 0; - SpvId src_base_type = get_uvec_type(ctx, nir_src_bit_size(intr->src[0]), 1); - for (unsigned i = 0; write_count < num_components; i++) { - if (wrmask & (1 << i)) { - SpvId component = nir_src_num_components(intr->src[0]) > 1 ? - spirv_builder_emit_composite_extract(&ctx->builder, src_base_type, value, &i, 1) : - value; - SpvId component_split; - if (is_64bit) - component_split = emit_bitcast(ctx, get_uvec_type(ctx, 32, 2), component); - for (unsigned j = 0; j < 1 + !!is_64bit; j++) { - if (j) - vec_offset = emit_binop(ctx, SpvOpIAdd, uint_type, vec_offset, one); - SpvId indices[] = { member, vec_offset }; - SpvId ptr = spirv_builder_emit_access_chain(&ctx->builder, pointer_type, - bo, indices, - ARRAY_SIZE(indices)); - if (is_64bit) - component = spirv_builder_emit_composite_extract(&ctx->builder, uint_type, component_split, &j, 1); - if (nir_intrinsic_access(intr) & ACCESS_COHERENT) - spirv_builder_emit_atomic_store(&ctx->builder, ptr, SpvScopeWorkgroup, 0, component); - else - spirv_builder_emit_store(&ctx->builder, ptr, component); - } - write_count++; - } else if (is_64bit) - /* we're doing 32bit stores here, so we need to increment correctly here */ - vec_offset = emit_binop(ctx, SpvOpIAdd, uint_type, vec_offset, one); - - /* increment to the next vec4 member index for the next store */ - vec_offset = emit_binop(ctx, SpvOpIAdd, uint_type, vec_offset, one); - } + case nir_intrinsic_store_ssbo: + emit_store_ssbo(ctx, intr); break; - } case nir_intrinsic_discard: emit_discard(ctx, intr); break; @@ -2697,81 +2914,26 @@ emit_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) spirv_builder_emit_interlock(&ctx->builder, intr->intrinsic == nir_intrinsic_end_invocation_interlock); break; - case nir_intrinsic_get_ssbo_size: { - SpvId uint_type = get_uvec_type(ctx, 32, 1); - nir_variable *var = ctx->ssbo_vars[nir_src_as_const_value(intr->src[0])->u32]; - SpvId result = spirv_builder_emit_binop(&ctx->builder, SpvOpArrayLength, uint_type, - ctx->ssbos[nir_src_as_const_value(intr->src[0])->u32], 1); - /* this is going to be converted by nir to: + case nir_intrinsic_get_ssbo_size: + emit_get_ssbo_size(ctx, intr); + break; - length = (buffer_size - offset) / stride + case nir_intrinsic_image_deref_store: + emit_image_deref_store(ctx, intr); + break; - * so we need to un-convert it to avoid having the calculation performed twice - */ - unsigned last_member_idx = glsl_get_length(var->interface_type) - 1;
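/* A worked instance of the un-convert described above (the numbers here are
 * illustrative, not taken from the patch): suppose the SSBO block ends in a
 * runtime array whose explicit stride is 16 and whose field offset within the
 * block is 32. OpArrayLength hands back the element count N, and NIR will
 * later compute
 *
 *    length = (buffer_size - offset) / stride
 *
 * so the removed code reconstructed a plausible buffer_size for NIR to consume:
 *
 *    buffer_size = N * stride + offset = N * 16 + 32
 *
 * NIR's subsequent division then lands back on N instead of applying the
 * conversion twice.
 */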
- const struct glsl_type *last_member = glsl_get_struct_field(var->interface_type, last_member_idx); - /* multiply by stride */ - result = emit_binop(ctx, SpvOpIMul, uint_type, result, emit_uint_const(ctx, 32, glsl_get_explicit_stride(last_member))); - /* get total ssbo size by adding offset */ - result = emit_binop(ctx, SpvOpIAdd, uint_type, result, - emit_uint_const(ctx, 32, - glsl_get_struct_field_offset(var->interface_type, last_member_idx))); - store_dest(ctx, &intr->dest, result, nir_type_uint); + case nir_intrinsic_image_deref_load: + emit_image_deref_load(ctx, intr); break; - } - case nir_intrinsic_image_deref_store: { - SpvId img_var = get_src(ctx, &intr->src[0]); - nir_variable *var = get_var_from_image(ctx, img_var); - SpvId img_type = ctx->image_types[var->data.driver_location]; - const struct glsl_type *type = glsl_without_array(var->type); - SpvId base_type = get_glsl_basetype(ctx, glsl_get_sampler_result_type(type)); - SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); - SpvId coord = get_image_coords(ctx, type, &intr->src[1]); - SpvId texel = get_src(ctx, &intr->src[3]); - SpvId sample = glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_MS ? get_src(ctx, &intr->src[2]) : 0; - assert(nir_src_bit_size(intr->src[3]) == glsl_base_type_bit_size(glsl_get_sampler_result_type(type))); - /* texel type must match image type */ - texel = emit_bitcast(ctx, - spirv_builder_type_vector(&ctx->builder, base_type, 4), - texel); - spirv_builder_emit_image_write(&ctx->builder, img, coord, texel, 0, sample, 0); + case nir_intrinsic_image_deref_size: + emit_image_deref_size(ctx, intr); break; - } - case nir_intrinsic_image_deref_load: { - SpvId img_var = get_src(ctx, &intr->src[0]); - nir_variable *var = get_var_from_image(ctx, img_var); - SpvId img_type = ctx->image_types[var->data.driver_location]; - const struct glsl_type *type = glsl_without_array(var->type); - SpvId base_type = get_glsl_basetype(ctx, glsl_get_sampler_result_type(type)); - SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); - SpvId coord = get_image_coords(ctx, type, &intr->src[1]); - SpvId sample = glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_MS ?
get_src(ctx, &intr->src[2]) : 0; - SpvId result = spirv_builder_emit_image_read(&ctx->builder, - spirv_builder_type_vector(&ctx->builder, base_type, nir_dest_num_components(intr->dest)), - img, coord, 0, sample, 0); - store_dest(ctx, &intr->dest, result, nir_type_float); + + case nir_intrinsic_image_deref_samples: + emit_image_deref_samples(ctx, intr); break; - } - case nir_intrinsic_image_deref_size: { - SpvId img_var = get_src(ctx, &intr->src[0]); - nir_variable *var = get_var_from_image(ctx, img_var); - SpvId img_type = ctx->image_types[var->data.driver_location]; - const struct glsl_type *type = glsl_without_array(var->type); - SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); - SpvId result = spirv_builder_emit_image_query_size(&ctx->builder, get_uvec_type(ctx, 32, glsl_get_sampler_coordinate_components(type)), img, 0); - store_dest(ctx, &intr->dest, result, nir_type_uint); - break; - } - case nir_intrinsic_image_deref_samples: { - SpvId img_var = get_src(ctx, &intr->src[0]); - nir_variable *var = get_var_from_image(ctx, img_var); - SpvId img_type = ctx->image_types[var->data.driver_location]; - SpvId img = spirv_builder_emit_load(&ctx->builder, img_type, img_var); - SpvId result = spirv_builder_emit_unop(&ctx->builder, SpvOpImageQuerySamples, get_dest_type(ctx, &intr->dest, nir_type_uint), img); - store_dest(ctx, &intr->dest, result, nir_type_uint); - break; - } + case nir_intrinsic_image_deref_atomic_add: case nir_intrinsic_image_deref_atomic_umin: case nir_intrinsic_image_deref_atomic_imin: @@ -2818,40 +2980,22 @@ emit_intrinsic(struct ntv_context *ctx, nir_intrinsic_instr *intr) LOAD_SHADER_BALLOT(subgroup_lt_mask, SubgroupLtMask); LOAD_SHADER_BALLOT(subgroup_size, SubgroupSize); - case nir_intrinsic_ballot: { - spirv_builder_emit_cap(&ctx->builder, SpvCapabilitySubgroupBallotKHR); - spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_ballot"); - SpvId type = get_dest_uvec_type(ctx, &intr->dest); - SpvId result = emit_unop(ctx, SpvOpSubgroupBallotKHR, type, get_src(ctx, &intr->src[0])); - store_dest(ctx, &intr->dest, result, nir_type_uint); + case nir_intrinsic_ballot: + emit_ballot(ctx, intr); break; - } - case nir_intrinsic_read_first_invocation: { - spirv_builder_emit_cap(&ctx->builder, SpvCapabilitySubgroupBallotKHR); - spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_ballot"); - SpvId type = get_dest_type(ctx, &intr->dest, nir_type_uint); - SpvId result = emit_unop(ctx, SpvOpSubgroupFirstInvocationKHR, type, get_src(ctx, &intr->src[0])); - store_dest(ctx, &intr->dest, result, nir_type_uint); + case nir_intrinsic_read_first_invocation: + emit_read_first_invocation(ctx, intr); break; - } - case nir_intrinsic_read_invocation: { - spirv_builder_emit_cap(&ctx->builder, SpvCapabilitySubgroupBallotKHR); - spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_ballot"); - SpvId type = get_dest_type(ctx, &intr->dest, nir_type_uint); - SpvId result = emit_binop(ctx, SpvOpSubgroupReadInvocationKHR, type, - get_src(ctx, &intr->src[0]), - get_src(ctx, &intr->src[1])); - store_dest(ctx, &intr->dest, result, nir_type_uint); + case nir_intrinsic_read_invocation: + emit_read_invocation(ctx, intr); break; - } - case nir_intrinsic_load_workgroup_size: { + case nir_intrinsic_load_workgroup_size: assert(ctx->local_group_size_var); store_dest(ctx, &intr->dest, ctx->local_group_size_var, nir_type_uint); break; - } case nir_intrinsic_load_shared: emit_load_shared(ctx, intr); @@ -2861,16 +3005,9 @@ emit_intrinsic(struct ntv_context *ctx, 
nir_intrinsic_instr *intr) emit_store_shared(ctx, intr); break; - case nir_intrinsic_shader_clock: { - spirv_builder_emit_cap(&ctx->builder, SpvCapabilityShaderClockKHR); - spirv_builder_emit_extension(&ctx->builder, "SPV_KHR_shader_clock"); - - SpvScope scope = get_scope(nir_intrinsic_memory_scope(intr)); - SpvId type = get_dest_type(ctx, &intr->dest, nir_type_uint); - SpvId result = spirv_builder_emit_unop_const(&ctx->builder, SpvOpReadClockKHR, type, scope); - store_dest(ctx, &intr->dest, result, nir_type_uint); + case nir_intrinsic_shader_clock: + emit_shader_clock(ctx, intr); break; - } case nir_intrinsic_vote_all: case nir_intrinsic_vote_any: @@ -2945,9 +3082,11 @@ emit_tex(struct ntv_context *ctx, nir_tex_instr *tex) assert(tex->texture_index == tex->sampler_index); SpvId coord = 0, proj = 0, bias = 0, lod = 0, dref = 0, dx = 0, dy = 0, - const_offset = 0, offset = 0, sample = 0, tex_offset = 0; + const_offset = 0, offset = 0, sample = 0, tex_offset = 0, bindless = 0; unsigned coord_components = 0; + nir_variable *bindless_var = NULL; for (unsigned i = 0; i < tex->num_srcs; i++) { + nir_const_value *cv; switch (tex->src[i].src_type) { case nir_tex_src_coord: if (tex->op == nir_texop_txf || @@ -2965,14 +3104,14 @@ emit_tex(struct ntv_context *ctx, nir_tex_instr *tex) break; case nir_tex_src_offset: - if (nir_src_is_const(tex->src[i].src)) { - nir_const_value *v = nir_src_as_const_value(tex->src[i].src); + cv = nir_src_as_const_value(tex->src[i].src); + if (cv) { unsigned bit_size = nir_src_bit_size(tex->src[i].src); unsigned num_components = nir_src_num_components(tex->src[i].src); SpvId components[NIR_MAX_VEC_COMPONENTS]; for (int i = 0; i < num_components; ++i) { - int64_t tmp = nir_const_value_as_int(v[i], bit_size); + int64_t tmp = nir_const_value_as_int(cv[i], bit_size); components[i] = emit_int_const(ctx, bit_size, tmp); } @@ -3031,9 +3170,15 @@ emit_tex(struct ntv_context *ctx, nir_tex_instr *tex) break; case nir_tex_src_sampler_offset: + case nir_tex_src_sampler_handle: /* don't care */ break; + case nir_tex_src_texture_handle: + bindless = get_src(ctx, &tex->src[i].src); + bindless_var = nir_deref_instr_get_variable(nir_src_as_deref(tex->src[i].src)); + break; + default: fprintf(stderr, "texture source: %d\n", tex->src[i].src_type); unreachable("unknown texture source"); @@ -3057,13 +3202,13 @@ emit_tex(struct ntv_context *ctx, nir_tex_instr *tex) } } } - SpvId image_type = ctx->sampler_types[texture_index]; + SpvId image_type = bindless ? get_bare_image_type(ctx, bindless_var, true) : ctx->sampler_types[texture_index]; assert(image_type); SpvId sampled_type = spirv_builder_type_sampled_image(&ctx->builder, image_type); assert(sampled_type); - assert(ctx->samplers_used & (1u << texture_index)); - SpvId sampler_id = ctx->samplers[texture_index]; + assert(bindless || ctx->samplers_used & (1u << texture_index)); + SpvId sampler_id = bindless ? 
bindless : ctx->samplers[texture_index]; if (tex_offset) { SpvId ptr = spirv_builder_type_pointer(&ctx->builder, SpvStorageClassUniformConstant, sampled_type); sampler_id = spirv_builder_emit_access_chain(&ctx->builder, ptr, sampler_id, &tex_offset, 1); @@ -3271,12 +3416,12 @@ emit_deref_array(struct ntv_context *ctx, nir_deref_instr *deref) type = get_glsl_type(ctx, deref->type); break; - case nir_var_uniform: { - assert(glsl_type_is_image(glsl_without_array(var->type))); + case nir_var_uniform: + case nir_var_image: { struct hash_entry *he = _mesa_hash_table_search(ctx->vars, var); assert(he); base = (SpvId)(intptr_t)he->data; - type = ctx->image_types[var->data.driver_location]; + type = get_image_type(ctx, var, glsl_type_is_sampler(glsl_without_array(var->type))); break; } @@ -3703,6 +3848,8 @@ nir_to_spirv(struct nir_shader *s, const struct zink_so_info *so_info, uint32_t spirv_builder_emit_cap(&ctx.builder, SpvCapabilityImageQuery); } + if (s->info.bit_sizes_int & 8) + spirv_builder_emit_cap(&ctx.builder, SpvCapabilityInt8); if (s->info.bit_sizes_int & 16) spirv_builder_emit_cap(&ctx.builder, SpvCapabilityInt16); if (s->info.bit_sizes_int & 64) @@ -3800,6 +3947,7 @@ nir_to_spirv(struct nir_shader *s, const struct zink_so_info *so_info, uint32_t /* we have to reverse iterate to match what's done in zink_compiler.c */ foreach_list_typed_reverse(nir_variable, var, node, &s->variables) if (_nir_shader_variable_has_mode(var, nir_var_uniform | + nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo)) emit_uniform(&ctx, var); diff --git a/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c b/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c index 878783fd3f..339af44f4c 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c +++ b/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.c @@ -215,6 +215,13 @@ spirv_builder_emit_decoration(struct spirv_builder *b, SpvId target, emit_decoration(b, target, decoration, NULL, 0); } +void +spirv_builder_emit_input_attachment_index(struct spirv_builder *b, SpvId target, uint32_t id) +{ + uint32_t args[] = { id }; + emit_decoration(b, target, SpvDecorationInputAttachmentIndex, args, ARRAY_SIZE(args)); +} + void spirv_builder_emit_specid(struct spirv_builder *b, SpvId target, uint32_t id) { @@ -1420,7 +1427,7 @@ spirv_builder_const_int(struct spirv_builder *b, int width, int64_t val) SpvId spirv_builder_const_uint(struct spirv_builder *b, int width, uint64_t val) { - assert(width >= 16); + assert(width >= 8); SpvId type = spirv_builder_type_uint(b, width); if (width <= 32) return emit_constant_32(b, type, val); diff --git a/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h b/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h index 6ad3064ced..d18c101b39 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h +++ b/mesa 3D driver/src/gallium/drivers/zink/nir_to_spirv/spirv_builder.h @@ -89,6 +89,9 @@ void spirv_builder_emit_decoration(struct spirv_builder *b, SpvId target, SpvDecoration decoration); +void +spirv_builder_emit_input_attachment_index(struct spirv_builder *b, SpvId target, uint32_t id); + void spirv_builder_emit_specid(struct spirv_builder *b, SpvId target, uint32_t id); diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_batch.c b/mesa 3D driver/src/gallium/drivers/zink/zink_batch.c index 17aecc73ee..33e4ad1792 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_batch.c +++ b/mesa 3D 
driver/src/gallium/drivers/zink/zink_batch.c @@ -30,19 +30,32 @@ zink_reset_batch_state(struct zink_context *ctx, struct zink_batch_state *bs) { struct zink_screen *screen = zink_screen(ctx->base.screen); - if (vkResetCommandPool(screen->dev, bs->cmdpool, 0) != VK_SUCCESS) + if (VKSCR(ResetCommandPool)(screen->dev, bs->cmdpool, 0) != VK_SUCCESS) debug_printf("vkResetCommandPool failed\n"); /* unref all used resources */ set_foreach_remove(bs->resources, entry) { struct zink_resource_object *obj = (struct zink_resource_object *)entry->key; - zink_resource_object_usage_unset(obj, bs); - zink_resource_object_reference(screen, &obj, NULL); + if (!zink_resource_object_usage_unset(obj, bs)) { + obj->unordered_barrier = false; + obj->access = 0; + obj->access_stage = 0; + } + util_dynarray_append(&bs->unref_resources, struct zink_resource_object*, obj); + } + + for (unsigned i = 0; i < 2; i++) { + while (util_dynarray_contains(&bs->bindless_releases[i], uint32_t)) { + uint32_t handle = util_dynarray_pop(&bs->bindless_releases[i], uint32_t); + bool is_buffer = ZINK_BINDLESS_IS_BUFFER(handle); + struct util_idalloc *ids = i ? &ctx->di.bindless[is_buffer].img_slots : &ctx->di.bindless[is_buffer].tex_slots; + util_idalloc_free(ids, is_buffer ? handle - ZINK_MAX_BINDLESS_HANDLES : handle); + } } set_foreach_remove(bs->active_queries, entry) { struct zink_query *query = (void*)entry->key; - zink_prune_query(screen, query); + zink_prune_query(screen, bs, query); } set_foreach_remove(bs->surfaces, entry) { @@ -56,8 +69,12 @@ zink_reset_batch_state(struct zink_context *ctx, struct zink_batch_state *bs) zink_buffer_view_reference(screen, &buffer_view, NULL); } + util_dynarray_foreach(&bs->dead_framebuffers, struct zink_framebuffer*, fb) { + zink_framebuffer_reference(screen, fb, NULL); + } + util_dynarray_clear(&bs->dead_framebuffers); util_dynarray_foreach(&bs->zombie_samplers, VkSampler, samp) { - vkDestroySampler(screen->dev, *samp, NULL); + VKSCR(DestroySampler)(screen->dev, *samp, NULL); } util_dynarray_clear(&bs->zombie_samplers); util_dynarray_clear(&bs->persistent_resources); @@ -69,19 +86,13 @@ zink_reset_batch_state(struct zink_context *ctx, struct zink_batch_state *bs) zink_batch_usage_unset(&pg->batch_uses, bs); if (pg->is_compute) { struct zink_compute_program *comp = (struct zink_compute_program*)pg; - zink_compute_program_reference(screen, &comp, NULL); + zink_compute_program_reference(ctx, &comp, NULL); } else { struct zink_gfx_program *prog = (struct zink_gfx_program*)pg; - zink_gfx_program_reference(screen, &prog, NULL); + zink_gfx_program_reference(ctx, &prog, NULL); } } - set_foreach(bs->fbs, entry) { - struct zink_framebuffer *fb = (void*)entry->key; - zink_framebuffer_reference(screen, &fb, NULL); - _mesa_set_remove(bs->fbs, entry); - } - pipe_resource_reference(&bs->flush_res, NULL); bs->resource_size = 0; @@ -97,6 +108,16 @@ zink_reset_batch_state(struct zink_context *ctx, struct zink_batch_state *bs) bs->submit_count++; bs->fence.batch_id = 0; bs->usage.usage = 0; + bs->next = NULL; +} + +static void +unref_resources(struct zink_screen *screen, struct zink_batch_state *bs) +{ + while (util_dynarray_contains(&bs->unref_resources, struct zink_resource_object*)) { + struct zink_resource_object *obj = util_dynarray_pop(&bs->unref_resources, struct zink_resource_object*); + zink_resource_object_reference(screen, &obj, NULL); + } } void @@ -104,17 +125,28 @@ zink_clear_batch_state(struct zink_context *ctx, struct zink_batch_state *bs) { bs->fence.completed = true; 
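/* Hedged sketch of the teardown split introduced above: zink_reset_batch_state()
 * no longer drops resource refs inline; it only queues the objects onto
 * bs->unref_resources, and unref_resources() performs the actual
 * zink_resource_object_reference(screen, &obj, NULL) drops -- called just below
 * after the reset, and again at the end of submit_queue() on the flush thread.
 * The bindless-release loop assumes a handle encoding where buffer handles sit
 * above ZINK_MAX_BINDLESS_HANDLES:
 *
 *    bool is_buffer = ZINK_BINDLESS_IS_BUFFER(handle);
 *    uint32_t slot  = is_buffer ? handle - ZINK_MAX_BINDLESS_HANDLES : handle;
 *
 * so each freed handle returns to the matching util_idalloc pool (i == 0 for
 * texture slots, i == 1 for image slots).
 */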
zink_reset_batch_state(ctx, bs); + unref_resources(zink_screen(ctx->base.screen), bs); +} + +static void +pop_batch_state(struct zink_context *ctx) +{ + const struct zink_batch_state *bs = ctx->batch_states; + ctx->batch_states = bs->next; + ctx->batch_states_count--; + if (ctx->last_fence == &bs->fence) + ctx->last_fence = NULL; } void zink_batch_reset_all(struct zink_context *ctx) { simple_mtx_lock(&ctx->batch_mtx); - hash_table_foreach(&ctx->batch_states, entry) { - struct zink_batch_state *bs = entry->data; + while (ctx->batch_states) { + struct zink_batch_state *bs = ctx->batch_states; bs->fence.completed = true; + pop_batch_state(ctx); zink_reset_batch_state(ctx, bs); - _mesa_hash_table_remove(&ctx->batch_states, entry); util_dynarray_append(&ctx->free_batch_states, struct zink_batch_state *, bs); } simple_mtx_unlock(&ctx->batch_mtx); @@ -132,17 +164,20 @@ zink_batch_state_destroy(struct zink_screen *screen, struct zink_batch_state *bs mtx_destroy(&bs->usage.mtx); if (bs->fence.fence) - vkDestroyFence(screen->dev, bs->fence.fence, NULL); + VKSCR(DestroyFence)(screen->dev, bs->fence.fence, NULL); if (bs->cmdbuf) - vkFreeCommandBuffers(screen->dev, bs->cmdpool, 1, &bs->cmdbuf); + VKSCR(FreeCommandBuffers)(screen->dev, bs->cmdpool, 1, &bs->cmdbuf); if (bs->barrier_cmdbuf) - vkFreeCommandBuffers(screen->dev, bs->cmdpool, 1, &bs->barrier_cmdbuf); + VKSCR(FreeCommandBuffers)(screen->dev, bs->cmdpool, 1, &bs->barrier_cmdbuf); if (bs->cmdpool) - vkDestroyCommandPool(screen->dev, bs->cmdpool, NULL); + VKSCR(DestroyCommandPool)(screen->dev, bs->cmdpool, NULL); - _mesa_set_destroy(bs->fbs, NULL); util_dynarray_fini(&bs->zombie_samplers); + util_dynarray_fini(&bs->dead_framebuffers); + util_dynarray_fini(&bs->unref_resources); + util_dynarray_fini(&bs->bindless_releases[0]); + util_dynarray_fini(&bs->bindless_releases[1]); _mesa_set_destroy(bs->surfaces, NULL); _mesa_set_destroy(bs->bufferviews, NULL); _mesa_set_destroy(bs->programs, NULL); @@ -160,8 +195,7 @@ create_batch_state(struct zink_context *ctx) VkCommandPoolCreateInfo cpci = {0}; cpci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; cpci.queueFamilyIndex = screen->gfx_queue; - cpci.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; - if (vkCreateCommandPool(screen->dev, &cpci, NULL, &bs->cmdpool) != VK_SUCCESS) + if (VKSCR(CreateCommandPool)(screen->dev, &cpci, NULL, &bs->cmdpool) != VK_SUCCESS) goto fail; VkCommandBufferAllocateInfo cbai = {0}; @@ -170,10 +204,10 @@ create_batch_state(struct zink_context *ctx) cbai.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; cbai.commandBufferCount = 1; - if (vkAllocateCommandBuffers(screen->dev, &cbai, &bs->cmdbuf) != VK_SUCCESS) + if (VKSCR(AllocateCommandBuffers)(screen->dev, &cbai, &bs->cmdbuf) != VK_SUCCESS) goto fail; - if (vkAllocateCommandBuffers(screen->dev, &cbai, &bs->barrier_cmdbuf) != VK_SUCCESS) + if (VKSCR(AllocateCommandBuffers)(screen->dev, &cbai, &bs->barrier_cmdbuf) != VK_SUCCESS) goto fail; #define SET_CREATE_OR_FAIL(ptr) \ @@ -182,16 +216,18 @@ create_batch_state(struct zink_context *ctx) goto fail bs->ctx = ctx; - pipe_reference_init(&bs->reference, 1); - SET_CREATE_OR_FAIL(bs->fbs); SET_CREATE_OR_FAIL(bs->resources); SET_CREATE_OR_FAIL(bs->surfaces); SET_CREATE_OR_FAIL(bs->bufferviews); SET_CREATE_OR_FAIL(bs->programs); SET_CREATE_OR_FAIL(bs->active_queries); util_dynarray_init(&bs->zombie_samplers, NULL); + util_dynarray_init(&bs->dead_framebuffers, NULL); util_dynarray_init(&bs->persistent_resources, NULL); + util_dynarray_init(&bs->unref_resources, NULL); + 
util_dynarray_init(&bs->bindless_releases[0], NULL); + util_dynarray_init(&bs->bindless_releases[1], NULL); cnd_init(&bs->usage.flush); mtx_init(&bs->usage.mtx, mtx_plain); @@ -202,7 +238,7 @@ create_batch_state(struct zink_context *ctx) VkFenceCreateInfo fci = {0}; fci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - if (vkCreateFence(screen->dev, &fci, NULL, &bs->fence.fence) != VK_SUCCESS) + if (VKSCR(CreateFence)(screen->dev, &fci, NULL, &bs->fence.fence) != VK_SUCCESS) goto fail; util_queue_fence_init(&bs->flush_completed); @@ -214,9 +250,9 @@ create_batch_state(struct zink_context *ctx) } static inline bool -find_unused_state(struct hash_entry *entry) +find_unused_state(struct zink_batch_state *bs) { - struct zink_fence *fence = entry->data; + struct zink_fence *fence = &bs->fence; /* we can't reset these from fence_finish because threads */ bool completed = p_atomic_read(&fence->completed); bool submitted = p_atomic_read(&fence->submitted); @@ -226,26 +262,25 @@ find_unused_state(struct hash_entry *entry) static struct zink_batch_state * get_batch_state(struct zink_context *ctx, struct zink_batch *batch) { + struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_batch_state *bs = NULL; simple_mtx_lock(&ctx->batch_mtx); if (util_dynarray_num_elements(&ctx->free_batch_states, struct zink_batch_state*)) bs = util_dynarray_pop(&ctx->free_batch_states, struct zink_batch_state*); - if (!bs) { - hash_table_foreach(&ctx->batch_states, he) { - struct zink_fence *fence = he->data; - if (zink_screen_check_last_finished(zink_screen(ctx->base.screen), fence->batch_id) || find_unused_state(he)) { - bs = he->data; - _mesa_hash_table_remove(&ctx->batch_states, he); - break; - } + if (!bs && ctx->batch_states) { + /* states are stored sequentially, so if the first one doesn't work, none of them will */ + if (zink_screen_check_last_finished(screen, ctx->batch_states->fence.batch_id) || + find_unused_state(ctx->batch_states)) { + bs = ctx->batch_states; + pop_batch_state(ctx); } } simple_mtx_unlock(&ctx->batch_mtx); if (bs) { if (bs->fence.submitted && !bs->fence.completed) /* this fence is already done, so we need vulkan to release the cmdbuf */ - zink_vkfence_wait(zink_screen(ctx->base.screen), &bs->fence, PIPE_TIMEOUT_INFINITE); + zink_vkfence_wait(screen, &bs->fence, PIPE_TIMEOUT_INFINITE); zink_reset_batch_state(ctx, bs); } else { if (!batch->state) { @@ -263,15 +298,6 @@ get_batch_state(struct zink_context *ctx, struct zink_batch *batch) void zink_reset_batch(struct zink_context *ctx, struct zink_batch *batch) { - struct zink_screen *screen = zink_screen(ctx->base.screen); - - if (ctx->have_timelines && screen->last_finished > ctx->curr_batch && ctx->curr_batch == 1) { - if (!zink_screen_init_semaphore(screen)) { - debug_printf("timeline init failed, things are about to go dramatically wrong."); - ctx->have_timelines = false; - } - } - batch->state = get_batch_state(ctx, batch); assert(batch->state); @@ -288,17 +314,17 @@ zink_start_batch(struct zink_context *ctx, struct zink_batch *batch) VkCommandBufferBeginInfo cbbi = {0}; cbbi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; cbbi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - if (vkBeginCommandBuffer(batch->state->cmdbuf, &cbbi) != VK_SUCCESS) + if (VKCTX(BeginCommandBuffer)(batch->state->cmdbuf, &cbbi) != VK_SUCCESS) debug_printf("vkBeginCommandBuffer failed\n"); - if (vkBeginCommandBuffer(batch->state->barrier_cmdbuf, &cbbi) != VK_SUCCESS) + if (VKCTX(BeginCommandBuffer)(batch->state->barrier_cmdbuf, &cbbi) != 
VK_SUCCESS) debug_printf("vkBeginCommandBuffer failed\n"); - batch->state->fence.batch_id = ctx->curr_batch; batch->state->fence.completed = false; if (ctx->last_fence) { struct zink_batch_state *last_state = zink_batch_state(ctx->last_fence); batch->last_batch_usage = &last_state->usage; } + if (!ctx->queries_disabled) zink_resume_queries(ctx, batch); } @@ -307,11 +333,14 @@ static void post_submit(void *data, void *gdata, int thread_index) { struct zink_batch_state *bs = data; + struct zink_screen *screen = zink_screen(bs->ctx->base.screen); if (bs->is_device_lost) { if (bs->ctx->reset.reset) bs->ctx->reset.reset(bs->ctx->reset.data, PIPE_GUILTY_CONTEXT_RESET); - zink_screen(bs->ctx->base.screen)->device_lost = true; + screen->device_lost = true; + } else if (bs->ctx->batch_states_count > 5000) { + zink_screen_batch_id_wait(screen, bs->fence.batch_id - 2500, PIPE_TIMEOUT_INFINITE); } } @@ -323,15 +352,19 @@ submit_queue(void *data, void *gdata, int thread_index) struct zink_screen *screen = zink_screen(ctx->base.screen); VkSubmitInfo si = {0}; - simple_mtx_lock(&ctx->batch_mtx); while (!bs->fence.batch_id) bs->fence.batch_id = p_atomic_inc_return(&screen->curr_batch); - _mesa_hash_table_insert_pre_hashed(&ctx->batch_states, bs->fence.batch_id, (void*)(uintptr_t)bs->fence.batch_id, bs); bs->usage.usage = bs->fence.batch_id; bs->usage.unflushed = false; - simple_mtx_unlock(&ctx->batch_mtx); - vkResetFences(screen->dev, 1, &bs->fence.fence); + if (ctx->have_timelines && screen->last_finished > bs->fence.batch_id && bs->fence.batch_id == 1) { + if (!zink_screen_init_semaphore(screen)) { + debug_printf("timeline init failed, things are about to go dramatically wrong."); + ctx->have_timelines = false; + } + } + + VKSCR(ResetFences)(screen->dev, 1, &bs->fence.fence); uint64_t batch_id = bs->fence.batch_id; si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; @@ -368,12 +401,12 @@ submit_queue(void *data, void *gdata, int thread_index) si.pNext = &mem_signal; } - if (vkEndCommandBuffer(bs->cmdbuf) != VK_SUCCESS) { + if (VKSCR(EndCommandBuffer)(bs->cmdbuf) != VK_SUCCESS) { debug_printf("vkEndCommandBuffer failed\n"); bs->is_device_lost = true; goto end; } - if (vkEndCommandBuffer(bs->barrier_cmdbuf) != VK_SUCCESS) { + if (VKSCR(EndCommandBuffer)(bs->barrier_cmdbuf) != VK_SUCCESS) { debug_printf("vkEndCommandBuffer failed\n"); bs->is_device_lost = true; goto end; @@ -382,10 +415,10 @@ submit_queue(void *data, void *gdata, int thread_index) while (util_dynarray_contains(&bs->persistent_resources, struct zink_resource_object*)) { struct zink_resource_object *obj = util_dynarray_pop(&bs->persistent_resources, struct zink_resource_object*); VkMappedMemoryRange range = zink_resource_init_mem_range(screen, obj, 0, obj->size); - vkFlushMappedMemoryRanges(screen->dev, 1, &range); + VKSCR(FlushMappedMemoryRanges)(screen->dev, 1, &range); } - if (vkQueueSubmit(bs->queue, 1, &si, bs->fence.fence) != VK_SUCCESS) { + if (VKSCR(QueueSubmit)(bs->queue, 1, &si, bs->fence.fence) != VK_SUCCESS) { debug_printf("ZINK: vkQueueSubmit() failed\n"); bs->is_device_lost = true; } @@ -394,6 +427,7 @@ submit_queue(void *data, void *gdata, int thread_index) cnd_broadcast(&bs->usage.flush); p_atomic_set(&bs->fence.submitted, true); + unref_resources(screen, bs); } @@ -403,6 +437,7 @@ copy_scanout(struct zink_batch_state *bs, struct zink_resource *res) { if (!bs->scanout_flush) return; + struct zink_context *ctx = bs->ctx; VkImageCopy region = {0}; struct pipe_box box = {0, 0, 0, @@ -475,9 +510,9 @@ copy_scanout(struct zink_batch_state 
*bs, struct zink_resource *res) VkImageMemoryBarrier imb1; zink_resource_image_barrier_init(&imb1, res, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_ACCESS_TRANSFER_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); - vkCmdPipelineBarrier( + VKCTX(CmdPipelineBarrier)( bs->cmdbuf, - res->access_stage ? res->access_stage : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + res->obj->access_stage ? res->obj->access_stage : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, NULL, @@ -502,7 +537,7 @@ copy_scanout(struct zink_batch_state *bs, struct zink_resource *res) res->scanout_obj->image, isr }; - vkCmdPipelineBarrier( + VKCTX(CmdPipelineBarrier)( bs->cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, @@ -512,14 +547,14 @@ copy_scanout(struct zink_batch_state *bs, struct zink_resource *res) 1, &imb ); - vkCmdCopyImage(bs->cmdbuf, res->obj->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VKCTX(CmdCopyImage)(bs->cmdbuf, res->obj->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, res->scanout_obj->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region); imb.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; imb.dstAccessMask = 0; imb.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; imb.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - vkCmdPipelineBarrier( + VKCTX(CmdPipelineBarrier)( bs->cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, @@ -543,36 +578,52 @@ zink_end_batch(struct zink_context *ctx, struct zink_batch *batch) tc_driver_internal_flush_notify(ctx->tc); struct zink_screen *screen = zink_screen(ctx->base.screen); + struct zink_batch_state *bs; - ctx->last_fence = &batch->state->fence; - if (ctx->oom_flush || _mesa_hash_table_num_entries(&ctx->batch_states) > 10) { - simple_mtx_lock(&ctx->batch_mtx); - hash_table_foreach(&ctx->batch_states, he) { - struct zink_fence *fence = he->data; - struct zink_batch_state *bs = he->data; - if (zink_check_batch_completion(ctx, fence->batch_id, true)) { - zink_reset_batch_state(ctx, he->data); - _mesa_hash_table_remove(&ctx->batch_states, he); - util_dynarray_append(&ctx->free_batch_states, struct zink_batch_state *, bs); - } + simple_mtx_lock(&ctx->batch_mtx); + if (ctx->oom_flush || ctx->batch_states_count > 10) { + assert(!ctx->batch_states_count || ctx->batch_states); + while (ctx->batch_states) { + bs = ctx->batch_states; + struct zink_fence *fence = &bs->fence; + /* once an incomplete state is reached, no more will be complete */ + if (!zink_check_batch_completion(ctx, fence->batch_id, true)) + break; + + if (bs->fence.submitted && !bs->fence.completed) + /* this fence is already done, so we need vulkan to release the cmdbuf */ + zink_vkfence_wait(screen, &bs->fence, PIPE_TIMEOUT_INFINITE); + pop_batch_state(ctx); + zink_reset_batch_state(ctx, bs); + util_dynarray_append(&ctx->free_batch_states, struct zink_batch_state *, bs); } - simple_mtx_unlock(&ctx->batch_mtx); - if (_mesa_hash_table_num_entries(&ctx->batch_states) > 50) + if (ctx->batch_states_count > 50) ctx->oom_flush = true; } + + bs = batch->state; + if (ctx->last_fence) + zink_batch_state(ctx->last_fence)->next = bs; + else { + assert(!ctx->batch_states); + ctx->batch_states = bs; + } + ctx->last_fence = &bs->fence; + ctx->batch_states_count++; + simple_mtx_unlock(&ctx->batch_mtx); batch->work_count = 0; if (screen->device_lost) return;
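/* The list manipulation above replaces the old batch-state hash table with a
 * singly linked FIFO; a sketch of the assumed invariant:
 *
 *    ctx->batch_states -> bs->next -> ... -> zink_batch_state(ctx->last_fence)
 *    (oldest submitted)                      (newest)
 *
 * fence.batch_id increases monotonically along the chain, which is why the
 * pruning loop can break at the first state that hasn't completed: nothing
 * linked after it was submitted earlier, so nothing after it can have
 * finished earlier.
 */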
if (screen->threaded) { - batch->state->queue = screen->thread_queue; - util_queue_add_job(&screen->flush_queue, batch->state, &batch->state->flush_completed, + bs->queue = screen->thread_queue; + util_queue_add_job(&screen->flush_queue, bs, &bs->flush_completed, submit_queue, post_submit, 0); } else { - batch->state->queue = screen->queue; - submit_queue(batch->state, NULL, 0); - post_submit(batch->state, NULL, 0); + bs->queue = screen->queue; + submit_queue(bs, NULL, 0); + post_submit(bs, NULL, 0); } } @@ -592,11 +643,12 @@ zink_batch_resource_usage_set(struct zink_batch *batch, struct zink_resource *re void zink_batch_reference_resource_rw(struct zink_batch *batch, struct zink_resource *res, bool write) { - /* if the resource already has usage of any sort set for this batch, we can skip hashing */ - if (!zink_batch_usage_matches(res->obj->reads, batch->state) && - !zink_batch_usage_matches(res->obj->writes, batch->state)) { + /* if the resource already has usage of any sort set for this batch, */ + if (!zink_resource_usage_matches(res, batch->state) || + /* or if it's bound somewhere */ + !zink_resource_has_binds(res)) + /* then it already has a batch ref and doesn't need one here */ zink_batch_reference_resource(batch, res); - } zink_batch_resource_usage_set(batch, res, write); } @@ -668,16 +720,6 @@ zink_batch_reference_sampler_view(struct zink_batch *batch, zink_batch_reference_surface(batch, sv->image_view); } -void -zink_batch_reference_framebuffer(struct zink_batch *batch, - struct zink_framebuffer *fb) -{ - bool found; - _mesa_set_search_or_add(batch->state->fbs, fb, &found); - if (!found) - pipe_reference(NULL, &fb->reference); -} - void zink_batch_reference_program(struct zink_batch *batch, struct zink_program *pg) diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_batch.h b/mesa 3D driver/src/gallium/drivers/zink/zink_batch.h index fb929bb7f0..ddf668f3d6 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_batch.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_batch.h @@ -41,7 +41,6 @@ struct pipe_reference; struct zink_buffer_view; struct zink_context; struct zink_descriptor_set; -struct zink_framebuffer; struct zink_image_view; struct zink_program; struct zink_render_pass; @@ -62,7 +61,7 @@ batch_ptr_add_usage(struct zink_batch *batch, struct set *s, void *ptr); struct zink_batch_state { struct zink_fence fence; - struct pipe_reference reference; + struct zink_batch_state *next; struct zink_batch_usage usage; struct zink_context *ctx; @@ -77,15 +76,18 @@ struct zink_batch_state { struct pipe_resource *flush_res; - struct set *fbs; struct set *programs; struct set *resources; struct set *surfaces; struct set *bufferviews; + struct util_dynarray unref_resources; + struct util_dynarray bindless_releases[2]; + struct util_dynarray persistent_resources; struct util_dynarray zombie_samplers; + struct util_dynarray dead_framebuffers; struct set *active_queries; /* zink_query objects which were active at some point in this batch */ @@ -110,6 +112,7 @@ struct zink_batch { unsigned work_count; bool has_work; + bool last_was_compute; bool in_rp; //renderpass is currently active }; @@ -138,9 +141,6 @@ zink_batch_state_clear_resources(struct zink_screen *screen, struct zink_batch_s void zink_reset_batch(struct zink_context *ctx, struct zink_batch *batch); void -zink_batch_reference_framebuffer(struct zink_batch *batch, - struct zink_framebuffer *fb); -void zink_start_batch(struct zink_context *ctx, struct zink_batch *batch); void @@ -179,19 +179,6 @@ zink_batch_reference_surface(struct zink_batch *batch, struct zink_surface *surf void debug_describe_zink_batch_state(char *buf, const struct zink_batch_state *ptr); -static inline void
-zink_batch_state_reference(struct zink_screen *screen, - struct zink_batch_state **dst, - struct zink_batch_state *src) -{ - struct zink_batch_state *old_dst = dst ? *dst : NULL; - - if (pipe_reference_described(old_dst ? &old_dst->reference : NULL, src ? &src->reference : NULL, - (debug_reference_descriptor)debug_describe_zink_batch_state)) - zink_batch_state_destroy(screen, old_dst); - if (dst) *dst = src; -} - static inline bool zink_batch_usage_is_unflushed(const struct zink_batch_usage *u) { diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_blit.c b/mesa 3D driver/src/gallium/drivers/zink/zink_blit.c index 3e216a118b..7085c8e033 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_blit.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_blit.c @@ -52,8 +52,9 @@ blit_resolve(struct zink_context *ctx, const struct pipe_blit_info *info) apply_dst_clears(ctx, info, false); zink_fb_clears_apply_region(ctx, info->src.resource, zink_rect_from_box(&info->src.box)); - struct zink_batch *batch = zink_batch_no_rp(ctx); + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); zink_batch_reference_resource_rw(batch, src, false); zink_batch_reference_resource_rw(batch, dst, true); @@ -96,7 +97,7 @@ blit_resolve(struct zink_context *ctx, const struct pipe_blit_info *info) region.extent.width = info->dst.box.width; region.extent.height = info->dst.box.height; region.extent.depth = info->dst.box.depth; - vkCmdResolveImage(batch->state->cmdbuf, src->obj->image, src->layout, + VKCTX(CmdResolveImage)(batch->state->cmdbuf, src->obj->image, src->layout, dst->obj->image, dst->layout, 1, &region); @@ -157,7 +158,9 @@ blit_native(struct zink_context *ctx, const struct pipe_blit_info *info) apply_dst_clears(ctx, info, false); zink_fb_clears_apply_region(ctx, info->src.resource, zink_rect_from_box(&info->src.box)); - struct zink_batch *batch = zink_batch_no_rp(ctx); + + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); zink_batch_reference_resource_rw(batch, src, false); zink_batch_reference_resource_rw(batch, dst, true); @@ -235,7 +238,7 @@ blit_native(struct zink_context *ctx, const struct pipe_blit_info *info) } assert(region.dstOffsets[0].z != region.dstOffsets[1].z); - vkCmdBlitImage(batch->state->cmdbuf, src->obj->image, src->layout, + VKCTX(CmdBlitImage)(batch->state->cmdbuf, src->obj->image, src->layout, dst->obj->image, dst->layout, 1, &region, zink_filter(info->filter)); @@ -250,6 +253,11 @@ zink_blit(struct pipe_context *pctx, struct zink_context *ctx = zink_context(pctx); const struct util_format_description *src_desc = util_format_description(info->src.format); const struct util_format_description *dst_desc = util_format_description(info->dst.format); + + if (info->render_condition_enable && + unlikely(!zink_screen(pctx->screen)->info.have_EXT_conditional_rendering && !zink_check_conditional_render(ctx))) + return; + if (src_desc == dst_desc || src_desc->nr_channels != 4 || src_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || (src_desc->nr_channels == 4 && src_desc->channel[3].type != UTIL_FORMAT_TYPE_VOID)) { diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_bo.c b/mesa 3D driver/src/gallium/drivers/zink/zink_bo.c index 29c1bd0b72..e673efefb3 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_bo.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_bo.c @@ -130,7 +130,7 @@ bo_destroy(struct zink_screen *screen, struct pb_buffer *pbuf) zink_bo_unmap(screen, bo); } - vkFreeMemory(screen->dev, bo->mem, NULL); + VKSCR(FreeMemory)(screen->dev, bo->mem, NULL);
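/* A note on the vkFoo() -> VKSCR(Foo) / VKCTX(Foo) conversions running through
 * this patch: the macros presumably resolve entrypoints from per-screen and
 * per-context dispatch tables populated via vkGetDeviceProcAddr(), bypassing
 * the loader trampolines. A minimal sketch of the assumed shape (the macro
 * layout is a guess, not taken from this diff):
 *
 *    #define VKSCR(fn) screen->vk.fn
 *    #define VKCTX(fn) ctx->vk.fn
 *
 * which keeps call sites close to the core Vulkan names while dispatching
 * directly to the driver.
 */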
simple_mtx_destroy(&bo->lock); FREE(bo); @@ -265,7 +265,7 @@ bo_create_internal(struct zink_screen *screen, alignment = MAX2(alignment, screen->info.props.limits.minMemoryMapAlignment); mai.allocationSize = align(mai.allocationSize, screen->info.props.limits.minMemoryMapAlignment); } - VkResult ret = vkAllocateMemory(screen->dev, &mai, NULL, &bo->mem); + VkResult ret = VKSCR(AllocateMemory)(screen->dev, &mai, NULL, &bo->mem); if (!zink_screen_handle_vkresult(screen, ret)) goto fail; @@ -642,7 +642,7 @@ zink_bo_map(struct zink_screen *screen, struct zink_bo *bo) * be atomic thanks to the lock. */ cpu = real->u.real.cpu_ptr; if (!cpu) { - VkResult result = vkMapMemory(screen->dev, real->mem, 0, real->base.size, 0, &cpu); + VkResult result = VKSCR(MapMemory)(screen->dev, real->mem, 0, real->base.size, 0, &cpu); if (result != VK_SUCCESS) { simple_mtx_unlock(&real->lock); return NULL; @@ -665,7 +665,7 @@ zink_bo_unmap(struct zink_screen *screen, struct zink_bo *bo) if (p_atomic_dec_zero(&real->u.real.map_count)) { p_atomic_set(&real->u.real.cpu_ptr, NULL); - vkUnmapMemory(screen->dev, real->mem); + VKSCR(UnmapMemory)(screen->dev, real->mem); } } @@ -681,7 +681,7 @@ resource_commit(struct zink_screen *screen, VkBindSparseInfo *sparse) { VkQueue queue = screen->threaded ? screen->thread_queue : screen->queue; - VkResult ret = vkQueueBindSparse(queue, 1, sparse, VK_NULL_HANDLE); + VkResult ret = VKSCR(QueueBindSparse)(queue, 1, sparse, VK_NULL_HANDLE); return zink_screen_handle_vkresult(screen, ret); } diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_bo.h b/mesa 3D driver/src/gallium/drivers/zink/zink_bo.h index 403e2d6de5..7dc4d0c341 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_bo.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_bo.h @@ -32,6 +32,7 @@ #include "zink_batch.h" #define VK_VIS_VRAM (VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) +#define VK_LAZY_VRAM (VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) enum zink_resource_access { ZINK_RESOURCE_ACCESS_READ = 1, ZINK_RESOURCE_ACCESS_WRITE = 32, @@ -42,8 +43,8 @@ enum zink_resource_access { enum zink_heap { ZINK_HEAP_DEVICE_LOCAL, ZINK_HEAP_DEVICE_LOCAL_SPARSE, + ZINK_HEAP_DEVICE_LOCAL_LAZY, ZINK_HEAP_DEVICE_LOCAL_VISIBLE, - ZINK_HEAP_HOST_VISIBLE_ANY, ZINK_HEAP_HOST_VISIBLE_COHERENT, ZINK_HEAP_HOST_VISIBLE_CACHED, ZINK_HEAP_MAX, @@ -129,12 +130,12 @@ vk_domain_from_heap(enum zink_heap heap) case ZINK_HEAP_DEVICE_LOCAL_SPARSE: domains = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; break; + case ZINK_HEAP_DEVICE_LOCAL_LAZY: + domains = VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; case ZINK_HEAP_DEVICE_LOCAL_VISIBLE: domains = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; break; - case ZINK_HEAP_HOST_VISIBLE_ANY: - domains = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - break; case ZINK_HEAP_HOST_VISIBLE_COHERENT: domains = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; break; @@ -159,13 +160,10 @@ zink_heap_from_domain_flags(VkMemoryPropertyFlags domains, enum zink_alloc_flag if (domains & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) return ZINK_HEAP_DEVICE_LOCAL; - if (domains & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) - return ZINK_HEAP_HOST_VISIBLE_COHERENT; - if (domains & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) return ZINK_HEAP_HOST_VISIBLE_CACHED; - return ZINK_HEAP_HOST_VISIBLE_ANY; + return ZINK_HEAP_HOST_VISIBLE_COHERENT; } bool @@ -252,11 +250,12 @@ zink_bo_usage_set(struct 
zink_bo *bo, struct zink_batch_state *bs, bool write) zink_batch_usage_set(&bo->reads, bs); } -static inline void +static inline bool zink_bo_usage_unset(struct zink_bo *bo, struct zink_batch_state *bs) { zink_batch_usage_unset(&bo->reads, bs); zink_batch_usage_unset(&bo->writes, bs); + return bo->reads || bo->writes; } diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_clear.c b/mesa 3D driver/src/gallium/drivers/zink/zink_clear.c index 09be568af5..77f294f527 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_clear.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_clear.c @@ -116,14 +116,16 @@ clear_in_rp(struct pipe_context *pctx, } cr.baseArrayLayer = 0; cr.layerCount = util_framebuffer_get_num_layers(fb); - struct zink_batch *batch = zink_batch_rp(ctx); - vkCmdClearAttachments(batch->state->cmdbuf, num_attachments, attachments, 1, &cr); + struct zink_batch *batch = &ctx->batch; + zink_batch_rp(ctx); + VKCTX(CmdClearAttachments)(batch->state->cmdbuf, num_attachments, attachments, 1, &cr); } static void clear_color_no_rp(struct zink_context *ctx, struct zink_resource *res, const union pipe_color_union *pcolor, unsigned level, unsigned layer, unsigned layerCount) { - struct zink_batch *batch = zink_batch_no_rp(ctx); + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); VkImageSubresourceRange range = {0}; range.baseMipLevel = level; range.levelCount = 1; @@ -139,15 +141,16 @@ clear_color_no_rp(struct zink_context *ctx, struct zink_resource *res, const uni if (zink_resource_image_needs_barrier(res, VK_IMAGE_LAYOUT_GENERAL, 0, 0) && zink_resource_image_needs_barrier(res, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0)) - zink_resource_image_barrier(ctx, NULL, res, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0); + zink_resource_image_barrier(ctx, res, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0); zink_batch_reference_resource_rw(batch, res, true); - vkCmdClearColorImage(batch->state->cmdbuf, res->obj->image, res->layout, &color, 1, &range); + VKCTX(CmdClearColorImage)(batch->state->cmdbuf, res->obj->image, res->layout, &color, 1, &range); } static void clear_zs_no_rp(struct zink_context *ctx, struct zink_resource *res, VkImageAspectFlags aspects, double depth, unsigned stencil, unsigned level, unsigned layer, unsigned layerCount) { - struct zink_batch *batch = zink_batch_no_rp(ctx); + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); VkImageSubresourceRange range = {0}; range.baseMipLevel = level; range.levelCount = 1; @@ -159,9 +162,9 @@ clear_zs_no_rp(struct zink_context *ctx, struct zink_resource *res, VkImageAspec if (zink_resource_image_needs_barrier(res, VK_IMAGE_LAYOUT_GENERAL, 0, 0) && zink_resource_image_needs_barrier(res, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0)) - zink_resource_image_barrier(ctx, NULL, res, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0); + zink_resource_image_barrier(ctx, res, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0); zink_batch_reference_resource_rw(batch, res, true); - vkCmdClearDepthStencilImage(batch->state->cmdbuf, res->obj->image, res->layout, &zs_value, 1, &range); + VKCTX(CmdClearDepthStencilImage)(batch->state->cmdbuf, res->obj->image, res->layout, &zs_value, 1, &range); } @@ -197,6 +200,9 @@ zink_clear(struct pipe_context *pctx, struct zink_batch *batch = &ctx->batch; bool needs_rp = false; + if (unlikely(!zink_screen(pctx->screen)->info.have_EXT_conditional_rendering && !zink_check_conditional_render(ctx))) + return; + if (scissor_state) { struct u_rect scissor = {scissor_state->minx, scissor_state->maxx, 
scissor_state->miny, scissor_state->maxy}; needs_rp = !zink_blit_region_fills(scissor, fb->width, fb->height); @@ -448,10 +454,11 @@ zink_clear_buffer(struct pipe_context *pctx, - size is the number of bytes to fill, and must be either a multiple of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of the buffer */ - struct zink_batch *batch = zink_batch_no_rp(ctx); + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); zink_batch_reference_resource_rw(batch, res, true); util_range_add(&res->base.b, &res->valid_buffer_range, offset, offset + size); - vkCmdFillBuffer(batch->state->cmdbuf, res->obj->buffer, offset, size, *(uint32_t*)clear_value); + VKCTX(CmdFillBuffer)(batch->state->cmdbuf, res->obj->buffer, offset, size, *(uint32_t*)clear_value); return; } struct pipe_transfer *xfer; @@ -535,12 +542,13 @@ static void fb_clears_apply_internal(struct zink_context *ctx, struct pipe_resource *pres, int i) { struct zink_framebuffer_clear *fb_clear = &ctx->fb_clears[i]; + struct zink_resource *res = zink_resource(pres); if (!zink_fb_clear_enabled(ctx, i)) return; if (ctx->batch.in_rp) zink_clear_framebuffer(ctx, BITFIELD_BIT(i)); - else if (zink_resource(pres)->aspect == VK_IMAGE_ASPECT_COLOR_BIT) { + else if (res->aspect == VK_IMAGE_ASPECT_COLOR_BIT) { if (zink_fb_clear_needs_explicit(fb_clear) || !check_3d_layers(ctx->fb_state.cbufs[i])) /* this will automatically trigger all the clears */ zink_batch_rp(ctx); @@ -550,7 +558,7 @@ fb_clears_apply_internal(struct zink_context *ctx, struct pipe_resource *pres, i union pipe_color_union color; zink_fb_clear_util_unpack_clear_color(clear, psurf->format, &color); - clear_color_no_rp(ctx, zink_resource(pres), &color, + clear_color_no_rp(ctx, res, &color, psurf->u.tex.level, psurf->u.tex.first_layer, psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1); } @@ -568,7 +576,7 @@ fb_clears_apply_internal(struct zink_context *ctx, struct pipe_resource *pres, i aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; if (clear->zs.bits & PIPE_CLEAR_STENCIL) aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; - clear_zs_no_rp(ctx, zink_resource(pres), aspects, clear->zs.depth, clear->zs.stencil, + clear_zs_no_rp(ctx, res, aspects, clear->zs.depth, clear->zs.stencil, psurf->u.tex.level, psurf->u.tex.first_layer, psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1); } @@ -596,7 +604,6 @@ zink_fb_clears_apply(struct zink_context *ctx, struct pipe_resource *pres) for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { if (ctx->fb_state.cbufs[i] && ctx->fb_state.cbufs[i]->texture == pres) { fb_clears_apply_internal(ctx, pres, i); - return; } } } else { @@ -614,7 +621,6 @@ zink_fb_clears_discard(struct zink_context *ctx, struct pipe_resource *pres) if (ctx->fb_state.cbufs[i] && ctx->fb_state.cbufs[i]->texture == pres) { if (zink_fb_clear_enabled(ctx, i)) { zink_fb_clear_reset(ctx, i); - return; } } } @@ -688,7 +694,6 @@ zink_fb_clears_apply_or_discard(struct zink_context *ctx, struct pipe_resource * for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { if (ctx->fb_state.cbufs[i] && ctx->fb_state.cbufs[i]->texture == pres) { fb_clears_apply_or_discard_internal(ctx, pres, region, discard_only, false, i); - return; } } } else { @@ -705,7 +710,6 @@ zink_fb_clears_apply_region(struct zink_context *ctx, struct pipe_resource *pres for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { if (ctx->fb_state.cbufs[i] && ctx->fb_state.cbufs[i]->texture == pres) { fb_clears_apply_or_discard_internal(ctx, pres, region, false, true, i); - return; } } } else {
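The `return` statements deleted from the four scan loops above are a behavior change rather than cleanup: presumably because a single pipe_resource can back more than one color attachment, the loops must keep scanning and handle every matching cbuf instead of stopping at the first hit. A minimal sketch of the resulting pattern (function names as in the hunks above):

   for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) {
      if (ctx->fb_state.cbufs[i] && ctx->fb_state.cbufs[i]->texture == pres)
         fb_clears_apply_internal(ctx, pres, i); /* no early return: apply for every binding */
   }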
diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.c b/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.c index c4ac008157..1715ce2c2d 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.c @@ -375,6 +375,8 @@ zink_screen_init_compiler(struct zink_screen *screen) .lower_uadd_carry = true, .lower_pack_64_2x32_split = true, .lower_unpack_64_2x32_split = true, + .lower_pack_32_2x16_split = true, + .lower_unpack_32_2x16_split = true, .lower_vector_cmp = true, .lower_int64_options = 0, .lower_doubles_options = ~nir_lower_fp64_full_software, @@ -448,6 +450,56 @@ optimize_nir(struct nir_shader *s) } while (progress); } +/* - copy the lowered fbfetch variable + * - set the new one up as an input attachment for descriptor 0.6 + * - load it as an image + * - overwrite the previous load + */ +static bool +lower_fbfetch_instr(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_deref) + return false; + nir_variable *var = nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0])); + if (var != data) + return false; + b->cursor = nir_after_instr(instr); + nir_variable *fbfetch = nir_variable_clone(data, b->shader); + /* If Dim is SubpassData, ... Image Format must be Unknown + * - SPIRV OpTypeImage specification + */ + fbfetch->data.image.format = 0; + fbfetch->data.index = 0; /* fix this if more than 1 fbfetch target is supported */ + fbfetch->data.mode = nir_var_uniform; + fbfetch->data.binding = ZINK_FBFETCH_BINDING; + fbfetch->type = glsl_image_type(GLSL_SAMPLER_DIM_SUBPASS, false, GLSL_TYPE_FLOAT); + nir_shader_add_variable(b->shader, fbfetch); + nir_ssa_def *deref = &nir_build_deref_var(b, fbfetch)->dest.ssa; + nir_ssa_def *load = nir_image_deref_load(b, 4, 32, deref, nir_imm_vec4(b, 0, 0, 0, 1), nir_ssa_undef(b, 1, 32), nir_imm_int(b, 0)); + unsigned swiz[4] = {2, 1, 0, 3}; + nir_ssa_def *swizzle = nir_swizzle(b, load, swiz, 4); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, swizzle); + return true; +} + +static bool +lower_fbfetch(nir_shader *shader, nir_variable **fbfetch) +{ + nir_foreach_shader_out_variable(var, shader) { + if (var->data.fb_fetch_output) { + *fbfetch = var; + break; + } + } + assert(*fbfetch); + if (!*fbfetch) + return false; + return nir_shader_instructions_pass(shader, lower_fbfetch_instr, nir_metadata_dominance, *fbfetch); +} + /* check for a genuine gl_PointSize output vs one from nir_lower_point_size_mov */ static bool check_psiz(struct nir_shader *s) @@ -512,6 +564,79 @@ update_so_info(struct zink_shader *zs, const struct pipe_stream_output_info *so_ zs->streamout.have_xfb = !!zs->streamout.so_info.num_outputs; } +struct decompose_state { + nir_variable **split; + bool needs_w; +}; + +static bool +lower_attrib(nir_builder *b, nir_instr *instr, void *data) +{ + struct decompose_state *state = data; + nir_variable **split = state->split; + if (instr->type != nir_instr_type_intrinsic) + return false; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_deref) + return false; + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + nir_variable *var = nir_deref_instr_get_variable(deref); + if (var != split[0]) + return false; + unsigned num_components = glsl_get_vector_elements(split[0]->type); + b->cursor = nir_after_instr(instr); + nir_ssa_def *loads[4]; + for (unsigned i = 0; i <
(state->needs_w ? num_components - 1 : num_components); i++) + loads[i] = nir_load_deref(b, nir_build_deref_var(b, split[i+1])); + if (state->needs_w) { + /* oob load w component to get correct value for int/float */ + loads[3] = nir_channel(b, loads[0], 3); + loads[0] = nir_channel(b, loads[0], 0); + } + nir_ssa_def *new_load = nir_vec(b, loads, num_components); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, new_load); + nir_instr_remove_v(instr); + return true; +} + +static bool +decompose_attribs(nir_shader *nir, uint32_t decomposed_attrs, uint32_t decomposed_attrs_without_w) +{ + uint32_t bits = 0; + nir_foreach_variable_with_modes(var, nir, nir_var_shader_in) + bits |= BITFIELD_BIT(var->data.driver_location); + bits = ~bits; + u_foreach_bit(location, decomposed_attrs | decomposed_attrs_without_w) { + nir_variable *split[5]; + struct decompose_state state; + state.split = split; + nir_variable *var = nir_find_variable_with_driver_location(nir, nir_var_shader_in, location); + assert(var); + split[0] = var; + bits |= BITFIELD_BIT(var->data.driver_location); + const struct glsl_type *new_type = glsl_type_is_scalar(var->type) ? var->type : glsl_get_array_element(var->type); + unsigned num_components = glsl_get_vector_elements(var->type); + state.needs_w = (decomposed_attrs_without_w & BITFIELD_BIT(location)) != 0 && num_components == 4; + for (unsigned i = 0; i < (state.needs_w ? num_components - 1 : num_components); i++) { + split[i+1] = nir_variable_clone(var, nir); + split[i+1]->name = ralloc_asprintf(nir, "%s_split%u", var->name, i); + if (decomposed_attrs_without_w & BITFIELD_BIT(location)) + split[i+1]->type = !i && num_components == 4 ? var->type : new_type; + else + split[i+1]->type = new_type; + split[i+1]->data.driver_location = ffs(bits) - 1; + bits &= ~BITFIELD_BIT(split[i+1]->data.driver_location); + nir_shader_add_variable(nir, split[i+1]); + } + var->data.mode = nir_var_shader_temp; + nir_shader_instructions_pass(nir, lower_attrib, nir_metadata_dominance, &state); + } + nir_fixup_deref_modes(nir); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL); + optimize_nir(nir); + return true; +} + static void assign_producer_var_io(gl_shader_stage stage, nir_variable *var, unsigned *reserved, unsigned char *slot_map) { @@ -657,7 +782,7 @@ zink_compiler_assign_io(nir_shader *producer, nir_shader *consumer) } VkShaderModule -zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *base_nir, struct zink_shader_key *key) +zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *base_nir, const struct zink_shader_key *key) { VkShaderModule mod = VK_NULL_HANDLE; void *streamout = NULL; @@ -676,48 +801,78 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shad NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out); } - } - /* TODO: use a separate mem ctx here for ralloc */ - switch (zs->nir->info.stage) { - case MESA_SHADER_VERTEX: - case MESA_SHADER_TESS_EVAL: - case MESA_SHADER_GEOMETRY: - if (zink_vs_key(key)->last_vertex_stage) { - if (zs->streamout.have_xfb) - streamout = &zs->streamout; + /* TODO: use a separate mem ctx here for ralloc */ + switch (zs->nir->info.stage) { + case MESA_SHADER_VERTEX: { + uint32_t decomposed_attrs = 0, decomposed_attrs_without_w = 0; + const struct zink_vs_key *vs_key = zink_vs_key(key); + switch (vs_key->size) { + case 4: + decomposed_attrs = vs_key->u32.decomposed_attrs; + decomposed_attrs_without_w =
vs_key->u32.decomposed_attrs_without_w; + break; + case 2: + decomposed_attrs = vs_key->u16.decomposed_attrs; + decomposed_attrs_without_w = vs_key->u16.decomposed_attrs_without_w; + break; + case 1: + decomposed_attrs = vs_key->u8.decomposed_attrs; + decomposed_attrs_without_w = vs_key->u8.decomposed_attrs_without_w; + break; + default: break; + } + if (decomposed_attrs || decomposed_attrs_without_w) + NIR_PASS_V(nir, decompose_attribs, decomposed_attrs, decomposed_attrs_without_w); + FALLTHROUGH; + } + case MESA_SHADER_TESS_EVAL: + case MESA_SHADER_GEOMETRY: + if (zink_vs_key_base(key)->last_vertex_stage) { + if (zs->streamout.have_xfb) + streamout = &zs->streamout; - if (!zink_vs_key(key)->clip_halfz) { - NIR_PASS_V(nir, nir_lower_clip_halfz); + if (!zink_vs_key_base(key)->clip_halfz) { + NIR_PASS_V(nir, nir_lower_clip_halfz); + } + if (zink_vs_key_base(key)->push_drawid) { + NIR_PASS_V(nir, lower_drawid); + } } - if (zink_vs_key(key)->push_drawid) { - NIR_PASS_V(nir, lower_drawid); + break; + case MESA_SHADER_FRAGMENT: + if (!zink_fs_key(key)->samples && + nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) { + /* VK will always use gl_SampleMask[] values even if sample count is 0, + * so we need to skip this write here to mimic GL's behavior of ignoring it + */ + nir_foreach_shader_out_variable(var, nir) { + if (var->data.location == FRAG_RESULT_SAMPLE_MASK) + var->data.mode = nir_var_shader_temp; + } + nir_fixup_deref_modes(nir); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL); + optimize_nir(nir); } - } - break; - case MESA_SHADER_FRAGMENT: - if (!zink_fs_key(key)->samples && - nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) { - /* VK will always use gl_SampleMask[] values even if sample count is 0, - * so we need to skip this write here to mimic GL's behavior of ignoring it - */ - nir_foreach_shader_out_variable(var, nir) { - if (var->data.location == FRAG_RESULT_SAMPLE_MASK) - var->data.mode = nir_var_shader_temp; + if (zink_fs_key(key)->force_dual_color_blend && nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA1)) { + NIR_PASS_V(nir, lower_dual_blend); } - nir_fixup_deref_modes(nir); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL); - optimize_nir(nir); + if (zink_fs_key(key)->coord_replace_bits) { + NIR_PASS_V(nir, nir_lower_texcoord_replace, zink_fs_key(key)->coord_replace_bits, + false, zink_fs_key(key)->coord_replace_yinvert); + } + if (nir->info.fs.uses_fbfetch_output) { + nir_variable *fbfetch = NULL; + NIR_PASS_V(nir, lower_fbfetch, &fbfetch); + /* old variable must be deleted to avoid spirv errors */ + fbfetch->data.mode = nir_var_shader_temp; + nir_fixup_deref_modes(nir); + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_temp, NULL); + optimize_nir(nir); + } + break; + default: break; } - if (zink_fs_key(key)->force_dual_color_blend && nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA1)) { - NIR_PASS_V(nir, lower_dual_blend); - } - if (zink_fs_key(key)->coord_replace_bits) { - NIR_PASS_V(nir, nir_lower_texcoord_replace, zink_fs_key(key)->coord_replace_bits, - false, zink_fs_key(key)->coord_replace_yinvert); - } - break; - default: break; } NIR_PASS_V(nir, nir_convert_from_ssa, true); @@ -742,7 +897,7 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shad smci.codeSize = spirv->num_words * sizeof(uint32_t); smci.pCode = spirv->words; - if (vkCreateShaderModule(screen->dev, &smci, NULL, &mod) != VK_SUCCESS) + if 
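/* VKSCR() routes the call through the screen's device dispatch table instead of the global vkCreateShaderModule symbol; assuming the macro is roughly
 *
 *   #define VKSCR(fn) screen->vk.fn
 *
 * with the table filled from vkGetDeviceProcAddr at screen creation */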
(VKSCR(CreateShaderModule)(screen->dev, &smci, NULL, &mod) != VK_SUCCESS) mod = VK_NULL_HANDLE; done: @@ -887,6 +1042,178 @@ unbreak_bos(nir_shader *shader) return true; } +/* this is a "default" bindless texture used if the shader has no texture variables */ +static nir_variable * +create_bindless_texture(nir_shader *nir, nir_tex_instr *tex) +{ + unsigned binding = tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ? 1 : 0; + nir_variable *var; + + const struct glsl_type *sampler_type = glsl_sampler_type(tex->sampler_dim, tex->is_shadow, tex->is_array, GLSL_TYPE_FLOAT); + var = nir_variable_create(nir, nir_var_uniform, glsl_array_type(sampler_type, ZINK_MAX_BINDLESS_HANDLES, 0), "bindless_texture"); + var->data.descriptor_set = ZINK_DESCRIPTOR_BINDLESS; + var->data.driver_location = var->data.binding = binding; + return var; +} + +/* this is a "default" bindless image used if the shader has no image variables */ +static nir_variable * +create_bindless_image(nir_shader *nir, enum glsl_sampler_dim dim) +{ + unsigned binding = dim == GLSL_SAMPLER_DIM_BUF ? 3 : 2; + nir_variable *var; + + const struct glsl_type *image_type = glsl_image_type(dim, false, GLSL_TYPE_FLOAT); + var = nir_variable_create(nir, nir_var_image, glsl_array_type(image_type, ZINK_MAX_BINDLESS_HANDLES, 0), "bindless_image"); + var->data.descriptor_set = ZINK_DESCRIPTOR_BINDLESS; + var->data.driver_location = var->data.binding = binding; + var->data.image.format = PIPE_FORMAT_R8G8B8A8_UNORM; + return var; +} + +/* rewrite bindless instructions as array deref instructions */ +static bool +lower_bindless_instr(nir_builder *b, nir_instr *in, void *data) +{ + nir_variable **bindless = data; + + if (in->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(in); + int idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle); + if (idx == -1) + return false; + + nir_variable *var = tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ? 
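/* bindless[] slot layout, matching the bindings handed out by create_bindless_texture/create_bindless_image and handle_bindless_var:
 *   [0] sampled images   [1] sampled texel buffers
 *   [2] storage images   [3] storage texel buffers
 * so buffer-dimension tex instructions select the odd slot here */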
bindless[1] : bindless[0]; + if (!var) + var = create_bindless_texture(b->shader, tex); + b->cursor = nir_before_instr(in); + nir_deref_instr *deref = nir_build_deref_var(b, var); + if (glsl_type_is_array(var->type)) + deref = nir_build_deref_array(b, deref, nir_u2uN(b, tex->src[idx].src.ssa, 32)); + nir_instr_rewrite_src_ssa(in, &tex->src[idx].src, &deref->dest.ssa); + + /* bindless sampling uses the variable type directly, which means the tex instr has to exactly + * match up with it in contrast to normal sampler ops where things are a bit more flexible; + * this results in cases where a shader is passed with sampler2DArray but the tex instr only has + * 2 components, which explodes spirv compilation even though it doesn't trigger validation errors + * + * to fix this, pad the coord src here and fix the tex instr so that ntv will do the "right" thing + * - Warhammer 40k: Dawn of War III + */ + unsigned needed_components = glsl_get_sampler_coordinate_components(glsl_without_array(var->type)); + unsigned c = nir_tex_instr_src_index(tex, nir_tex_src_coord); + unsigned coord_components = nir_src_num_components(tex->src[c].src); + if (coord_components < needed_components) { + nir_ssa_def *def = nir_pad_vector(b, tex->src[c].src.ssa, needed_components); + nir_instr_rewrite_src_ssa(in, &tex->src[c].src, def); + tex->coord_components = needed_components; + } + return true; + } + if (in->type != nir_instr_type_intrinsic) + return false; + nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in); + + nir_intrinsic_op op; +#define OP_SWAP(OP) \ + case nir_intrinsic_bindless_image_##OP: \ + op = nir_intrinsic_image_deref_##OP; \ + break; + + + /* convert bindless intrinsics to deref intrinsics */ + switch (instr->intrinsic) { + OP_SWAP(atomic_add) + OP_SWAP(atomic_and) + OP_SWAP(atomic_comp_swap) + OP_SWAP(atomic_dec_wrap) + OP_SWAP(atomic_exchange) + OP_SWAP(atomic_fadd) + OP_SWAP(atomic_fmax) + OP_SWAP(atomic_fmin) + OP_SWAP(atomic_imax) + OP_SWAP(atomic_imin) + OP_SWAP(atomic_inc_wrap) + OP_SWAP(atomic_or) + OP_SWAP(atomic_umax) + OP_SWAP(atomic_umin) + OP_SWAP(atomic_xor) + OP_SWAP(format) + OP_SWAP(load) + OP_SWAP(order) + OP_SWAP(samples) + OP_SWAP(size) + OP_SWAP(store) + default: + return false; + } + + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); + nir_variable *var = dim == GLSL_SAMPLER_DIM_BUF ? 
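/* each OP_SWAP case renames a bindless intrinsic to its deref form in place; e.g. OP_SWAP(load) expands to
 *
 *   case nir_intrinsic_bindless_image_load:
 *      op = nir_intrinsic_image_deref_load;
 *      break;
 *
 * after which the access looks like an ordinary image-array deref to the rest of the descriptor machinery */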
bindless[3] : bindless[2]; + if (!var) + var = create_bindless_image(b->shader, dim); + instr->intrinsic = op; + b->cursor = nir_before_instr(in); + nir_deref_instr *deref = nir_build_deref_var(b, var); + if (glsl_type_is_array(var->type)) + deref = nir_build_deref_array(b, deref, nir_u2uN(b, instr->src[0].ssa, 32)); + nir_instr_rewrite_src_ssa(in, &instr->src[0], &deref->dest.ssa); + return true; +} + +static bool +lower_bindless(nir_shader *shader, nir_variable **bindless) +{ + if (!nir_shader_instructions_pass(shader, lower_bindless_instr, nir_metadata_dominance, bindless)) + return false; + nir_fixup_deref_modes(shader); + NIR_PASS_V(shader, nir_remove_dead_variables, nir_var_shader_temp, NULL); + optimize_nir(shader); + return true; +} + +/* convert shader image/texture io variables to int64 handles for bindless indexing */ +static bool +lower_bindless_io_instr(nir_builder *b, nir_instr *in, void *data) +{ + if (in->type != nir_instr_type_intrinsic) + return false; + nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in); + if (instr->intrinsic != nir_intrinsic_load_deref && + instr->intrinsic != nir_intrinsic_store_deref) + return false; + + nir_deref_instr *src_deref = nir_src_as_deref(instr->src[0]); + nir_variable *var = nir_deref_instr_get_variable(src_deref); + if (var->data.bindless) + return false; + if (var->data.mode != nir_var_shader_in && var->data.mode != nir_var_shader_out) + return false; + if (!glsl_type_is_image(var->type) && !glsl_type_is_sampler(var->type)) + return false; + + var->type = glsl_int64_t_type(); + var->data.bindless = 1; + b->cursor = nir_before_instr(in); + nir_deref_instr *deref = nir_build_deref_var(b, var); + if (instr->intrinsic == nir_intrinsic_load_deref) { + nir_ssa_def *def = nir_load_deref(b, deref); + nir_instr_rewrite_src_ssa(in, &instr->src[0], def); + nir_ssa_def_rewrite_uses(&instr->dest.ssa, def); + } else { + nir_store_deref(b, deref, instr->src[1].ssa, nir_intrinsic_write_mask(instr)); + } + nir_instr_remove(in); + nir_instr_remove(&src_deref->instr); + return true; +} + +static bool +lower_bindless_io(nir_shader *shader) +{ + return nir_shader_instructions_pass(shader, lower_bindless_io_instr, nir_metadata_dominance, NULL); +} + static uint32_t zink_binding(gl_shader_stage stage, VkDescriptorType type, int index) { @@ -919,6 +1246,84 @@ zink_binding(gl_shader_stage stage, VkDescriptorType type, int index) } } +static void +handle_bindless_var(nir_shader *nir, nir_variable *var, const struct glsl_type *type, nir_variable **bindless) +{ + if (glsl_type_is_struct(type)) { + for (unsigned i = 0; i < glsl_get_length(type); i++) + handle_bindless_var(nir, var, glsl_get_struct_field(type, i), bindless); + return; + } + + /* just a random scalar in a struct */ + if (!glsl_type_is_image(type) && !glsl_type_is_sampler(type)) + return; + + VkDescriptorType vktype = glsl_type_is_image(type) ? 
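/* ARB_bindless_texture hands handles across stages as 64-bit integers, which is why lower_bindless_io above retypes in/out image and sampler variables to glsl_int64_t_type(); sampler/image uniforms instead reach handle_bindless_var here and are folded into one shared ZINK_MAX_BINDLESS_HANDLES-element array per descriptor class */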
zink_image_type(type) : zink_sampler_type(type); + unsigned binding; + switch (vktype) { + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + binding = 0; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + binding = 1; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + binding = 2; + break; + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + binding = 3; + break; + default: + unreachable("unknown"); + } + if (!bindless[binding]) { + bindless[binding] = nir_variable_clone(var, nir); + bindless[binding]->data.bindless = 0; + bindless[binding]->data.descriptor_set = ZINK_DESCRIPTOR_BINDLESS; + bindless[binding]->type = glsl_array_type(type, ZINK_MAX_BINDLESS_HANDLES, 0); + bindless[binding]->data.driver_location = bindless[binding]->data.binding = binding; + if (!bindless[binding]->data.image.format) + bindless[binding]->data.image.format = PIPE_FORMAT_R8G8B8A8_UNORM; + nir_shader_add_variable(nir, bindless[binding]); + } else { + assert(glsl_get_sampler_dim(glsl_without_array(bindless[binding]->type)) == glsl_get_sampler_dim(glsl_without_array(var->type))); + } + var->data.mode = nir_var_shader_temp; +} + +static enum pipe_prim_type +gl_prim_to_pipe(unsigned primitive_type) +{ + switch (primitive_type) { + case GL_POINTS: + return PIPE_PRIM_POINTS; + case GL_LINES: + case GL_LINE_LOOP: + case GL_LINE_STRIP: + case GL_LINES_ADJACENCY: + case GL_LINE_STRIP_ADJACENCY: + case GL_ISOLINES: + return PIPE_PRIM_LINES; + default: + return PIPE_PRIM_TRIANGLES; + } +} + +static enum pipe_prim_type +get_shader_base_prim_type(struct nir_shader *nir) +{ + switch (nir->info.stage) { + case MESA_SHADER_GEOMETRY: + return gl_prim_to_pipe(nir->info.gs.output_primitive); + case MESA_SHADER_TESS_EVAL: + return nir->info.tess.point_mode ? PIPE_PRIM_POINTS : gl_prim_to_pipe(nir->info.tess.primitive_mode); + default: + break; + } + return PIPE_PRIM_MAX; +} + struct zink_shader * zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, const struct pipe_stream_output_info *so_info) @@ -926,6 +1331,9 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, struct zink_shader *ret = CALLOC_STRUCT(zink_shader); bool have_psiz = false; + ret->hash = _mesa_hash_pointer(ret); + ret->reduced_prim = get_shader_base_prim_type(nir); + ret->programs = _mesa_pointer_set_create(NULL); simple_mtx_init(&ret->lock, mtx_plain); @@ -976,8 +1384,20 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, fprintf(stderr, "---8<---\n"); } - foreach_list_typed_reverse(nir_variable, var, node, &nir->variables) { + nir_variable *bindless[4] = {0}; + bool has_bindless_io = false; + nir_foreach_variable_with_modes(var, nir, nir_var_shader_in | nir_var_shader_out) { + if (glsl_type_is_image(var->type) || glsl_type_is_sampler(var->type)) { + has_bindless_io = true; + break; + } + } + if (has_bindless_io) + NIR_PASS_V(nir, lower_bindless_io); + + foreach_list_typed_reverse_safe(nir_variable, var, node, &nir->variables) { if (_nir_shader_variable_has_mode(var, nir_var_uniform | + nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo)) { enum zink_descriptor_type ztype; @@ -1013,12 +1433,16 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, ret->bindings[ztype][ret->num_bindings[ztype]].size = 1; ret->num_bindings[ztype]++; } else { - assert(var->data.mode == nir_var_uniform); - if (glsl_type_is_sampler(type) || glsl_type_is_image(type)) { + assert(var->data.mode == nir_var_uniform || + var->data.mode == nir_var_image); + if (var->data.bindless) { + ret->bindless = true; + 
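/* the precomputed pointer hash and reduced_prim (PIPE_PRIM_POINTS/LINES/TRIANGLES from gl_prim_to_pipe above, or PIPE_PRIM_MAX when the stage has no output primitive) are, presumably, cached on the zink_shader so later program hashing and primitive-type checks don't have to rederive them per draw */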
handle_bindless_var(nir, var, type, bindless); + } else if (glsl_type_is_sampler(type) || glsl_type_is_image(type)) { VkDescriptorType vktype = glsl_type_is_image(type) ? zink_image_type(type) : zink_sampler_type(type); + ztype = zink_desc_type_from_vktype(vktype); if (vktype == VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER) ret->num_texel_buffers++; - ztype = zink_desc_type_from_vktype(vktype); var->data.driver_location = var->data.binding; var->data.descriptor_set = ztype + 1; var->data.binding = zink_binding(nir->info.stage, vktype, var->data.driver_location); @@ -1034,6 +1458,9 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, } } } + bool bindless_lowered = false; + NIR_PASS(bindless_lowered, nir, lower_bindless, bindless); + ret->bindless |= bindless_lowered; ret->nir = nir; if (so_info && nir->info.outputs_written && nir->info.has_transform_feedback_varyings) @@ -1042,7 +1469,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, return ret; } -void +char * zink_shader_finalize(struct pipe_screen *pscreen, void *nirptr) { struct zink_screen *screen = zink_screen(pscreen); @@ -1062,29 +1489,35 @@ zink_shader_finalize(struct pipe_screen *pscreen, void *nirptr) nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); if (screen->driconf.inline_uniforms) nir_find_inlinable_uniforms(nir); + + return NULL; } void zink_shader_free(struct zink_context *ctx, struct zink_shader *shader) { - struct zink_screen *screen = zink_screen(ctx->base.screen); set_foreach(shader->programs, entry) { if (shader->nir->info.stage == MESA_SHADER_COMPUTE) { struct zink_compute_program *comp = (void*)entry->key; - _mesa_hash_table_remove_key(ctx->compute_program_cache, comp->shader); + if (!comp->base.removed) { + _mesa_hash_table_remove_key(&ctx->compute_program_cache, comp->shader); + comp->base.removed = true; + } comp->shader = NULL; - zink_compute_program_reference(screen, &comp, NULL); + zink_compute_program_reference(ctx, &comp, NULL); } else { struct zink_gfx_program *prog = (void*)entry->key; enum pipe_shader_type pstage = pipe_shader_type_from_mesa(shader->nir->info.stage); assert(pstage < ZINK_SHADER_COUNT); - if (shader->nir->info.stage != MESA_SHADER_TESS_CTRL || !shader->is_generated) - _mesa_hash_table_remove_key(ctx->program_cache, prog->shaders); + if (!prog->base.removed && (shader->nir->info.stage != MESA_SHADER_TESS_CTRL || !shader->is_generated)) { + _mesa_hash_table_remove_key(&ctx->program_cache[prog->stages_present >> 2], prog->shaders); + prog->base.removed = true; + } prog->shaders[pstage] = NULL; if (shader->nir->info.stage == MESA_SHADER_TESS_EVAL && shader->generated) /* automatically destroy generated tcs shaders when tes is destroyed */ zink_shader_free(ctx, shader->generated); - zink_gfx_program_reference(screen, &prog, NULL); + zink_gfx_program_reference(ctx, &prog, NULL); } } _mesa_set_destroy(shader->programs, NULL); @@ -1115,14 +1548,14 @@ void main() */ struct zink_shader * -zink_shader_tcs_create(struct zink_context *ctx, struct zink_shader *vs) +zink_shader_tcs_create(struct zink_screen *screen, struct zink_shader *vs, unsigned vertices_per_patch) { - unsigned vertices_per_patch = ctx->gfx_pipeline_state.vertices_per_patch; struct zink_shader *ret = CALLOC_STRUCT(zink_shader); + ret->hash = _mesa_hash_pointer(ret); ret->programs = _mesa_pointer_set_create(NULL); simple_mtx_init(&ret->lock, mtx_plain); - nir_shader *nir = nir_shader_create(NULL, MESA_SHADER_TESS_CTRL, &zink_screen(ctx->base.screen)->nir_options, NULL); + 
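/* zink_shader_tcs_create now takes the screen and vertices_per_patch explicitly rather than digging both out of a zink_context, so generated tcs shaders can be built without a context in hand */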
nir_shader *nir = nir_shader_create(NULL, MESA_SHADER_TESS_CTRL, &screen->nir_options, NULL); nir_function *fn = nir_function_create(nir, "main"); fn->is_entrypoint = true; nir_function_impl *impl = nir_function_impl_create(fn); diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.h b/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.h index 2addb6bc82..49834164fe 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_compiler.h @@ -68,7 +68,9 @@ zink_tgsi_to_nir(struct pipe_screen *screen, const struct tgsi_token *tokens); struct zink_shader { struct util_live_shader base; + uint32_t hash; struct nir_shader *nir; + enum pipe_prim_type reduced_prim; // PIPE_PRIM_MAX for vs struct zink_so_info streamout; @@ -82,6 +84,7 @@ struct zink_shader { unsigned num_texel_buffers; uint32_t ubos_used; // bitfield of which ubo indices are used uint32_t ssbos_used; // bitfield of which ssbo indices are used + bool bindless; simple_mtx_t lock; struct set *programs; @@ -89,6 +92,7 @@ struct zink_shader { union { struct zink_shader *generated; // a generated shader that this shader "owns" bool is_generated; // if this is a driver-created shader (e.g., tcs) + nir_variable *fbfetch; //for fs output }; }; @@ -97,20 +101,20 @@ zink_screen_init_compiler(struct zink_screen *screen); void zink_compiler_assign_io(nir_shader *producer, nir_shader *consumer); VkShaderModule -zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, struct zink_shader_key *key); +zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key); struct zink_shader * zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, const struct pipe_stream_output_info *so_info); -void +char * zink_shader_finalize(struct pipe_screen *pscreen, void *nirptr); void zink_shader_free(struct zink_context *ctx, struct zink_shader *shader); struct zink_shader * -zink_shader_tcs_create(struct zink_context *ctx, struct zink_shader *vs); +zink_shader_tcs_create(struct zink_screen *screen, struct zink_shader *vs, unsigned vertices_per_patch); static inline bool zink_shader_descriptor_is_buffer(struct zink_shader *zs, enum zink_descriptor_type type, unsigned i) diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_context.c b/mesa 3D driver/src/gallium/drivers/zink/zink_context.c index b279c3563a..7ebd1cc665 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_context.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_context.c @@ -26,6 +26,7 @@ #include "zink_batch.h" #include "zink_compiler.h" #include "zink_fence.h" +#include "zink_format.h" #include "zink_framebuffer.h" #include "zink_helpers.h" #include "zink_program.h" @@ -42,7 +43,6 @@ #include "util/u_debug.h" #include "util/format_srgb.h" #include "util/format/u_format.h" -#include "util/u_framebuffer.h" #include "util/u_helpers.h" #include "util/u_inlines.h" #include "util/u_thread.h" @@ -73,7 +73,7 @@ debug_describe_zink_buffer_view(char *buf, const struct zink_buffer_view *ptr) ALWAYS_INLINE static void check_resource_for_batch_ref(struct zink_context *ctx, struct zink_resource *res) { - if (!res->bind_count[0] && !res->bind_count[1]) + if (!zink_resource_has_binds(res)) zink_batch_reference_resource(&ctx->batch, res); } @@ -83,34 +83,53 @@ zink_context_destroy(struct pipe_context *pctx) struct zink_context *ctx = zink_context(pctx); struct zink_screen *screen = zink_screen(pctx->screen); - if 
(screen->queue && !screen->device_lost && vkQueueWaitIdle(screen->queue) != VK_SUCCESS) + if (util_queue_is_initialized(&screen->flush_queue)) + util_queue_finish(&screen->flush_queue); + if (screen->queue && !screen->device_lost && VKSCR(QueueWaitIdle)(screen->queue) != VK_SUCCESS) debug_printf("vkQueueWaitIdle failed\n"); util_blitter_destroy(ctx->blitter); for (unsigned i = 0; i < ctx->fb_state.nr_cbufs; i++) - zink_surface_reference(screen, (struct zink_surface**)&ctx->fb_state.cbufs[i], NULL); - zink_surface_reference(screen, (struct zink_surface**)&ctx->fb_state.zsbuf, NULL); + pipe_surface_release(&ctx->base, &ctx->fb_state.cbufs[i]); + pipe_surface_release(&ctx->base, &ctx->fb_state.zsbuf); pipe_resource_reference(&ctx->dummy_vertex_buffer, NULL); pipe_resource_reference(&ctx->dummy_xfb_buffer, NULL); - zink_surface_reference(screen, (struct zink_surface**)&ctx->dummy_surface, NULL); + for (unsigned i = 0; i < ARRAY_SIZE(ctx->dummy_surface); i++) + pipe_surface_release(&ctx->base, &ctx->dummy_surface[i]); zink_buffer_view_reference(screen, &ctx->dummy_bufferview, NULL); + zink_descriptors_deinit_bindless(ctx); + simple_mtx_destroy(&ctx->batch_mtx); zink_clear_batch_state(ctx, ctx->batch.state); - zink_batch_state_reference(screen, &ctx->batch.state, NULL); - hash_table_foreach(&ctx->batch_states, entry) { - struct zink_batch_state *bs = entry->data; + zink_batch_state_destroy(screen, ctx->batch.state); + struct zink_batch_state *bs = ctx->batch_states; + while (bs) { + struct zink_batch_state *bs_next = bs->next; zink_clear_batch_state(ctx, bs); - zink_batch_state_reference(screen, &bs, NULL); + zink_batch_state_destroy(screen, bs); + bs = bs_next; } util_dynarray_foreach(&ctx->free_batch_states, struct zink_batch_state*, bs) { zink_clear_batch_state(ctx, *bs); - zink_batch_state_reference(screen, bs, NULL); + zink_batch_state_destroy(screen, *bs); } - if (ctx->framebuffer) { + for (unsigned i = 0; i < 2; i++) { + util_idalloc_fini(&ctx->di.bindless[i].tex_slots); + util_idalloc_fini(&ctx->di.bindless[i].img_slots); + free(ctx->di.bindless[i].buffer_infos); + free(ctx->di.bindless[i].img_infos); + util_dynarray_fini(&ctx->di.bindless[i].updates); + util_dynarray_fini(&ctx->di.bindless[i].resident); + } + + if (screen->info.have_KHR_imageless_framebuffer) { + hash_table_foreach(&ctx->framebuffer_cache, he) + zink_destroy_framebuffer(screen, he->data); + } else if (ctx->framebuffer) { simple_mtx_lock(&screen->framebuffer_mtx); struct hash_entry *entry = _mesa_hash_table_search(&screen->framebuffer_cache, &ctx->framebuffer->state); if (zink_framebuffer_reference(screen, &ctx->framebuffer, NULL)) @@ -124,8 +143,9 @@ zink_context_destroy(struct pipe_context *pctx) u_upload_destroy(pctx->stream_uploader); u_upload_destroy(pctx->const_uploader); slab_destroy_child(&ctx->transfer_pool); - _mesa_hash_table_destroy(ctx->program_cache, NULL); - _mesa_hash_table_destroy(ctx->compute_program_cache, NULL); + for (unsigned i = 0; i < ARRAY_SIZE(ctx->program_cache); i++) + _mesa_hash_table_clear(&ctx->program_cache[i], NULL); + _mesa_hash_table_clear(&ctx->compute_program_cache, NULL); _mesa_hash_table_destroy(ctx->render_pass_cache, NULL); slab_destroy_child(&ctx->transfer_pool_unsync); @@ -356,7 +376,7 @@ zink_create_sampler_state(struct pipe_context *pctx, if (!sampler) return NULL; - if (vkCreateSampler(screen->dev, &sci, NULL, &sampler->sampler) != VK_SUCCESS) { + if (VKSCR(CreateSampler)(screen->dev, &sci, NULL, &sampler->sampler) != VK_SUCCESS) { FREE(sampler); return NULL; } @@ -430,13 
+450,12 @@ get_bufferview_for_binding(struct zink_context *ctx, enum pipe_shader_type stage return VK_NULL_HANDLE; } -ALWAYS_INLINE static void -update_descriptor_state_ubo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot) +ALWAYS_INLINE static struct zink_resource * +update_descriptor_state_ubo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot, struct zink_resource *res) { struct zink_screen *screen = zink_screen(ctx->base.screen); bool have_null_descriptors = screen->info.rb2_feats.nullDescriptor; const enum zink_descriptor_type type = ZINK_DESCRIPTOR_TYPE_UBO; - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, type, shader, slot); ctx->di.descriptor_res[type][shader][slot] = res; ctx->di.ubos[shader][slot].offset = ctx->ubos[shader][slot].buffer_offset; if (res) { @@ -454,15 +473,15 @@ update_descriptor_state_ubo(struct zink_context *ctx, enum pipe_shader_type shad else ctx->di.push_valid &= ~BITFIELD64_BIT(shader); } + return res; } -ALWAYS_INLINE static void -update_descriptor_state_ssbo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot) +ALWAYS_INLINE static struct zink_resource * +update_descriptor_state_ssbo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot, struct zink_resource *res) { struct zink_screen *screen = zink_screen(ctx->base.screen); bool have_null_descriptors = screen->info.rb2_feats.nullDescriptor; const enum zink_descriptor_type type = ZINK_DESCRIPTOR_TYPE_SSBO; - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, type, shader, slot); ctx->di.descriptor_res[type][shader][slot] = res; ctx->di.ssbos[shader][slot].offset = ctx->ssbos[shader][slot].buffer_offset; if (res) { @@ -473,15 +492,15 @@ update_descriptor_state_ssbo(struct zink_context *ctx, enum pipe_shader_type sha ctx->di.ssbos[shader][slot].buffer = have_null_descriptors ? 
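/* with VK_EXT_robustness2's nullDescriptor feature an unbound slot can legally be written as VK_NULL_HANDLE; without it, the driver's dummy null_buffer is substituted so the descriptor set remains valid */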
VK_NULL_HANDLE : null_buffer; ctx->di.ssbos[shader][slot].range = VK_WHOLE_SIZE; } + return res; } -ALWAYS_INLINE static void -update_descriptor_state_sampler(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot) +ALWAYS_INLINE static struct zink_resource * +update_descriptor_state_sampler(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot, struct zink_resource *res) { struct zink_screen *screen = zink_screen(ctx->base.screen); bool have_null_descriptors = screen->info.rb2_feats.nullDescriptor; const enum zink_descriptor_type type = ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW; - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, type, shader, slot); ctx->di.descriptor_res[type][shader][slot] = res; if (res) { if (res->obj->is_buffer) { @@ -502,7 +521,7 @@ update_descriptor_state_sampler(struct zink_context *ctx, enum pipe_shader_type ctx->di.textures[shader][slot].imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; ctx->di.tbos[shader][slot] = VK_NULL_HANDLE; } else { - struct zink_surface *null_surface = zink_surface(ctx->dummy_surface); + struct zink_surface *null_surface = zink_csurface(ctx->dummy_surface[0]); struct zink_buffer_view *null_bufferview = ctx->dummy_bufferview; ctx->di.textures[shader][slot].imageView = null_surface->image_view; ctx->di.textures[shader][slot].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; @@ -510,15 +529,15 @@ update_descriptor_state_sampler(struct zink_context *ctx, enum pipe_shader_type } memset(&ctx->di.sampler_surfaces[shader][slot], 0, sizeof(ctx->di.sampler_surfaces[shader][slot])); } + return res; } -ALWAYS_INLINE static void -update_descriptor_state_image(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot) +ALWAYS_INLINE static struct zink_resource * +update_descriptor_state_image(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot, struct zink_resource *res) { struct zink_screen *screen = zink_screen(ctx->base.screen); bool have_null_descriptors = screen->info.rb2_feats.nullDescriptor; const enum zink_descriptor_type type = ZINK_DESCRIPTOR_TYPE_IMAGE; - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, type, shader, slot); ctx->di.descriptor_res[type][shader][slot] = res; if (res) { if (res->obj->is_buffer) { @@ -538,7 +557,7 @@ update_descriptor_state_image(struct zink_context *ctx, enum pipe_shader_type sh memset(&ctx->di.images[shader][slot], 0, sizeof(ctx->di.images[shader][slot])); ctx->di.texel_images[shader][slot] = VK_NULL_HANDLE; } else { - struct zink_surface *null_surface = zink_surface(ctx->dummy_surface); + struct zink_surface *null_surface = zink_csurface(ctx->dummy_surface[0]); struct zink_buffer_view *null_bufferview = ctx->dummy_bufferview; ctx->di.images[shader][slot].imageView = null_surface->image_view; ctx->di.images[shader][slot].imageLayout = VK_IMAGE_LAYOUT_GENERAL; @@ -546,6 +565,7 @@ update_descriptor_state_image(struct zink_context *ctx, enum pipe_shader_type sh } memset(&ctx->di.image_surfaces[shader][slot], 0, sizeof(ctx->di.image_surfaces[shader][slot])); } + return res; } static void @@ -582,22 +602,6 @@ zink_delete_sampler_state(struct pipe_context *pctx, FREE(sampler); } -static VkComponentSwizzle -component_mapping(enum pipe_swizzle swizzle) -{ - switch (swizzle) { - case PIPE_SWIZZLE_X: return VK_COMPONENT_SWIZZLE_R; - case PIPE_SWIZZLE_Y: return VK_COMPONENT_SWIZZLE_G; - case PIPE_SWIZZLE_Z: return VK_COMPONENT_SWIZZLE_B; - case PIPE_SWIZZLE_W: return VK_COMPONENT_SWIZZLE_A; - case PIPE_SWIZZLE_0: return 
VK_COMPONENT_SWIZZLE_ZERO; - case PIPE_SWIZZLE_1: return VK_COMPONENT_SWIZZLE_ONE; - case PIPE_SWIZZLE_NONE: return VK_COMPONENT_SWIZZLE_IDENTITY; // ??? - default: - unreachable("unexpected swizzle"); - } -} - static VkImageAspectFlags sampler_aspect_from_format(enum pipe_format fmt) { @@ -618,48 +622,61 @@ hash_bufferview(void *bvci) return _mesa_hash_data((char*)bvci + offset, sizeof(VkBufferViewCreateInfo) - offset); } -static struct zink_buffer_view * -get_buffer_view(struct zink_context *ctx, struct zink_resource *res, enum pipe_format format, uint32_t offset, uint32_t range) +static VkBufferViewCreateInfo +create_bvci(struct zink_context *ctx, struct zink_resource *res, enum pipe_format format, uint32_t offset, uint32_t range) { struct zink_screen *screen = zink_screen(ctx->base.screen); - struct zink_buffer_view *buffer_view = NULL; - VkBufferViewCreateInfo bvci = {0}; + VkBufferViewCreateInfo bvci; + // Zero whole struct (including alignment holes), so hash_bufferview + // does not access potentially uninitialized data. + memset(&bvci, 0, sizeof(bvci)); bvci.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO; + bvci.pNext = NULL; bvci.buffer = res->obj->buffer; bvci.format = zink_get_format(screen, format); assert(bvci.format); bvci.offset = offset; - bvci.range = range; + bvci.range = !offset && range == res->base.b.width0 ? VK_WHOLE_SIZE : range; + bvci.flags = 0; + return bvci; +} - uint32_t hash = hash_bufferview(&bvci); - simple_mtx_lock(&screen->bufferview_mtx); - struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&screen->bufferview_cache, hash, &bvci); +static struct zink_buffer_view * +get_buffer_view(struct zink_context *ctx, struct zink_resource *res, VkBufferViewCreateInfo *bvci) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + struct zink_buffer_view *buffer_view = NULL; + + uint32_t hash = hash_bufferview(bvci); + simple_mtx_lock(&res->bufferview_mtx); + struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&res->bufferview_cache, hash, bvci); if (he) { buffer_view = he->data; p_atomic_inc(&buffer_view->reference.count); } else { VkBufferView view; - if (vkCreateBufferView(screen->dev, &bvci, NULL, &view) != VK_SUCCESS) + if (VKSCR(CreateBufferView)(screen->dev, bvci, NULL, &view) != VK_SUCCESS) goto out; buffer_view = CALLOC_STRUCT(zink_buffer_view); if (!buffer_view) { - vkDestroyBufferView(screen->dev, view, NULL); + VKSCR(DestroyBufferView)(screen->dev, view, NULL); goto out; } pipe_reference_init(&buffer_view->reference, 1); + pipe_resource_reference(&buffer_view->pres, &res->base.b); util_dynarray_init(&buffer_view->desc_set_refs.refs, NULL); - buffer_view->bvci = bvci; + buffer_view->bvci = *bvci; buffer_view->buffer_view = view; buffer_view->hash = hash; - _mesa_hash_table_insert_pre_hashed(&screen->bufferview_cache, hash, &buffer_view->bvci, buffer_view); + _mesa_hash_table_insert_pre_hashed(&res->bufferview_cache, hash, &buffer_view->bvci, buffer_view); } out: - simple_mtx_unlock(&screen->bufferview_mtx); + simple_mtx_unlock(&res->bufferview_mtx); return buffer_view; } -static inline enum pipe_swizzle -clamp_void_swizzle(const struct util_format_description *desc, enum pipe_swizzle swizzle) +enum pipe_swizzle +zink_clamp_void_swizzle(const struct util_format_description *desc, enum pipe_swizzle swizzle) { switch (swizzle) { case PIPE_SWIZZLE_X: @@ -688,35 +705,13 @@ clamp_zs_swizzle(enum pipe_swizzle swizzle) return swizzle; } -static inline bool -format_is_usable_rgba_variant(const struct util_format_description *desc) -{ - 
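/* this helper is not dropped outright: it reappears in the shared format code as zink_format_is_voidable_rgba_variant, which the sampler-view path below now calls - hence the new zink_format.h include at the top of zink_context.c */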
unsigned chan; - - if(desc->block.width != 1 || - desc->block.height != 1 || - (desc->block.bits != 32 && desc->block.bits != 64)) - return false; - - if (desc->nr_channels != 4) - return false; - - unsigned size = desc->channel[0].size; - for(chan = 0; chan < 4; ++chan) { - if(desc->channel[chan].size != size) - return false; - } - - return true; -} - static struct pipe_sampler_view * zink_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *pres, const struct pipe_sampler_view *state) { struct zink_screen *screen = zink_screen(pctx->screen); struct zink_resource *res = zink_resource(pres); - struct zink_sampler_view *sampler_view = CALLOC_STRUCT(zink_sampler_view); + struct zink_sampler_view *sampler_view = CALLOC_STRUCT_CL(zink_sampler_view); bool err; sampler_view->base = *state; @@ -741,36 +736,37 @@ zink_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *pres, ivci.subresourceRange.aspectMask = sampler_aspect_from_format(state->format); /* samplers for stencil aspects of packed formats need to always use stencil swizzle */ if (ivci.subresourceRange.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - ivci.components.r = component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_r)); - ivci.components.g = component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_g)); - ivci.components.b = component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_b)); - ivci.components.a = component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_a)); + ivci.components.r = zink_component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_r)); + ivci.components.g = zink_component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_g)); + ivci.components.b = zink_component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_b)); + ivci.components.a = zink_component_mapping(clamp_zs_swizzle(sampler_view->base.swizzle_a)); } else { /* if we have e.g., R8G8B8X8, then we have to ignore alpha since we're just emulating * these formats */ - const struct util_format_description *desc = util_format_description(state->format); - if (format_is_usable_rgba_variant(desc)) { - sampler_view->base.swizzle_r = clamp_void_swizzle(desc, sampler_view->base.swizzle_r); - sampler_view->base.swizzle_g = clamp_void_swizzle(desc, sampler_view->base.swizzle_g); - sampler_view->base.swizzle_b = clamp_void_swizzle(desc, sampler_view->base.swizzle_b); - sampler_view->base.swizzle_a = clamp_void_swizzle(desc, sampler_view->base.swizzle_a); + if (zink_format_is_voidable_rgba_variant(state->format)) { + const struct util_format_description *desc = util_format_description(state->format); + sampler_view->base.swizzle_r = zink_clamp_void_swizzle(desc, sampler_view->base.swizzle_r); + sampler_view->base.swizzle_g = zink_clamp_void_swizzle(desc, sampler_view->base.swizzle_g); + sampler_view->base.swizzle_b = zink_clamp_void_swizzle(desc, sampler_view->base.swizzle_b); + sampler_view->base.swizzle_a = zink_clamp_void_swizzle(desc, sampler_view->base.swizzle_a); } - ivci.components.r = component_mapping(sampler_view->base.swizzle_r); - ivci.components.g = component_mapping(sampler_view->base.swizzle_g); - ivci.components.b = component_mapping(sampler_view->base.swizzle_b); - ivci.components.a = component_mapping(sampler_view->base.swizzle_a); + ivci.components.r = zink_component_mapping(sampler_view->base.swizzle_r); + ivci.components.g = zink_component_mapping(sampler_view->base.swizzle_g); + ivci.components.b = zink_component_mapping(sampler_view->base.swizzle_b); + 
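/* for RGBX-style formats emulated on an RGBA vulkan format, swizzles that read the void alpha channel are clamped by zink_clamp_void_swizzle first, presumably to a constant one/zero, so sampling never observes whatever the unused channel's memory happens to hold */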
ivci.components.a = zink_component_mapping(sampler_view->base.swizzle_a); } assert(ivci.format); sampler_view->image_view = (struct zink_surface*)zink_get_surface(zink_context(pctx), pres, &templ, &ivci); err = !sampler_view->image_view; } else { - sampler_view->buffer_view = get_buffer_view(zink_context(pctx), res, state->format, state->u.buf.offset, state->u.buf.size); + VkBufferViewCreateInfo bvci = create_bvci(zink_context(pctx), res, state->format, state->u.buf.offset, state->u.buf.size); + sampler_view->buffer_view = get_buffer_view(zink_context(pctx), res, &bvci); err = !sampler_view->buffer_view; } if (err) { - FREE(sampler_view); + FREE_CL(sampler_view); return NULL; } return &sampler_view->base; @@ -779,12 +775,19 @@ zink_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *pres, void zink_destroy_buffer_view(struct zink_screen *screen, struct zink_buffer_view *buffer_view) { - simple_mtx_lock(&screen->bufferview_mtx); - struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&screen->bufferview_cache, buffer_view->hash, &buffer_view->bvci); + struct zink_resource *res = zink_resource(buffer_view->pres); + simple_mtx_lock(&res->bufferview_mtx); + if (buffer_view->reference.count) { + /* got a cache hit during deletion */ + simple_mtx_unlock(&res->bufferview_mtx); + return; + } + struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&res->bufferview_cache, buffer_view->hash, &buffer_view->bvci); assert(he); - _mesa_hash_table_remove(&screen->bufferview_cache, he); - simple_mtx_unlock(&screen->bufferview_mtx); - vkDestroyBufferView(screen->dev, buffer_view->buffer_view, NULL); + _mesa_hash_table_remove(&res->bufferview_cache, he); + simple_mtx_unlock(&res->bufferview_mtx); + pipe_resource_reference(&buffer_view->pres, NULL); + VKSCR(DestroyBufferView)(screen->dev, buffer_view->buffer_view, NULL); zink_descriptor_set_refs_clear(&buffer_view->desc_set_refs, buffer_view); FREE(buffer_view); } @@ -800,7 +803,7 @@ zink_sampler_view_destroy(struct pipe_context *pctx, zink_surface_reference(zink_screen(pctx->screen), &view->image_view, NULL); } pipe_resource_reference(&pview->texture, NULL); - FREE(view); + FREE_CL(view); } static void @@ -905,7 +908,7 @@ update_existing_vbo(struct zink_context *ctx, unsigned slot) update_res_bind_count(ctx, res, false, true); } -ALWAYS_INLINE static void +ALWAYS_INLINE static struct zink_resource * set_vertex_buffer_clamped(struct zink_context *ctx, unsigned slot) { const struct pipe_vertex_buffer *ctx_vb = &ctx->vertex_buffers[slot]; @@ -927,6 +930,7 @@ set_vertex_buffer_clamped(struct zink_context *ctx, unsigned slot) ctx->vbuf_offsets[slot] = ctx_vb->buffer_offset; } assert(ctx->vbufs[slot]); + return res; } static void @@ -938,13 +942,14 @@ zink_set_vertex_buffers(struct pipe_context *pctx, const struct pipe_vertex_buffer *buffers) { struct zink_context *ctx = zink_context(pctx); - + const bool need_state_change = !zink_screen(pctx->screen)->info.have_EXT_extended_dynamic_state && + !zink_screen(pctx->screen)->info.have_EXT_vertex_input_dynamic_state; uint32_t enabled_buffers = ctx->gfx_pipeline_state.vertex_buffers_enabled_mask; enabled_buffers |= u_bit_consecutive(start_slot, num_buffers); enabled_buffers &= ~u_bit_consecutive(start_slot + num_buffers, unbind_num_trailing_slots); if (buffers) { - if (!zink_screen(pctx->screen)->info.have_EXT_extended_dynamic_state) + if (need_state_change) ctx->vertex_state_changed = true; for (unsigned i = 0; i < num_buffers; ++i) { const struct pipe_vertex_buffer *vb = buffers + i; @@ 
-963,14 +968,14 @@ zink_set_vertex_buffers(struct pipe_context *pctx, ctx_vb->stride = vb->stride; ctx_vb->buffer_offset = vb->buffer_offset; /* always barrier before possible rebind */ - zink_resource_buffer_barrier(ctx, NULL, res, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + zink_resource_buffer_barrier(ctx, res, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT); set_vertex_buffer_clamped(ctx, start_slot + i); - zink_batch_resource_usage_set(&ctx->batch, res, false); - } + } else + enabled_buffers &= ~BITFIELD_BIT(i); } } else { - if (!zink_screen(pctx->screen)->info.have_EXT_extended_dynamic_state) + if (need_state_change) ctx->vertex_state_changed = true; for (unsigned i = 0; i < num_buffers; ++i) { update_existing_vbo(ctx, start_slot + i); @@ -983,6 +988,10 @@ zink_set_vertex_buffers(struct pipe_context *pctx, } ctx->gfx_pipeline_state.vertex_buffers_enabled_mask = enabled_buffers; ctx->vertex_buffers_dirty = num_buffers > 0; +#ifndef NDEBUG + u_foreach_bit(b, enabled_buffers) + assert(ctx->vertex_buffers[b].buffer.resource); +#endif } static void @@ -998,9 +1007,9 @@ zink_set_viewport_states(struct pipe_context *pctx, ctx->vp_state.num_viewports = start_slot + num_viewports; if (!zink_screen(pctx->screen)->info.have_EXT_extended_dynamic_state) { - if (ctx->gfx_pipeline_state.num_viewports != ctx->vp_state.num_viewports) + if (ctx->gfx_pipeline_state.dyn_state1.num_viewports != ctx->vp_state.num_viewports) ctx->gfx_pipeline_state.dirty = true; - ctx->gfx_pipeline_state.num_viewports = ctx->vp_state.num_viewports; + ctx->gfx_pipeline_state.dyn_state1.num_viewports = ctx->vp_state.num_viewports; } ctx->vp_state_changed = true; } @@ -1023,10 +1032,24 @@ zink_set_inlinable_constants(struct pipe_context *pctx, uint num_values, uint32_t *values) { struct zink_context *ctx = (struct zink_context *)pctx; + const uint32_t bit = BITFIELD_BIT(shader); + uint32_t *inlinable_uniforms; + struct zink_shader_key *key = NULL; - memcpy(ctx->inlinable_uniforms[shader], values, num_values * 4); - ctx->dirty_shader_stages |= 1 << shader; - ctx->inlinable_uniforms_valid_mask |= 1 << shader; + if (shader == PIPE_SHADER_COMPUTE) { + inlinable_uniforms = ctx->compute_inlinable_uniforms; + } else { + key = &ctx->gfx_pipeline_state.shader_keys.key[shader]; + inlinable_uniforms = key->base.inlined_uniform_values; + } + if (!(ctx->inlinable_uniforms_valid_mask & bit) || + memcmp(inlinable_uniforms, values, num_values * 4)) { + memcpy(inlinable_uniforms, values, num_values * 4); + ctx->dirty_shader_stages |= bit; + ctx->inlinable_uniforms_valid_mask |= bit; + if (key) + key->inline_uniforms = true; + } } ALWAYS_INLINE static void @@ -1039,6 +1062,21 @@ unbind_ubo(struct zink_context *ctx, struct zink_resource *res, enum pipe_shader update_res_bind_count(ctx, res, pstage == PIPE_SHADER_COMPUTE, true); } +static void +invalidate_inlined_uniforms(struct zink_context *ctx, enum pipe_shader_type pstage) +{ + unsigned bit = BITFIELD_BIT(pstage); + if (!(ctx->inlinable_uniforms_valid_mask & bit)) + return; + ctx->inlinable_uniforms_valid_mask &= ~bit; + ctx->dirty_shader_stages |= bit; + if (pstage == PIPE_SHADER_COMPUTE) + return; + + struct zink_shader_key *key = &ctx->gfx_pipeline_state.shader_keys.key[pstage]; + key->inline_uniforms = false; +} + static void zink_set_constant_buffer(struct pipe_context *pctx, enum pipe_shader_type shader, uint index, @@ -1062,8 +1100,6 @@ zink_set_constant_buffer(struct pipe_context *pctx, if (new_res) { if (new_res != res) { unbind_ubo(ctx, res, shader, index); - 
new_res->bind_history |= BITFIELD_BIT(ZINK_DESCRIPTOR_TYPE_UBO); - new_res->bind_stages |= 1 << shader; new_res->ubo_bind_count[shader == PIPE_SHADER_COMPUTE]++; new_res->ubo_bind_mask[shader] |= BITFIELD_BIT(index); update_res_bind_count(ctx, new_res, shader == PIPE_SHADER_COMPUTE, false); @@ -1091,23 +1127,25 @@ zink_set_constant_buffer(struct pipe_context *pctx, if (index + 1 >= ctx->di.num_ubos[shader]) ctx->di.num_ubos[shader] = index + 1; + update_descriptor_state_ubo(ctx, shader, index, new_res); } else { - if (res) - unbind_ubo(ctx, res, shader, index); - update = !!ctx->ubos[shader][index].buffer; - - pipe_resource_reference(&ctx->ubos[shader][index].buffer, NULL); ctx->ubos[shader][index].buffer_offset = 0; ctx->ubos[shader][index].buffer_size = 0; ctx->ubos[shader][index].user_buffer = NULL; + if (res) { + unbind_ubo(ctx, res, shader, index); + update_descriptor_state_ubo(ctx, shader, index, NULL); + } + update = !!ctx->ubos[shader][index].buffer; + + pipe_resource_reference(&ctx->ubos[shader][index].buffer, NULL); if (ctx->di.num_ubos[shader] == index + 1) ctx->di.num_ubos[shader]--; } if (index == 0) { /* Invalidate current inlinable uniforms. */ - ctx->inlinable_uniforms_valid_mask &= ~(1 << shader); + invalidate_inlined_uniforms(ctx, shader); } - update_descriptor_state_ubo(ctx, shader, index); if (update) zink_screen(pctx->screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_UBO, index, 1); @@ -1148,8 +1186,6 @@ zink_set_shader_buffers(struct pipe_context *pctx, struct zink_resource *new_res = zink_resource(buffers[i].buffer); if (new_res != res) { unbind_ssbo(ctx, res, p_stage, i, was_writable); - new_res->bind_history |= BITFIELD_BIT(ZINK_DESCRIPTOR_TYPE_SSBO); - new_res->bind_stages |= 1 << p_stage; new_res->ssbo_bind_mask[p_stage] |= BITFIELD_BIT(i); update_res_bind_count(ctx, new_res, p_stage == PIPE_SHADER_COMPUTE, false); } @@ -1168,15 +1204,17 @@ zink_set_shader_buffers(struct pipe_context *pctx, zink_pipeline_flags_from_pipe_stage(p_stage)); update = true; max_slot = MAX2(max_slot, start_slot + i); + update_descriptor_state_ssbo(ctx, p_stage, start_slot + i, new_res); } else { update = !!res; - if (res) - unbind_ssbo(ctx, res, p_stage, i, was_writable); - pipe_resource_reference(&ssbo->buffer, NULL); ssbo->buffer_offset = 0; ssbo->buffer_size = 0; + if (res) { + unbind_ssbo(ctx, res, p_stage, i, was_writable); + update_descriptor_state_ssbo(ctx, p_stage, start_slot + i, NULL); + } + pipe_resource_reference(&ssbo->buffer, NULL); } - update_descriptor_state_ssbo(ctx, p_stage, start_slot + i); } if (start_slot + count >= ctx->di.num_ssbos[p_stage]) ctx->di.num_ssbos[p_stage] = max_slot + 1; @@ -1191,7 +1229,7 @@ update_binds_for_samplerviews(struct zink_context *ctx, struct zink_resource *re if (is_compute) { u_foreach_bit(slot, res->sampler_binds[PIPE_SHADER_COMPUTE]) { if (ctx->di.textures[PIPE_SHADER_COMPUTE][slot].imageLayout != layout) { - update_descriptor_state_sampler(ctx, PIPE_SHADER_COMPUTE, slot); + update_descriptor_state_sampler(ctx, PIPE_SHADER_COMPUTE, slot, res); zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, PIPE_SHADER_COMPUTE, ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW, slot, 1); } } @@ -1199,7 +1237,7 @@ update_binds_for_samplerviews(struct zink_context *ctx, struct zink_resource *re for (unsigned i = 0; i < ZINK_SHADER_COUNT; i++) { u_foreach_bit(slot, res->sampler_binds[i]) { if (ctx->di.textures[i][slot].imageLayout != layout) { - update_descriptor_state_sampler(ctx, i, slot); + 
update_descriptor_state_sampler(ctx, i, slot, res); zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, i, ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW, slot, 1); } } @@ -1231,9 +1269,9 @@ check_for_layout_update(struct zink_context *ctx, struct zink_resource *res, boo { VkImageLayout layout = res->bind_count[is_compute] ? zink_descriptor_util_image_layout_eval(res, is_compute) : VK_IMAGE_LAYOUT_UNDEFINED; VkImageLayout other_layout = res->bind_count[!is_compute] ? zink_descriptor_util_image_layout_eval(res, !is_compute) : VK_IMAGE_LAYOUT_UNDEFINED; - if (res->bind_count[is_compute] && res->layout != layout) + if (res->bind_count[is_compute] && layout && res->layout != layout) _mesa_set_add(ctx->need_barriers[is_compute], res); - if (res->bind_count[!is_compute] && (layout != other_layout || res->layout != other_layout)) + if (res->bind_count[!is_compute] && other_layout && (layout != other_layout || res->layout != other_layout)) _mesa_set_add(ctx->need_barriers[!is_compute], res); } @@ -1264,6 +1302,50 @@ unbind_shader_image(struct zink_context *ctx, enum pipe_shader_type stage, unsig image_view->surface = NULL; } +static struct zink_buffer_view * +create_image_bufferview(struct zink_context *ctx, const struct pipe_image_view *view) +{ + struct zink_resource *res = zink_resource(view->resource); + VkBufferViewCreateInfo bvci = create_bvci(ctx, res, view->format, view->u.buf.offset, view->u.buf.size); + struct zink_buffer_view *buffer_view = get_buffer_view(ctx, res, &bvci); + if (!buffer_view) + return NULL; + util_range_add(&res->base.b, &res->valid_buffer_range, view->u.buf.offset, + view->u.buf.offset + view->u.buf.size); + return buffer_view; +} + +static void +finalize_image_bind(struct zink_context *ctx, struct zink_resource *res, bool is_compute) +{ + /* if this is the first image bind and there are sampler binds, the image's sampler layout + * must be updated to GENERAL + */ + if (res->image_bind_count[is_compute] == 1 && + res->bind_count[is_compute] > 1) + update_binds_for_samplerviews(ctx, res, is_compute); + check_for_layout_update(ctx, res, is_compute); +} + +static struct zink_surface * +create_image_surface(struct zink_context *ctx, const struct pipe_image_view *view, bool is_compute) +{ + struct zink_resource *res = zink_resource(view->resource); + struct pipe_surface tmpl = {0}; + tmpl.format = view->format; + tmpl.u.tex.level = view->u.tex.level; + tmpl.u.tex.first_layer = view->u.tex.first_layer; + tmpl.u.tex.last_layer = view->u.tex.last_layer; + struct pipe_surface *psurf = ctx->base.create_surface(&ctx->base, &res->base.b, &tmpl); + if (!psurf) + return NULL; + /* this is actually a zink_ctx_surface, but we just want the inner surface */ + struct zink_surface *surface = zink_csurface(psurf); + FREE(psurf); + flush_pending_clears(ctx, res); + return surface; +} + static void zink_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type p_stage, @@ -1288,8 +1370,6 @@ zink_set_shader_images(struct pipe_context *pctx, if (!old_res->obj->is_buffer && !old_res->image_bind_count[p_stage == PIPE_SHADER_COMPUTE]) check_for_layout_update(ctx, old_res, p_stage == PIPE_SHADER_COMPUTE); } - res->bind_history |= BITFIELD_BIT(ZINK_DESCRIPTOR_TYPE_IMAGE); - res->bind_stages |= 1 << p_stage; update_res_bind_count(ctx, res, p_stage == PIPE_SHADER_COMPUTE, false); } util_copy_image_view(&image_view->base, images + i); @@ -1303,46 +1383,32 @@ zink_set_shader_images(struct pipe_context *pctx, } res->image_bind_count[p_stage == PIPE_SHADER_COMPUTE]++; if 
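/* two bind paths from here: PIPE_BUFFER images get a zink_buffer_view via create_image_bufferview (which also extends the resource's valid_buffer_range), while textures get a transient zink_surface from create_image_surface plus the layout fixups in finalize_image_bind */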
(images[i].resource->target == PIPE_BUFFER) { - image_view->buffer_view = get_buffer_view(ctx, res, images[i].format, images[i].u.buf.offset, images[i].u.buf.size); + image_view->buffer_view = create_image_bufferview(ctx, &images[i]); assert(image_view->buffer_view); - util_range_add(&res->base.b, &res->valid_buffer_range, images[i].u.buf.offset, - images[i].u.buf.offset + images[i].u.buf.size); zink_batch_usage_set(&image_view->buffer_view->batch_uses, ctx->batch.state); zink_fake_buffer_barrier(res, access, zink_pipeline_flags_from_pipe_stage(p_stage)); } else { - struct pipe_surface tmpl = {0}; - tmpl.format = images[i].format; - tmpl.nr_samples = 1; - tmpl.u.tex.level = images[i].u.tex.level; - tmpl.u.tex.first_layer = images[i].u.tex.first_layer; - tmpl.u.tex.last_layer = images[i].u.tex.last_layer; - image_view->surface = zink_surface(pctx->create_surface(pctx, &res->base.b, &tmpl)); + image_view->surface = create_image_surface(ctx, &images[i], p_stage == PIPE_SHADER_COMPUTE); assert(image_view->surface); - /* if this is the first image bind and there are sampler binds, the image's sampler layout - * must be updated to GENERAL - */ - if (res->image_bind_count[p_stage == PIPE_SHADER_COMPUTE] == 1 && - res->bind_count[p_stage == PIPE_SHADER_COMPUTE] > 1) - update_binds_for_samplerviews(ctx, res, p_stage == PIPE_SHADER_COMPUTE); - check_for_layout_update(ctx, res, p_stage == PIPE_SHADER_COMPUTE); + finalize_image_bind(ctx, res, p_stage == PIPE_SHADER_COMPUTE); zink_batch_usage_set(&image_view->surface->batch_uses, ctx->batch.state); - flush_pending_clears(ctx, res); } zink_batch_resource_usage_set(&ctx->batch, zink_resource(image_view->base.resource), zink_resource_access_is_write(access)); update = true; + update_descriptor_state_image(ctx, p_stage, start_slot + i, res); } else if (image_view->base.resource) { update |= !!image_view->base.resource; unbind_shader_image(ctx, p_stage, start_slot + i); + update_descriptor_state_image(ctx, p_stage, start_slot + i, NULL); } - update_descriptor_state_image(ctx, p_stage, start_slot + i); } for (unsigned i = 0; i < unbind_num_trailing_slots; i++) { update |= !!ctx->image_views[p_stage][start_slot + count + i].base.resource; unbind_shader_image(ctx, p_stage, start_slot + count + i); - update_descriptor_state_image(ctx, p_stage, start_slot + count + i); + update_descriptor_state_image(ctx, p_stage, start_slot + count + i, NULL); } ctx->di.num_images[p_stage] = start_slot + count; if (update) @@ -1376,6 +1442,7 @@ zink_set_sampler_views(struct pipe_context *pctx, unsigned start_slot, unsigned num_views, unsigned unbind_num_trailing_slots, + bool take_ownership, struct pipe_sampler_view **views) { struct zink_context *ctx = zink_context(pctx); @@ -1386,33 +1453,30 @@ zink_set_sampler_views(struct pipe_context *pctx, struct pipe_sampler_view *pview = views ? views[i] : NULL; struct zink_sampler_view *a = zink_sampler_view(ctx->sampler_views[shader_type][start_slot + i]); struct zink_sampler_view *b = zink_sampler_view(pview); + struct zink_resource *res = b ? 
zink_resource(b->base.texture) : NULL; if (b && b->base.texture) { - struct zink_resource *res = zink_resource(b->base.texture); if (!a || zink_resource(a->base.texture) != res) { if (a) unbind_samplerview(ctx, shader_type, start_slot + i); update_res_bind_count(ctx, res, shader_type == PIPE_SHADER_COMPUTE, false); - res->bind_history |= BITFIELD64_BIT(ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW); - res->bind_stages |= 1 << shader_type; } else if (a != b) { check_samplerview_for_batch_ref(ctx, a); } if (res->base.b.target == PIPE_BUFFER) { - if (res->bind_history & BITFIELD64_BIT(ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW)) { + if (b->buffer_view->bvci.buffer != res->obj->buffer) { /* if this resource has been rebound while it wasn't set here, * its backing resource will have changed and thus we need to update * the bufferview */ - struct zink_buffer_view *buffer_view = get_buffer_view(ctx, res, b->base.format, b->base.u.buf.offset, b->base.u.buf.size); - if (buffer_view == b->buffer_view) - p_atomic_dec(&buffer_view->reference.count); - else { - if (zink_batch_usage_exists(b->buffer_view->batch_uses)) - zink_batch_reference_bufferview(&ctx->batch, b->buffer_view); - zink_buffer_view_reference(zink_screen(ctx->base.screen), &b->buffer_view, NULL); - b->buffer_view = buffer_view; - update = true; - } + VkBufferViewCreateInfo bvci = b->buffer_view->bvci; + bvci.buffer = res->obj->buffer; + struct zink_buffer_view *buffer_view = get_buffer_view(ctx, res, &bvci); + assert(buffer_view != b->buffer_view); + if (zink_batch_usage_exists(b->buffer_view->batch_uses)) + zink_batch_reference_bufferview(&ctx->batch, b->buffer_view); + zink_buffer_view_reference(zink_screen(ctx->base.screen), &b->buffer_view, NULL); + b->buffer_view = buffer_view; + update = true; } zink_batch_usage_set(&b->buffer_view->batch_uses, ctx->batch.state); zink_fake_buffer_barrier(res, VK_ACCESS_SHADER_READ_BIT, @@ -1440,8 +1504,13 @@ zink_set_sampler_views(struct pipe_context *pctx, unbind_samplerview(ctx, shader_type, start_slot + i); update = true; } - pipe_sampler_view_reference(&ctx->sampler_views[shader_type][start_slot + i], pview); - update_descriptor_state_sampler(ctx, shader_type, start_slot + i); + if (take_ownership) { + pipe_sampler_view_reference(&ctx->sampler_views[shader_type][start_slot + i], NULL); + ctx->sampler_views[shader_type][start_slot + i] = pview; + } else { + pipe_sampler_view_reference(&ctx->sampler_views[shader_type][start_slot + i], pview); + } + update_descriptor_state_sampler(ctx, shader_type, start_slot + i, res); } for (; i < num_views + unbind_num_trailing_slots; ++i) { update |= !!ctx->sampler_views[shader_type][start_slot + i]; @@ -1449,13 +1518,280 @@ zink_set_sampler_views(struct pipe_context *pctx, pipe_sampler_view_reference( &ctx->sampler_views[shader_type][start_slot + i], NULL); - update_descriptor_state_sampler(ctx, shader_type, start_slot + i); + update_descriptor_state_sampler(ctx, shader_type, start_slot + i, NULL); } ctx->di.num_sampler_views[shader_type] = start_slot + num_views; if (update) zink_screen(pctx->screen)->context_invalidate_descriptor_state(ctx, shader_type, ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW, start_slot, num_views); } +static uint64_t +zink_create_texture_handle(struct pipe_context *pctx, struct pipe_sampler_view *view, const struct pipe_sampler_state *state) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_resource *res = zink_resource(view->texture); + struct zink_sampler_view *sv = zink_sampler_view(view); + struct zink_bindless_descriptor *bd; + bd = calloc(1, 
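/* bindless handles are partitioned by range: slots come from a util_idalloc, and buffer handles have ZINK_MAX_BINDLESS_HANDLES added so the type can be recovered later, presumably via something like
 *
 *   #define ZINK_BINDLESS_IS_BUFFER(h) ((h) >= ZINK_MAX_BINDLESS_HANDLES)
 */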
sizeof(struct zink_bindless_descriptor)); + if (!bd) + return 0; + + bd->sampler = pctx->create_sampler_state(pctx, state); + if (!bd->sampler) { + free(bd); + return 0; + } + + bd->ds.is_buffer = res->base.b.target == PIPE_BUFFER; + if (res->base.b.target == PIPE_BUFFER) + zink_buffer_view_reference(zink_screen(pctx->screen), &bd->ds.bufferview, sv->buffer_view); + else + zink_surface_reference(zink_screen(pctx->screen), &bd->ds.surface, sv->image_view); + uint64_t handle = util_idalloc_alloc(&ctx->di.bindless[bd->ds.is_buffer].tex_slots); + if (bd->ds.is_buffer) + handle += ZINK_MAX_BINDLESS_HANDLES; + bd->handle = handle; + _mesa_hash_table_insert(&ctx->di.bindless[bd->ds.is_buffer].tex_handles, (void*)(uintptr_t)handle, bd); + return handle; +} + +static void +zink_delete_texture_handle(struct pipe_context *pctx, uint64_t handle) +{ + struct zink_context *ctx = zink_context(pctx); + bool is_buffer = ZINK_BINDLESS_IS_BUFFER(handle); + struct hash_entry *he = _mesa_hash_table_search(&ctx->di.bindless[is_buffer].tex_handles, (void*)(uintptr_t)handle); + assert(he); + struct zink_bindless_descriptor *bd = he->data; + struct zink_descriptor_surface *ds = &bd->ds; + _mesa_hash_table_remove(&ctx->di.bindless[is_buffer].tex_handles, he); + uint32_t h = handle; + util_dynarray_append(&ctx->batch.state->bindless_releases[0], uint32_t, h); + + struct zink_resource *res = zink_descriptor_surface_resource(ds); + if (ds->is_buffer) { + if (zink_resource_has_usage(res)) + zink_batch_reference_bufferview(&ctx->batch, ds->bufferview); + zink_buffer_view_reference(zink_screen(pctx->screen), &ds->bufferview, NULL); + } else { + if (zink_resource_has_usage(res)) + zink_batch_reference_surface(&ctx->batch, ds->surface); + zink_surface_reference(zink_screen(pctx->screen), &ds->surface, NULL); + pctx->delete_sampler_state(pctx, bd->sampler); + } + free(ds); +} + +static void +rebind_bindless_bufferview(struct zink_context *ctx, struct zink_resource *res, struct zink_descriptor_surface *ds) +{ + /* if this resource has been rebound while it wasn't set here, + * its backing resource will have changed and thus we need to update + * the bufferview + */ + VkBufferViewCreateInfo bvci = ds->bufferview->bvci; + bvci.buffer = res->obj->buffer; + struct zink_buffer_view *buffer_view = get_buffer_view(ctx, res, &bvci); + assert(buffer_view != ds->bufferview); + if (zink_resource_has_usage(res)) + zink_batch_reference_bufferview(&ctx->batch, ds->bufferview); + zink_buffer_view_reference(zink_screen(ctx->base.screen), &ds->bufferview, NULL); + ds->bufferview = buffer_view; +} + +static void +zero_bindless_descriptor(struct zink_context *ctx, uint32_t handle, bool is_buffer, bool is_image) +{ + if (likely(zink_screen(ctx->base.screen)->info.rb2_feats.nullDescriptor)) { + if (is_buffer) { + VkBufferView *bv = &ctx->di.bindless[is_image].buffer_infos[handle]; + *bv = VK_NULL_HANDLE; + } else { + VkDescriptorImageInfo *ii = &ctx->di.bindless[is_image].img_infos[handle]; + memset(ii, 0, sizeof(*ii)); + } + } else { + if (is_buffer) { + VkBufferView *bv = &ctx->di.bindless[is_image].buffer_infos[handle]; + struct zink_buffer_view *null_bufferview = ctx->dummy_bufferview; + *bv = null_bufferview->buffer_view; + } else { + struct zink_surface *null_surface = zink_csurface(ctx->dummy_surface[is_image]); + VkDescriptorImageInfo *ii = &ctx->di.bindless[is_image].img_infos[handle]; + ii->sampler = VK_NULL_HANDLE; + ii->imageView = null_surface->image_view; + ii->imageLayout = VK_IMAGE_LAYOUT_GENERAL; + } + } +} + +static void 
+zink_make_texture_handle_resident(struct pipe_context *pctx, uint64_t handle, bool resident) +{ + struct zink_context *ctx = zink_context(pctx); + bool is_buffer = ZINK_BINDLESS_IS_BUFFER(handle); + struct hash_entry *he = _mesa_hash_table_search(&ctx->di.bindless[is_buffer].tex_handles, (void*)(uintptr_t)handle); + assert(he); + struct zink_bindless_descriptor *bd = he->data; + struct zink_descriptor_surface *ds = &bd->ds; + struct zink_resource *res = zink_descriptor_surface_resource(ds); + if (is_buffer) + handle -= ZINK_MAX_BINDLESS_HANDLES; + if (resident) { + update_res_bind_count(ctx, res, false, false); + update_res_bind_count(ctx, res, true, false); + res->bindless[0]++; + if (is_buffer) { + if (ds->bufferview->bvci.buffer != res->obj->buffer) + rebind_bindless_bufferview(ctx, res, ds); + VkBufferView *bv = &ctx->di.bindless[0].buffer_infos[handle]; + *bv = ds->bufferview->buffer_view; + zink_fake_buffer_barrier(res, VK_ACCESS_SHADER_READ_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + } else { + VkDescriptorImageInfo *ii = &ctx->di.bindless[0].img_infos[handle]; + ii->sampler = bd->sampler->sampler; + ii->imageView = ds->surface->image_view; + ii->imageLayout = zink_descriptor_util_image_layout_eval(res, false); + flush_pending_clears(ctx, res); + check_for_layout_update(ctx, res, false); + check_for_layout_update(ctx, res, true); + } + zink_batch_resource_usage_set(&ctx->batch, res, false); + util_dynarray_append(&ctx->di.bindless[0].resident, struct zink_bindless_descriptor *, bd); + uint32_t h = is_buffer ? handle + ZINK_MAX_BINDLESS_HANDLES : handle; + util_dynarray_append(&ctx->di.bindless[0].updates, uint32_t, h); + } else { + zero_bindless_descriptor(ctx, handle, is_buffer, false); + util_dynarray_delete_unordered(&ctx->di.bindless[0].resident, struct zink_bindless_descriptor *, bd); + update_res_bind_count(ctx, res, false, true); + update_res_bind_count(ctx, res, true, true); + res->bindless[0]--; + for (unsigned i = 0; i < 2; i++) { + if (!res->image_bind_count[i]) + check_for_layout_update(ctx, res, i); + } + } + ctx->di.bindless_dirty[0] = true; +} + +static uint64_t +zink_create_image_handle(struct pipe_context *pctx, const struct pipe_image_view *view) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_resource *res = zink_resource(view->resource); + struct zink_bindless_descriptor *bd; + if (!zink_resource_object_init_storage(ctx, res)) { + debug_printf("couldn't create storage image!"); + return 0; + } + bd = malloc(sizeof(struct zink_bindless_descriptor)); + if (!bd) + return 0; + bd->sampler = NULL; + + bd->ds.is_buffer = res->base.b.target == PIPE_BUFFER; + if (res->base.b.target == PIPE_BUFFER) + bd->ds.bufferview = create_image_bufferview(ctx, view); + else + bd->ds.surface = create_image_surface(ctx, view, false); + uint64_t handle = util_idalloc_alloc(&ctx->di.bindless[bd->ds.is_buffer].img_slots); + if (bd->ds.is_buffer) + handle += ZINK_MAX_BINDLESS_HANDLES; + bd->handle = handle; + _mesa_hash_table_insert(&ctx->di.bindless[bd->ds.is_buffer].img_handles, (void*)(uintptr_t)handle, bd); + return handle; +} + +static void +zink_delete_image_handle(struct pipe_context *pctx, uint64_t handle) +{ + struct zink_context *ctx = zink_context(pctx); + bool is_buffer = ZINK_BINDLESS_IS_BUFFER(handle); + struct hash_entry *he = _mesa_hash_table_search(&ctx->di.bindless[is_buffer].img_handles, (void*)(uintptr_t)handle); + assert(he); + struct zink_descriptor_surface *ds = he->data; + 
_mesa_hash_table_remove(&ctx->di.bindless[is_buffer].img_handles, he); + uint32_t h = handle; + util_dynarray_append(&ctx->batch.state->bindless_releases[1], uint32_t, h); + + struct zink_resource *res = zink_descriptor_surface_resource(ds); + if (ds->is_buffer) { + if (zink_resource_has_usage(res)) + zink_batch_reference_bufferview(&ctx->batch, ds->bufferview); + zink_buffer_view_reference(zink_screen(pctx->screen), &ds->bufferview, NULL); + } else { + if (zink_resource_has_usage(res)) + zink_batch_reference_surface(&ctx->batch, ds->surface); + zink_surface_reference(zink_screen(pctx->screen), &ds->surface, NULL); + } + free(ds); +} + +static void +zink_make_image_handle_resident(struct pipe_context *pctx, uint64_t handle, unsigned paccess, bool resident) +{ + struct zink_context *ctx = zink_context(pctx); + bool is_buffer = ZINK_BINDLESS_IS_BUFFER(handle); + struct hash_entry *he = _mesa_hash_table_search(&ctx->di.bindless[is_buffer].img_handles, (void*)(uintptr_t)handle); + assert(he); + struct zink_bindless_descriptor *bd = he->data; + struct zink_descriptor_surface *ds = &bd->ds; + bd->access = paccess; + struct zink_resource *res = zink_descriptor_surface_resource(ds); + VkAccessFlags access = 0; + if (paccess & PIPE_IMAGE_ACCESS_WRITE) { + if (resident) { + res->write_bind_count[0]++; + res->write_bind_count[1]++; + } else { + res->write_bind_count[0]--; + res->write_bind_count[1]--; + } + access |= VK_ACCESS_SHADER_WRITE_BIT; + } + if (paccess & PIPE_IMAGE_ACCESS_READ) { + access |= VK_ACCESS_SHADER_READ_BIT; + } + if (is_buffer) + handle -= ZINK_MAX_BINDLESS_HANDLES; + if (resident) { + update_res_bind_count(ctx, res, false, false); + update_res_bind_count(ctx, res, true, false); + res->image_bind_count[0]++; + res->image_bind_count[1]++; + res->bindless[1]++; + if (is_buffer) { + if (ds->bufferview->bvci.buffer != res->obj->buffer) + rebind_bindless_bufferview(ctx, res, ds); + VkBufferView *bv = &ctx->di.bindless[1].buffer_infos[handle]; + *bv = ds->bufferview->buffer_view; + zink_fake_buffer_barrier(res, access, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); + } else { + VkDescriptorImageInfo *ii = &ctx->di.bindless[1].img_infos[handle]; + ii->sampler = VK_NULL_HANDLE; + ii->imageView = ds->surface->image_view; + ii->imageLayout = VK_IMAGE_LAYOUT_GENERAL; + finalize_image_bind(ctx, res, false); + finalize_image_bind(ctx, res, true); + } + zink_batch_resource_usage_set(&ctx->batch, res, zink_resource_access_is_write(access)); + util_dynarray_append(&ctx->di.bindless[1].resident, struct zink_bindless_descriptor *, bd); + uint32_t h = is_buffer ? 
handle + ZINK_MAX_BINDLESS_HANDLES : handle; + util_dynarray_append(&ctx->di.bindless[1].updates, uint32_t, h); + } else { + zero_bindless_descriptor(ctx, handle, is_buffer, true); + util_dynarray_delete_unordered(&ctx->di.bindless[1].resident, struct zink_bindless_descriptor *, bd); + unbind_shader_image_counts(ctx, res, false, false); + unbind_shader_image_counts(ctx, res, true, false); + res->bindless[1]--; + for (unsigned i = 0; i < 2; i++) { + if (!res->image_bind_count[i]) + check_for_layout_update(ctx, res, i); + } + } + ctx->di.bindless_dirty[1] = true; +} + static void zink_set_stencil_ref(struct pipe_context *pctx, const struct pipe_stencil_ref ref) @@ -1481,6 +1817,60 @@ zink_set_tess_state(struct pipe_context *pctx, memcpy(&ctx->default_outer_level, default_outer_level, sizeof(ctx->default_outer_level)); } +static void +zink_set_patch_vertices(struct pipe_context *pctx, uint8_t patch_vertices) +{ + struct zink_context *ctx = zink_context(pctx); + ctx->gfx_pipeline_state.patch_vertices = patch_vertices; +} + +void +zink_update_fbfetch(struct zink_context *ctx) +{ + const bool had_fbfetch = ctx->di.fbfetch.imageLayout == VK_IMAGE_LAYOUT_GENERAL; + if (!ctx->gfx_stages[PIPE_SHADER_FRAGMENT] || + !ctx->gfx_stages[PIPE_SHADER_FRAGMENT]->nir->info.fs.uses_fbfetch_output) { + if (!had_fbfetch) + return; + ctx->di.fbfetch.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; + ctx->di.fbfetch.imageView = zink_screen(ctx->base.screen)->info.rb2_feats.nullDescriptor ? + VK_NULL_HANDLE : + zink_csurface(ctx->dummy_surface[0])->image_view; + zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, PIPE_SHADER_FRAGMENT, ZINK_DESCRIPTOR_TYPE_UBO, 0, 1); + return; + } + + bool changed = !had_fbfetch; + if (ctx->fb_state.cbufs[0]) { + VkImageView fbfetch = zink_csurface(ctx->fb_state.cbufs[0])->image_view; + changed |= fbfetch != ctx->di.fbfetch.imageView; + ctx->di.fbfetch.imageView = zink_csurface(ctx->fb_state.cbufs[0])->image_view; + } + ctx->di.fbfetch.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + if (changed) + zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, PIPE_SHADER_FRAGMENT, ZINK_DESCRIPTOR_TYPE_UBO, 0, 1); +} + +static size_t +rp_state_size(const struct zink_render_pass_pipeline_state *pstate) +{ + return offsetof(struct zink_render_pass_pipeline_state, attachments) + + sizeof(pstate->attachments[0]) * pstate->num_attachments; +} + +static uint32_t +hash_rp_state(const void *key) +{ + const struct zink_render_pass_pipeline_state *s = key; + return _mesa_hash_data(key, rp_state_size(s)); +} + +static bool +equals_rp_state(const void *a, const void *b) +{ + return !memcmp(a, b, rp_state_size(a)); +} + static uint32_t hash_render_pass_state(const void *key) { @@ -1505,29 +1895,45 @@ get_render_pass(struct zink_context *ctx) struct zink_render_pass_state state = {0}; uint32_t clears = 0; state.swapchain_init = ctx->new_swapchain; + state.samples = fb->samples > 0; + + u_foreach_bit(i, ctx->fbfetch_outputs) + state.rts[i].fbfetch = true; for (int i = 0; i < fb->nr_cbufs; i++) { struct pipe_surface *surf = fb->cbufs[i]; if (surf) { + struct zink_surface *transient = zink_transient_surface(surf); state.rts[i].format = zink_get_format(screen, surf->format); - state.rts[i].samples = surf->texture->nr_samples > 0 ? surf->texture->nr_samples : - VK_SAMPLE_COUNT_1_BIT; + state.rts[i].samples = MAX3(transient ? 
transient->base.nr_samples : 0, surf->texture->nr_samples, 1); state.rts[i].clear_color = zink_fb_clear_enabled(ctx, i) && !zink_fb_clear_first_needs_explicit(&ctx->fb_clears[i]); clears |= !!state.rts[i].clear_color ? PIPE_CLEAR_COLOR0 << i : 0; state.rts[i].swapchain = surf->texture->bind & PIPE_BIND_SCANOUT; + if (transient) { + state.num_cresolves++; + state.rts[i].resolve = true; + if (!state.rts[i].clear_color) + state.msaa_expand_mask |= BITFIELD_BIT(i); + } } else { state.rts[i].format = VK_FORMAT_R8_UINT; - state.rts[i].samples = MAX2(fb->samples, 1); + state.rts[i].samples = fb->samples; } state.num_rts++; } state.num_cbufs = fb->nr_cbufs; + assert(!state.num_cresolves || state.num_cbufs == state.num_cresolves); if (fb->zsbuf) { struct zink_resource *zsbuf = zink_resource(fb->zsbuf->texture); struct zink_framebuffer_clear *fb_clear = &ctx->fb_clears[PIPE_MAX_COLOR_BUFS]; + struct zink_surface *transient = zink_transient_surface(fb->zsbuf); state.rts[fb->nr_cbufs].format = zsbuf->format; - state.rts[fb->nr_cbufs].samples = zsbuf->base.b.nr_samples > 0 ? zsbuf->base.b.nr_samples : VK_SAMPLE_COUNT_1_BIT; + state.rts[fb->nr_cbufs].samples = MAX3(transient ? transient->base.nr_samples : 0, fb->zsbuf->texture->nr_samples, 1); + if (transient) { + state.num_zsresolves = 1; + state.rts[fb->nr_cbufs].resolve = true; + } state.rts[fb->nr_cbufs].clear_color = zink_fb_clear_enabled(ctx, PIPE_MAX_COLOR_BUFS) && !zink_fb_clear_first_needs_explicit(fb_clear) && (zink_fb_clear_element(fb_clear, 0)->zs.bits & PIPE_CLEAR_DEPTH); @@ -1542,7 +1948,7 @@ get_render_pass(struct zink_context *ctx) ctx->gfx_stages[PIPE_SHADER_FRAGMENT]->nir->info.outputs_written : 0; bool needs_write = (ctx->dsa_state && ctx->dsa_state->hw_state.depth_write) || outputs_written & (BITFIELD64_BIT(FRAG_RESULT_DEPTH) | BITFIELD64_BIT(FRAG_RESULT_STENCIL)); - state.rts[fb->nr_cbufs].needs_write = needs_write || state.rts[fb->nr_cbufs].clear_color || state.rts[fb->nr_cbufs].clear_stencil; + state.rts[fb->nr_cbufs].needs_write = needs_write || state.num_zsresolves || state.rts[fb->nr_cbufs].clear_color || state.rts[fb->nr_cbufs].clear_stencil; state.num_rts++; } state.have_zsbuf = fb->zsbuf != NULL; @@ -1556,57 +1962,38 @@ get_render_pass(struct zink_context *ctx) rp = entry->data; assert(rp->state.clears == clears); } else { - rp = zink_create_render_pass(screen, &state); + struct zink_render_pass_pipeline_state pstate; + pstate.samples = state.samples; + rp = zink_create_render_pass(screen, &state, &pstate); if (!_mesa_hash_table_insert_pre_hashed(ctx->render_pass_cache, hash, &rp->state, rp)) return NULL; + bool found = false; + struct set_entry *entry = _mesa_set_search_or_add(&ctx->render_pass_state_cache, &pstate, &found); + struct zink_render_pass_pipeline_state *ppstate; + if (!found) { + entry->key = ralloc(ctx, struct zink_render_pass_pipeline_state); + ppstate = (void*)entry->key; + memcpy(ppstate, &pstate, rp_state_size(&pstate)); + ppstate->id = ctx->render_pass_state_cache.entries; + } + ppstate = (void*)entry->key; + rp->pipeline_state = ppstate->id; } return rp; } -static struct zink_framebuffer * -get_framebuffer(struct zink_context *ctx) +static uint32_t +hash_framebuffer_imageless(const void *key) { - struct zink_screen *screen = zink_screen(ctx->base.screen); - struct pipe_surface *attachments[PIPE_MAX_COLOR_BUFS + 1] = {0}; + struct zink_framebuffer_state* s = (struct zink_framebuffer_state*)key; + return _mesa_hash_data(key, offsetof(struct zink_framebuffer_state, infos) + sizeof(s->infos[0]) * 
s->num_attachments); +} - struct zink_framebuffer_state state = {0}; - for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { - struct pipe_surface *psurf = ctx->fb_state.cbufs[i]; - state.attachments[i] = psurf ? zink_surface(psurf)->image_view : VK_NULL_HANDLE; - attachments[i] = psurf; - } - - state.num_attachments = ctx->fb_state.nr_cbufs; - if (ctx->fb_state.zsbuf) { - struct pipe_surface *psurf = ctx->fb_state.zsbuf; - state.attachments[state.num_attachments] = psurf ? zink_surface(psurf)->image_view : VK_NULL_HANDLE; - attachments[state.num_attachments++] = psurf; - } - - state.width = MAX2(ctx->fb_state.width, 1); - state.height = MAX2(ctx->fb_state.height, 1); - state.layers = MAX2(util_framebuffer_get_num_layers(&ctx->fb_state), 1); - state.samples = ctx->fb_state.samples; - - struct zink_framebuffer *fb; - simple_mtx_lock(&screen->framebuffer_mtx); - struct hash_entry *entry = _mesa_hash_table_search(&screen->framebuffer_cache, &state); - if (entry) { - fb = (void*)entry->data; - struct zink_framebuffer *fb_ref = NULL; - /* this gains 1 ref every time we reuse it */ - zink_framebuffer_reference(screen, &fb_ref, fb); - } else { - /* this adds 1 extra ref on creation because all newly-created framebuffers are - * going to be bound; necessary to handle framebuffers which have no "real" attachments - * and are only using null surfaces since the only ref they get is the extra one here - */ - fb = zink_create_framebuffer(ctx, &state, attachments); - _mesa_hash_table_insert(&screen->framebuffer_cache, &fb->state, fb); - } - simple_mtx_unlock(&screen->framebuffer_mtx); - - return fb; +static bool +equals_framebuffer_imageless(const void *a, const void *b) +{ + struct zink_framebuffer_state *s = (struct zink_framebuffer_state*)a; + return memcmp(a, b, offsetof(struct zink_framebuffer_state, infos) + sizeof(s->infos[0]) * s->num_attachments) == 0; } static void @@ -1616,7 +2003,7 @@ setup_framebuffer(struct zink_context *ctx) struct zink_render_pass *rp = ctx->gfx_pipeline_state.render_pass; if (ctx->gfx_pipeline_state.sample_locations_enabled && ctx->sample_locations_changed) { - unsigned samples = ctx->gfx_pipeline_state.rast_samples; + unsigned samples = ctx->gfx_pipeline_state.rast_samples + 1; unsigned idx = util_logbase2_ceil(MAX2(samples, 1)); VkExtent2D grid_size = screen->maxSampleLocationGridSize[idx]; @@ -1638,20 +2025,102 @@ setup_framebuffer(struct zink_context *ctx) if (ctx->rp_changed) rp = get_render_pass(ctx); - if (rp != ctx->gfx_pipeline_state.render_pass) - ctx->gfx_pipeline_state.dirty = - ctx->fb_changed = true; + ctx->fb_changed |= rp != ctx->gfx_pipeline_state.render_pass; + if (rp->pipeline_state != ctx->gfx_pipeline_state.rp_state) { + ctx->gfx_pipeline_state.rp_state = rp->pipeline_state; + ctx->gfx_pipeline_state.dirty = true; + } ctx->rp_changed = false; if (!ctx->fb_changed) return; - zink_init_framebuffer(screen, ctx->framebuffer, rp); + ctx->init_framebuffer(screen, ctx->framebuffer, rp); ctx->fb_changed = false; ctx->gfx_pipeline_state.render_pass = rp; } +static VkImageView +prep_fb_attachment(struct zink_context *ctx, struct zink_surface *surf, unsigned i) +{ + if (!surf) + return zink_csurface(ctx->dummy_surface[util_logbase2_ceil(ctx->fb_state.samples)])->image_view; + + zink_batch_resource_usage_set(&ctx->batch, zink_resource(surf->base.texture), true); + zink_batch_usage_set(&surf->batch_uses, ctx->batch.state); + + struct zink_resource *res = zink_resource(surf->base.texture); + VkAccessFlags access; + VkPipelineStageFlags pipeline; + VkImageLayout 
layout = zink_render_pass_attachment_get_barrier_info(ctx->gfx_pipeline_state.render_pass, + i, &pipeline, &access); + zink_resource_image_barrier(ctx, res, layout, access, pipeline); + return surf->image_view; +} + +static void +prep_fb_attachments(struct zink_context *ctx, VkImageView *att) +{ + const unsigned cresolve_offset = ctx->fb_state.nr_cbufs + !!ctx->fb_state.zsbuf; + unsigned num_resolves = 0; + for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { + struct zink_surface *surf = zink_csurface(ctx->fb_state.cbufs[i]); + struct zink_surface *transient = zink_transient_surface(ctx->fb_state.cbufs[i]); + if (transient) { + att[i] = prep_fb_attachment(ctx, transient, i); + att[i + cresolve_offset] = prep_fb_attachment(ctx, surf, i); + num_resolves++; + } else { + att[i] = prep_fb_attachment(ctx, surf, i); + } + } + if (ctx->fb_state.zsbuf) { + struct zink_surface *surf = zink_csurface(ctx->fb_state.zsbuf); + struct zink_surface *transient = zink_transient_surface(ctx->fb_state.zsbuf); + if (transient) { + att[ctx->fb_state.nr_cbufs] = prep_fb_attachment(ctx, transient, ctx->fb_state.nr_cbufs); + att[cresolve_offset + num_resolves] = prep_fb_attachment(ctx, surf, ctx->fb_state.nr_cbufs); + } else { + att[ctx->fb_state.nr_cbufs] = prep_fb_attachment(ctx, surf, ctx->fb_state.nr_cbufs); + } + } +} + +static void +update_framebuffer_state(struct zink_context *ctx, int old_w, int old_h) +{ + if (ctx->fb_state.width != old_w || ctx->fb_state.height != old_h) + ctx->scissor_changed = true; + /* get_framebuffer adds a ref if the fb is reused or created; + * always do get_framebuffer first to avoid deleting the same fb + * we're about to use + */ + struct zink_framebuffer *fb = ctx->get_framebuffer(ctx); + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (ctx->framebuffer && !screen->info.have_KHR_imageless_framebuffer) { + simple_mtx_lock(&screen->framebuffer_mtx); + struct hash_entry *he = _mesa_hash_table_search(&screen->framebuffer_cache, &ctx->framebuffer->state); + if (ctx->framebuffer && !ctx->framebuffer->state.num_attachments) { + /* if this has no attachments then its lifetime has ended */ + _mesa_hash_table_remove(&screen->framebuffer_cache, he); + he = NULL; + /* ensure an unflushed fb doesn't get destroyed by deferring it */ + util_dynarray_append(&ctx->batch.state->dead_framebuffers, struct zink_framebuffer*, ctx->framebuffer); + ctx->framebuffer = NULL; + } + /* a framebuffer loses 1 ref every time we unset it; + * we do NOT add refs here, as the ref has already been added in + * get_framebuffer() + */ + if (zink_framebuffer_reference(screen, &ctx->framebuffer, NULL) && he) + _mesa_hash_table_remove(&screen->framebuffer_cache, he); + simple_mtx_unlock(&screen->framebuffer_mtx); + } + ctx->fb_changed |= ctx->framebuffer != fb; + ctx->framebuffer = fb; +} + static unsigned begin_render_pass(struct zink_context *ctx) { @@ -1710,23 +2179,43 @@ begin_render_pass(struct zink_context *ctx) assert(ctx->gfx_pipeline_state.render_pass && ctx->framebuffer); - zink_batch_reference_framebuffer(batch, ctx->framebuffer); - for (int i = 0; i < ctx->framebuffer->state.num_attachments; i++) { - if (ctx->framebuffer->surfaces[i]) { - struct zink_surface *surf = zink_surface(ctx->framebuffer->surfaces[i]); - zink_batch_resource_usage_set(batch, zink_resource(surf->base.texture), true); - zink_batch_usage_set(&surf->batch_uses, batch->state); - - struct zink_resource *res = zink_resource(surf->base.texture); - VkAccessFlags access; - VkPipelineStageFlags pipeline; - VkImageLayout layout 
= zink_render_pass_attachment_get_barrier_info(ctx->gfx_pipeline_state.render_pass, - i, &pipeline, &access); - zink_resource_image_barrier(ctx, NULL, res, layout, access, pipeline); + VkRenderPassAttachmentBeginInfo infos; + VkImageView att[2 * (PIPE_MAX_COLOR_BUFS + 1)]; + infos.sType = VK_STRUCTURE_TYPE_RENDER_PASS_ATTACHMENT_BEGIN_INFO; + infos.pNext = NULL; + infos.attachmentCount = ctx->framebuffer->state.num_attachments; + infos.pAttachments = att; + prep_fb_attachments(ctx, att); + if (zink_screen(ctx->base.screen)->info.have_KHR_imageless_framebuffer) { +#ifndef NDEBUG + const unsigned cresolve_offset = ctx->fb_state.nr_cbufs + !!ctx->fb_state.zsbuf; + for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { + if (ctx->fb_state.cbufs[i]) { + struct zink_surface *surf = zink_csurface(ctx->fb_state.cbufs[i]); + struct zink_surface *transient = zink_transient_surface(ctx->fb_state.cbufs[i]); + if (transient) { + assert(zink_resource(transient->base.texture)->obj->vkusage == ctx->framebuffer->state.infos[i].usage); + assert(zink_resource(surf->base.texture)->obj->vkusage == ctx->framebuffer->state.infos[cresolve_offset].usage); + } else { + assert(zink_resource(surf->base.texture)->obj->vkusage == ctx->framebuffer->state.infos[i].usage); + } + } } + if (ctx->fb_state.zsbuf) { + struct zink_surface *surf = zink_csurface(ctx->fb_state.zsbuf); + struct zink_surface *transient = zink_transient_surface(ctx->fb_state.zsbuf); + if (transient) { + assert(zink_resource(transient->base.texture)->obj->vkusage == ctx->framebuffer->state.infos[ctx->fb_state.nr_cbufs].usage); + assert(zink_resource(surf->base.texture)->obj->vkusage == ctx->framebuffer->state.infos[cresolve_offset].usage); + } else { + assert(zink_resource(surf->base.texture)->obj->vkusage == ctx->framebuffer->state.infos[ctx->fb_state.nr_cbufs].usage); + } + } +#endif + rpbi.pNext = &infos; } - vkCmdBeginRenderPass(batch->state->cmdbuf, &rpbi, VK_SUBPASS_CONTENTS_INLINE); + VKCTX(CmdBeginRenderPass)(batch->state->cmdbuf, &rpbi, VK_SUBPASS_CONTENTS_INLINE); batch->in_rp = true; ctx->new_swapchain = false; return clear_buffers; @@ -1736,11 +2225,11 @@ void zink_init_vk_sample_locations(struct zink_context *ctx, VkSampleLocationsInfoEXT *loc) { struct zink_screen *screen = zink_screen(ctx->base.screen); - unsigned idx = util_logbase2_ceil(MAX2(ctx->gfx_pipeline_state.rast_samples, 1)); + unsigned idx = util_logbase2_ceil(MAX2(ctx->gfx_pipeline_state.rast_samples + 1, 1)); loc->sType = VK_STRUCTURE_TYPE_SAMPLE_LOCATIONS_INFO_EXT; loc->pNext = NULL; loc->sampleLocationsPerPixel = 1 << idx; - loc->sampleLocationsCount = ctx->gfx_pipeline_state.rast_samples; + loc->sampleLocationsCount = ctx->gfx_pipeline_state.rast_samples + 1; loc->sampleLocationGridSize = screen->maxSampleLocationGridSize[idx]; loc->pSampleLocations = ctx->vk_sample_locations; } @@ -1760,9 +2249,43 @@ zink_evaluate_depth_buffer(struct pipe_context *pctx) } void -zink_begin_render_pass(struct zink_context *ctx, struct zink_batch *batch) +zink_begin_render_pass(struct zink_context *ctx) { setup_framebuffer(ctx); + /* TODO: need replicate EXT */ + if (ctx->framebuffer->rp->state.msaa_expand_mask) { + uint32_t rp_state = ctx->gfx_pipeline_state.rp_state; + struct zink_render_pass *rp = ctx->gfx_pipeline_state.render_pass; + + u_foreach_bit(i, ctx->framebuffer->rp->state.msaa_expand_mask) { + struct zink_ctx_surface *csurf = (struct zink_ctx_surface*)ctx->fb_state.cbufs[i]; + if (csurf->transient_init) + continue; + struct pipe_surface *dst_view = (struct 
pipe_surface*)csurf->transient; + assert(dst_view); + struct pipe_sampler_view src_templ, *src_view; + struct pipe_resource *src = ctx->fb_state.cbufs[i]->texture; + struct pipe_box dstbox; + + u_box_3d(0, 0, 0, ctx->fb_state.width, ctx->fb_state.height, + 1 + dst_view->u.tex.last_layer - dst_view->u.tex.first_layer, &dstbox); + + util_blitter_default_src_texture(ctx->blitter, &src_templ, src, ctx->fb_state.cbufs[i]->u.tex.level); + src_view = ctx->base.create_sampler_view(&ctx->base, src, &src_templ); + + zink_blit_begin(ctx, ZINK_BLIT_SAVE_FB | ZINK_BLIT_SAVE_FS | ZINK_BLIT_SAVE_TEXTURES); + util_blitter_blit_generic(ctx->blitter, dst_view, &dstbox, + src_view, &dstbox, ctx->fb_state.width, ctx->fb_state.height, + PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL, + false, false); + + pipe_sampler_view_reference(&src_view, NULL); + csurf->transient_init = true; + } + ctx->fb_changed = ctx->rp_changed = false; + ctx->gfx_pipeline_state.rp_state = rp_state; + ctx->gfx_pipeline_state.render_pass = rp; + } assert(ctx->gfx_pipeline_state.render_pass); unsigned clear_buffers = begin_render_pass(ctx); @@ -1772,14 +2295,19 @@ zink_begin_render_pass(struct zink_context *ctx, struct zink_batch *batch) } void -zink_end_render_pass(struct zink_context *ctx, struct zink_batch *batch) +zink_end_render_pass(struct zink_context *ctx) { - if (batch->in_rp) { + if (ctx->batch.in_rp) { if (ctx->render_condition.query) zink_stop_conditional_render(ctx); - vkCmdEndRenderPass(batch->state->cmdbuf); + VKCTX(CmdEndRenderPass)(ctx->batch.state->cmdbuf); + for (unsigned i = 0; i < ctx->fb_state.nr_cbufs; i++) { + struct zink_ctx_surface *csurf = (struct zink_ctx_surface*)ctx->fb_state.cbufs[i]; + if (csurf) + csurf->transient_init = true; + } } - batch->in_rp = false; + ctx->batch.in_rp = false; } static void @@ -1881,6 +2409,23 @@ zink_update_descriptor_refs(struct zink_context *ctx, bool compute) if (ctx->curr_program) zink_batch_reference_program(batch, &ctx->curr_program->base); } + if (ctx->di.bindless_refs_dirty) { + ctx->di.bindless_refs_dirty = false; + for (unsigned i = 0; i < 2; i++) { + util_dynarray_foreach(&ctx->di.bindless[i].resident, struct zink_bindless_descriptor*, bd) { + struct zink_resource *res = zink_descriptor_surface_resource(&(*bd)->ds); + zink_batch_resource_usage_set(&ctx->batch, res, (*bd)->access & PIPE_IMAGE_ACCESS_WRITE); + } + } + } +} + +static void +stall(struct zink_context *ctx) +{ + sync_flush(ctx, zink_batch_state(ctx->last_fence)); + zink_vkfence_wait(zink_screen(ctx->base.screen), ctx->last_fence, PIPE_TIMEOUT_INFINITE); + zink_batch_reset_all(ctx); } static void @@ -1889,8 +2434,8 @@ flush_batch(struct zink_context *ctx, bool sync) struct zink_batch *batch = &ctx->batch; if (ctx->clears_enabled) /* start rp to do all the clears */ - zink_begin_render_pass(ctx, batch); - zink_end_render_pass(ctx, batch); + zink_begin_render_pass(ctx); + zink_end_render_pass(ctx); zink_end_batch(ctx, batch); ctx->deferred_fence = NULL; @@ -1908,9 +2453,11 @@ flush_batch(struct zink_context *ctx, bool sync) zink_select_launch_grid(ctx); if (ctx->oom_stall) - zink_fence_wait(&ctx->base); + stall(ctx); ctx->oom_flush = false; ctx->oom_stall = false; + ctx->dd->bindless_bound = false; + ctx->di.bindless_refs_dirty = true; } } @@ -1926,8 +2473,8 @@ rebind_fb_surface(struct zink_context *ctx, struct pipe_surface **surf, struct z if (!*surf) return false; struct zink_resource *surf_res = zink_resource((*surf)->texture); - if ((match_res == surf_res) || surf_res->obj != zink_surface(*surf)->obj) - 
return zink_rebind_surface(ctx, surf); + if ((match_res == surf_res) || surf_res->obj != zink_csurface(*surf)->obj) + return zink_rebind_ctx_surface(ctx, surf); return false; } @@ -1949,10 +2496,14 @@ unbind_fb_surface(struct zink_context *ctx, struct pipe_surface *surf, bool chan { if (!surf) return; + struct zink_surface *transient = zink_transient_surface(surf); if (changed) { zink_fb_clears_apply(ctx, surf->texture); - if (zink_batch_usage_exists(zink_surface(surf)->batch_uses)) - zink_batch_reference_surface(&ctx->batch, zink_surface(surf)); + if (zink_batch_usage_exists(zink_csurface(surf)->batch_uses)) { + zink_batch_reference_surface(&ctx->batch, zink_csurface(surf)); + if (transient) + zink_batch_reference_surface(&ctx->batch, transient); + } ctx->rp_changed = true; } struct zink_resource *res = zink_resource(surf->texture); @@ -1966,9 +2517,12 @@ zink_set_framebuffer_state(struct pipe_context *pctx, const struct pipe_framebuffer_state *state) { struct zink_context *ctx = zink_context(pctx); + unsigned samples = state->nr_cbufs || state->zsbuf ? 0 : state->samples; for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { struct pipe_surface *surf = ctx->fb_state.cbufs[i]; + if (i < state->nr_cbufs) + ctx->rp_changed |= !!zink_transient_surface(surf) != !!zink_transient_surface(state->cbufs[i]); unbind_fb_surface(ctx, surf, i >= state->nr_cbufs || surf != state->cbufs[i]); } if (ctx->fb_state.zsbuf) { @@ -1976,10 +2530,12 @@ zink_set_framebuffer_state(struct pipe_context *pctx, struct zink_resource *res = zink_resource(surf->texture); bool changed = surf != state->zsbuf; unbind_fb_surface(ctx, surf, changed); + if (!changed) + ctx->rp_changed |= !!zink_transient_surface(surf) != !!zink_transient_surface(state->zsbuf); if (changed && unlikely(res->obj->needs_zs_evaluate)) /* have to flush zs eval while the sample location data still exists, * so just throw some random barrier */ - zink_resource_image_barrier(ctx, NULL, res, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + zink_resource_image_barrier(ctx, res, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, VK_ACCESS_SHADER_READ_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); } /* renderpass changes if the number or types of attachments change */ @@ -1990,11 +2546,15 @@ zink_set_framebuffer_state(struct pipe_context *pctx, unsigned h = ctx->fb_state.height; util_copy_framebuffer_state(&ctx->fb_state, state); + zink_update_fbfetch(ctx); unsigned prev_void_alpha_attachments = ctx->gfx_pipeline_state.void_alpha_attachments; ctx->gfx_pipeline_state.void_alpha_attachments = 0; for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { struct pipe_surface *surf = ctx->fb_state.cbufs[i]; if (surf) { + struct zink_surface *transient = zink_transient_surface(surf); + if (!samples) + samples = MAX3(transient ? transient->base.nr_samples : 1, surf->texture->nr_samples, 1); zink_resource(surf->texture)->fb_binds++; ctx->gfx_pipeline_state.void_alpha_attachments |= util_format_has_alpha1(surf->format) ? BITFIELD_BIT(i) : 0; } @@ -2003,48 +2563,30 @@ zink_set_framebuffer_state(struct pipe_context *pctx, ctx->gfx_pipeline_state.dirty = true; if (ctx->fb_state.zsbuf) { struct pipe_surface *surf = ctx->fb_state.zsbuf; + struct zink_surface *transient = zink_transient_surface(surf); + if (!samples) + samples = MAX3(transient ? 
transient->base.nr_samples : 1, surf->texture->nr_samples, 1); zink_resource(surf->texture)->fb_binds++; } - if (ctx->fb_state.width != w || ctx->fb_state.height != h) - ctx->scissor_changed = true; rebind_fb_state(ctx, NULL, true); - /* get_framebuffer adds a ref if the fb is reused or created; - * always do get_framebuffer first to avoid deleting the same fb - * we're about to use - */ - struct zink_framebuffer *fb = get_framebuffer(ctx); - if (ctx->framebuffer) { - struct zink_screen *screen = zink_screen(pctx->screen); - simple_mtx_lock(&screen->framebuffer_mtx); - struct hash_entry *he = _mesa_hash_table_search(&screen->framebuffer_cache, &ctx->framebuffer->state); - if (ctx->framebuffer && !ctx->framebuffer->state.num_attachments) { - /* if this has no attachments then its lifetime has ended */ - _mesa_hash_table_remove(&screen->framebuffer_cache, he); - he = NULL; - } - /* a framebuffer loses 1 ref every time we unset it; - * we do NOT add refs here, as the ref has already been added in - * get_framebuffer() - */ - if (zink_framebuffer_reference(screen, &ctx->framebuffer, NULL) && he) - _mesa_hash_table_remove(&screen->framebuffer_cache, he); - simple_mtx_unlock(&screen->framebuffer_mtx); - } - ctx->fb_changed |= ctx->framebuffer != fb; - ctx->framebuffer = fb; + ctx->fb_state.samples = MAX2(samples, 1); + update_framebuffer_state(ctx, w, h); - uint8_t rast_samples = util_framebuffer_get_num_samples(state); - /* in vulkan, gl_SampleMask needs to be explicitly ignored for sampleCount == 1 */ - if ((ctx->gfx_pipeline_state.rast_samples > 1) != (rast_samples > 1)) - ctx->dirty_shader_stages |= 1 << PIPE_SHADER_FRAGMENT; + uint8_t rast_samples = ctx->fb_state.samples - 1; + /* update the shader key if applicable: + * if gl_SampleMask[] is written to, we have to ensure that we get a shader with the same sample count: + * in GL, rast_samples==1 means ignore gl_SampleMask[] + * in VK, gl_SampleMask[] is never ignored + */ + if (rast_samples != ctx->gfx_pipeline_state.rast_samples && + (!ctx->gfx_stages[PIPE_SHADER_FRAGMENT] || + ctx->gfx_stages[PIPE_SHADER_FRAGMENT]->nir->info.outputs_written & (1 << FRAG_RESULT_SAMPLE_MASK))) + zink_set_fs_key(ctx)->samples = ctx->fb_state.samples > 0; if (ctx->gfx_pipeline_state.rast_samples != rast_samples) { ctx->sample_locations_changed |= ctx->gfx_pipeline_state.sample_locations_enabled; ctx->gfx_pipeline_state.dirty = true; } ctx->gfx_pipeline_state.rast_samples = rast_samples; - if (ctx->gfx_pipeline_state.num_attachments != state->nr_cbufs) - ctx->gfx_pipeline_state.dirty = true; - ctx->gfx_pipeline_state.num_attachments = state->nr_cbufs; /* need to ensure we start a new rp on next draw */ zink_batch_no_rp(ctx); @@ -2078,7 +2620,9 @@ zink_set_sample_locations(struct pipe_context *pctx, size_t size, const uint8_t ctx->sample_locations_changed = ctx->gfx_pipeline_state.sample_locations_enabled; if (size > sizeof(ctx->sample_locations)) size = sizeof(ctx->sample_locations); - memcpy(ctx->sample_locations, locations, size); + + if (locations) + memcpy(ctx->sample_locations, locations, size); } static VkAccessFlags @@ -2214,9 +2758,9 @@ zink_resource_image_needs_barrier(struct zink_resource *res, VkImageLayout new_l pipeline = pipeline_dst_stage(new_layout); if (!flags) flags = access_dst_flags(new_layout); - return res->layout != new_layout || (res->access_stage & pipeline) != pipeline || - (res->access & flags) != flags || - zink_resource_access_is_write(res->access) || + return res->layout != new_layout || (res->obj->access_stage & pipeline) != 
pipeline || + (res->obj->access & flags) != flags || + zink_resource_access_is_write(res->obj->access) || zink_resource_access_is_write(flags); } @@ -2236,7 +2780,7 @@ zink_resource_image_barrier_init(VkImageMemoryBarrier *imb, struct zink_resource *imb = (VkImageMemoryBarrier){ VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, NULL, - res->access ? res->access : access_src_flags(res->layout), + res->obj->access ? res->obj->access : access_src_flags(res->layout), flags, res->layout, new_layout, @@ -2262,7 +2806,7 @@ static void resource_check_defer_buffer_barrier(struct zink_context *ctx, struct zink_resource *res, VkPipelineStageFlags pipeline) { assert(res->obj->is_buffer); - if (res->bind_count[0]) { + if (res->bind_count[0] - res->so_bind_count > 0) { if ((res->obj->is_buffer && res->vbo_bind_mask && !(pipeline & VK_PIPELINE_STAGE_VERTEX_INPUT_BIT)) || ((!res->obj->is_buffer || util_bitcount(res->vbo_bind_mask) != res->bind_count[0]) && !is_shader_pipline_stage(pipeline))) /* gfx rebind */ @@ -2276,13 +2820,12 @@ resource_check_defer_buffer_barrier(struct zink_context *ctx, struct zink_resour static inline VkCommandBuffer get_cmdbuf(struct zink_context *ctx, struct zink_resource *res) { - if ((res->access && !res->unordered_barrier) || !ctx->batch.in_rp) { - struct zink_batch *batch = zink_batch_no_rp(ctx); - assert(!batch->in_rp); - res->unordered_barrier = false; - return batch->state->cmdbuf; + if ((res->obj->access && !res->obj->unordered_barrier) || !ctx->batch.in_rp) { + zink_batch_no_rp(ctx); + res->obj->unordered_barrier = false; + return ctx->batch.state->cmdbuf; } - res->unordered_barrier = true; + res->obj->unordered_barrier = true; ctx->batch.state->has_barriers = true; return ctx->batch.state->barrier_cmdbuf; } @@ -2314,7 +2857,7 @@ resource_check_defer_image_barrier(struct zink_context *ctx, struct zink_resourc } void -zink_resource_image_barrier(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *res, +zink_resource_image_barrier(struct zink_context *ctx, struct zink_resource *res, VkImageLayout new_layout, VkAccessFlags flags, VkPipelineStageFlags pipeline) { VkImageMemoryBarrier imb; @@ -2326,12 +2869,19 @@ zink_resource_image_barrier(struct zink_context *ctx, struct zink_batch *batch, /* only barrier if we're changing layout or doing something besides read -> read */ VkCommandBuffer cmdbuf = get_cmdbuf(ctx, res); assert(new_layout); + if (!res->obj->access_stage) + imb.srcAccessMask = 0; if (res->obj->needs_zs_evaluate) imb.pNext = &res->obj->zs_evaluate; res->obj->needs_zs_evaluate = false; - vkCmdPipelineBarrier( + if (res->dmabuf_acquire) { + imb.srcQueueFamilyIndex = VK_QUEUE_FAMILY_FOREIGN_EXT; + imb.dstQueueFamilyIndex = zink_screen(ctx->base.screen)->gfx_queue; + res->dmabuf_acquire = false; + } + VKCTX(CmdPipelineBarrier)( cmdbuf, - res->access_stage ? res->access_stage : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + res->obj->access_stage ? 
res->obj->access_stage : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, pipeline, 0, 0, NULL, @@ -2341,12 +2891,12 @@ zink_resource_image_barrier(struct zink_context *ctx, struct zink_batch *batch, resource_check_defer_image_barrier(ctx, res, new_layout, pipeline); - if (res->unordered_barrier) { - res->access |= imb.dstAccessMask; - res->access_stage |= pipeline; + if (res->obj->unordered_barrier) { + res->obj->access |= imb.dstAccessMask; + res->obj->access_stage |= pipeline; } else { - res->access = imb.dstAccessMask; - res->access_stage = pipeline; + res->obj->access = imb.dstAccessMask; + res->obj->access_stage = pipeline; } res->layout = new_layout; } @@ -2394,25 +2944,25 @@ pipeline_access_stage(VkAccessFlags flags) ALWAYS_INLINE static bool zink_resource_buffer_needs_barrier(struct zink_resource *res, VkAccessFlags flags, VkPipelineStageFlags pipeline) { - if (!res->access || !res->access_stage) + if (!res->obj->access || !res->obj->access_stage) return true; if (!pipeline) pipeline = pipeline_access_stage(flags); - return zink_resource_access_is_write(res->access) || + return zink_resource_access_is_write(res->obj->access) || zink_resource_access_is_write(flags) || - ((res->access_stage & pipeline) != pipeline && !(res->access_stage & (pipeline - 1))) || - (res->access & flags) != flags; + ((res->obj->access_stage & pipeline) != pipeline && !(res->obj->access_stage & (pipeline - 1))) || + (res->obj->access & flags) != flags; } void zink_fake_buffer_barrier(struct zink_resource *res, VkAccessFlags flags, VkPipelineStageFlags pipeline) { - res->access = flags; - res->access_stage = pipeline; + res->obj->access = flags; + res->obj->access_stage = pipeline; } void -zink_resource_buffer_barrier(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *res, VkAccessFlags flags, VkPipelineStageFlags pipeline) +zink_resource_buffer_barrier(struct zink_context *ctx, struct zink_resource *res, VkAccessFlags flags, VkPipelineStageFlags pipeline) { VkMemoryBarrier bmb; if (!pipeline) @@ -2422,13 +2972,15 @@ zink_resource_buffer_barrier(struct zink_context *ctx, struct zink_batch *batch, bmb.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; bmb.pNext = NULL; - bmb.srcAccessMask = res->access; + bmb.srcAccessMask = res->obj->access; bmb.dstAccessMask = flags; + if (!res->obj->access_stage) + bmb.srcAccessMask = 0; VkCommandBuffer cmdbuf = get_cmdbuf(ctx, res); /* only barrier if we're changing layout or doing something besides read -> read */ - vkCmdPipelineBarrier( + VKCTX(CmdPipelineBarrier)( cmdbuf, - res->access_stage ? res->access_stage : pipeline_access_stage(res->access), + res->obj->access_stage ? 
res->obj->access_stage : pipeline_access_stage(res->obj->access), pipeline, 0, 1, &bmb, @@ -2438,12 +2990,12 @@ zink_resource_buffer_barrier(struct zink_context *ctx, struct zink_batch *batch, resource_check_defer_buffer_barrier(ctx, res, pipeline); - if (res->unordered_barrier) { - res->access |= bmb.dstAccessMask; - res->access_stage |= pipeline; + if (res->obj->unordered_barrier) { + res->obj->access |= bmb.dstAccessMask; + res->obj->access_stage |= pipeline; } else { - res->access = bmb.dstAccessMask; - res->access_stage = pipeline; + res->obj->access = bmb.dstAccessMask; + res->obj->access_stage = pipeline; } } @@ -2455,15 +3007,6 @@ zink_resource_needs_barrier(struct zink_resource *res, VkImageLayout layout, VkA return zink_resource_image_needs_barrier(res, layout, flags, pipeline); } -void -zink_resource_barrier(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *res, VkImageLayout layout, VkAccessFlags flags, VkPipelineStageFlags pipeline) -{ - if (res->base.b.target == PIPE_BUFFER) - zink_resource_buffer_barrier(ctx, batch, res, flags, pipeline); - else - zink_resource_image_barrier(ctx, batch, res, layout, flags, pipeline); -} - VkShaderStageFlagBits zink_shader_stage(enum pipe_shader_type type) { @@ -2478,18 +3021,6 @@ zink_shader_stage(enum pipe_shader_type type) return stages[type]; } -static uint32_t -hash_gfx_program(const void *key) -{ - return _mesa_hash_data(key, sizeof(struct zink_shader *) * (ZINK_SHADER_COUNT)); -} - -static bool -equals_gfx_program(const void *a, const void *b) -{ - return memcmp(a, b, sizeof(struct zink_shader *) * (ZINK_SHADER_COUNT)) == 0; -} - static void zink_flush(struct pipe_context *pctx, struct pipe_fence_handle **pfence, @@ -2506,7 +3037,7 @@ zink_flush(struct pipe_context *pctx, /* triggering clears will force has_work */ if (!deferred && ctx->clears_enabled) /* start rp to do all the clears */ - zink_begin_render_pass(ctx, batch); + zink_begin_render_pass(ctx); if (!batch->has_work) { if (pfence) { @@ -2544,8 +3075,6 @@ zink_flush(struct pipe_context *pctx, *pfence = (struct pipe_fence_handle *)mfence; } - struct zink_batch_state *bs = zink_batch_state(fence); - zink_batch_state_reference(screen, NULL, bs); mfence->fence = fence; if (fence) mfence->submit_count = submit_count; @@ -2585,11 +3114,8 @@ zink_fence_wait(struct pipe_context *pctx) if (ctx->batch.has_work) pctx->flush(pctx, NULL, PIPE_FLUSH_HINT_FINISH); - if (ctx->last_fence) { - sync_flush(ctx, zink_batch_state(ctx->last_fence)); - zink_vkfence_wait(zink_screen(ctx->base.screen), ctx->last_fence, PIPE_TIMEOUT_INFINITE); - zink_batch_reset_all(ctx); - } + if (ctx->last_fence) + stall(ctx); } void @@ -2612,8 +3138,13 @@ zink_wait_on_batch(struct zink_context *ctx, uint32_t batch_id) if (ctx->last_fence && (!batch_id || batch_id == zink_batch_state(ctx->last_fence)->fence.batch_id)) fence = ctx->last_fence; else { - struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&ctx->batch_states, batch_id, (void*)(uintptr_t)batch_id); - if (!he) { + for (bs = ctx->batch_states; bs; bs = bs->next) { + if (bs->fence.batch_id < batch_id) + continue; + if (!bs->fence.batch_id || bs->fence.batch_id > batch_id) + break; + } + if (!bs || bs->fence.batch_id != batch_id) { simple_mtx_unlock(&ctx->batch_mtx); /* if we can't find it, it either must have finished already or is on a different context */ if (!zink_screen_check_last_finished(zink_screen(ctx->base.screen), batch_id)) { @@ -2623,7 +3154,7 @@ zink_wait_on_batch(struct zink_context *ctx, uint32_t batch_id) } 
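/* sketch of the invariant the list walk above relies on (an inference from the
 * break conditions, not stated in this patch): ctx->batch_states is assumed to
 * hold submitted batch states in submission order, so fence.batch_id values
 * ascend along the list and the walk can stop at the first entry past the id
 * being looked up; a miss then means the batch either finished long ago
 * (caught by zink_screen_check_last_finished just above) or was submitted by a
 * different context */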
return; } - fence = he->data; + fence = &bs->fence; } simple_mtx_unlock(&ctx->batch_mtx); assert(fence); @@ -2656,15 +3187,20 @@ zink_check_batch_completion(struct zink_context *ctx, uint32_t batch_id, bool ha if (ctx->last_fence && batch_id == zink_batch_state(ctx->last_fence)->fence.batch_id) fence = ctx->last_fence; else { - struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&ctx->batch_states, batch_id, (void*)(uintptr_t)batch_id); - /* if we can't find it, it either must have finished already or is on a different context */ - if (!he) { + struct zink_batch_state *bs; + for (bs = ctx->batch_states; bs; bs = bs->next) { + if (bs->fence.batch_id < batch_id) + continue; + if (!bs->fence.batch_id || bs->fence.batch_id > batch_id) + break; + } + if (!bs || bs->fence.batch_id != batch_id) { if (!have_lock) simple_mtx_unlock(&ctx->batch_mtx); /* return compare against last_finished, since this has info from all contexts */ return zink_screen_check_last_finished(zink_screen(ctx->base.screen), batch_id); } - fence = he->data; + fence = &bs->fence; } if (!have_lock) simple_mtx_unlock(&ctx->batch_mtx); @@ -2682,21 +3218,14 @@ zink_texture_barrier(struct pipe_context *pctx, unsigned flags) if (!ctx->framebuffer || !ctx->framebuffer->state.num_attachments) return; - VkMemoryBarrier bmb; - bmb.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; - bmb.pNext = NULL; - bmb.srcAccessMask = 0; - bmb.dstAccessMask = 0; - struct zink_surface *surf = zink_surface(ctx->framebuffer->surfaces[ctx->framebuffer->state.num_attachments - 1]); - struct zink_resource *res = zink_resource(surf->base.texture); zink_batch_no_rp(ctx); - if (res->aspect != VK_IMAGE_ASPECT_COLOR_BIT) { + if (ctx->fb_state.zsbuf) { VkMemoryBarrier dmb; dmb.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; dmb.pNext = NULL; dmb.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; dmb.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - vkCmdPipelineBarrier( + VKCTX(CmdPipelineBarrier)( ctx->batch.state->cmdbuf, VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, @@ -2705,35 +3234,87 @@ zink_texture_barrier(struct pipe_context *pctx, unsigned flags) 0, NULL, 0, NULL ); - } else { - bmb.srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - bmb.dstAccessMask |= VK_ACCESS_SHADER_READ_BIT; } - if (ctx->framebuffer->state.num_attachments > 1) { - bmb.srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - bmb.dstAccessMask |= VK_ACCESS_SHADER_READ_BIT; - } - if (bmb.srcAccessMask) - vkCmdPipelineBarrier( - ctx->batch.state->cmdbuf, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - 0, - 1, &bmb, - 0, NULL, - 0, NULL - ); + if (!ctx->fb_state.nr_cbufs) + return; + + VkMemoryBarrier bmb; + bmb.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; + bmb.pNext = NULL; + bmb.srcAccessMask = 0; + bmb.dstAccessMask = 0; + bmb.srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + bmb.dstAccessMask |= VK_ACCESS_SHADER_READ_BIT; + VKCTX(CmdPipelineBarrier)( + ctx->batch.state->cmdbuf, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + 0, + 1, &bmb, + 0, NULL, + 0, NULL + ); } static inline void -mem_barrier(struct zink_batch *batch, VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage, VkAccessFlags src, VkAccessFlags dst) +mem_barrier(struct zink_context *ctx, VkPipelineStageFlags src_stage, VkPipelineStageFlags dst_stage, VkAccessFlags src, VkAccessFlags dst) { + struct zink_batch *batch = &ctx->batch; 
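/* note on mem_barrier: unlike the per-resource image/buffer barriers earlier
 * in this file, this emits a coarse VkMemoryBarrier covering all memory in the
 * given stage/access scopes, which is what pipe_context::memory_barrier
 * semantics call for; vkCmdPipelineBarrier is not valid inside a render pass
 * instance without a matching subpass self-dependency, hence the
 * zink_end_render_pass() below */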
VkMemoryBarrier mb; mb.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; mb.pNext = NULL; mb.srcAccessMask = src; mb.dstAccessMask = dst; - vkCmdPipelineBarrier(batch->state->cmdbuf, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); + zink_end_render_pass(ctx); + VKCTX(CmdPipelineBarrier)(batch->state->cmdbuf, src_stage, dst_stage, 0, 1, &mb, 0, NULL, 0, NULL); +} + +void +zink_flush_memory_barrier(struct zink_context *ctx, bool is_compute) +{ + const VkPipelineStageFlags gfx_flags = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | + VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | + VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + const VkPipelineStageFlags cs_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + VkPipelineStageFlags src = ctx->batch.last_was_compute ? cs_flags : gfx_flags; + VkPipelineStageFlags dst = is_compute ? cs_flags : gfx_flags; + + if (ctx->memory_barrier & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_IMAGE)) + mem_barrier(ctx, src, dst, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); + + if (ctx->memory_barrier & PIPE_BARRIER_CONSTANT_BUFFER) + mem_barrier(ctx, src, dst, + VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_UNIFORM_READ_BIT); + + if (!is_compute) { + if (ctx->memory_barrier & PIPE_BARRIER_INDIRECT_BUFFER) + mem_barrier(ctx, src, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_INDIRECT_COMMAND_READ_BIT); + if (ctx->memory_barrier & PIPE_BARRIER_VERTEX_BUFFER) + mem_barrier(ctx, gfx_flags, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT); + + if (ctx->memory_barrier & PIPE_BARRIER_INDEX_BUFFER) + mem_barrier(ctx, gfx_flags, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_INDEX_READ_BIT); + if (ctx->memory_barrier & PIPE_BARRIER_FRAMEBUFFER) + zink_texture_barrier(&ctx->base, 0); + if (ctx->memory_barrier & PIPE_BARRIER_STREAMOUT_BUFFER) + mem_barrier(ctx, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT, + VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, + VK_ACCESS_SHADER_READ_BIT, + VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | + VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT); + } + ctx->memory_barrier = 0; } static void @@ -2741,58 +3322,15 @@ zink_memory_barrier(struct pipe_context *pctx, unsigned flags) { struct zink_context *ctx = zink_context(pctx); - VkPipelineStageFlags all_flags = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | - VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | - VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | - VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - - if (!(flags & ~PIPE_BARRIER_UPDATE)) + flags &= ~PIPE_BARRIER_UPDATE; + if (!flags) return; - struct zink_batch *batch = &ctx->batch; - zink_end_render_pass(ctx, batch); - if (flags & PIPE_BARRIER_MAPPED_BUFFER) { /* TODO: this should flush all persistent buffers in use as I think */ + flags &= ~PIPE_BARRIER_MAPPED_BUFFER; } - - if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_IMAGE)) - mem_barrier(batch, all_flags, all_flags, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); - - if (flags & PIPE_BARRIER_QUERY_BUFFER) - mem_barrier(batch, all_flags, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT | 
VK_ACCESS_TRANSFER_READ_BIT); - - if (flags & PIPE_BARRIER_VERTEX_BUFFER) - mem_barrier(batch, all_flags, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT); - - if (flags & PIPE_BARRIER_INDEX_BUFFER) - mem_barrier(batch, all_flags, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_INDEX_READ_BIT); - - if (flags & PIPE_BARRIER_CONSTANT_BUFFER) - mem_barrier(batch, all_flags, all_flags, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_UNIFORM_READ_BIT); - - if (flags & PIPE_BARRIER_INDIRECT_BUFFER) - mem_barrier(batch, all_flags, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_INDIRECT_COMMAND_READ_BIT); - - if (flags & PIPE_BARRIER_FRAMEBUFFER) - zink_texture_barrier(pctx, 0); - if (flags & PIPE_BARRIER_STREAMOUT_BUFFER) - mem_barrier(batch, all_flags, - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | - VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT); + ctx->memory_barrier = flags; } static void @@ -2808,7 +3346,7 @@ zink_flush_resource(struct pipe_context *pctx, } void -zink_copy_buffer(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *dst, struct zink_resource *src, +zink_copy_buffer(struct zink_context *ctx, struct zink_resource *dst, struct zink_resource *src, unsigned dst_offset, unsigned src_offset, unsigned size) { VkBufferCopy region; @@ -2816,36 +3354,34 @@ zink_copy_buffer(struct zink_context *ctx, struct zink_batch *batch, struct zink region.dstOffset = dst_offset; region.size = size; - if (!batch) - batch = zink_batch_no_rp(ctx); - assert(!batch->in_rp); + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); zink_batch_reference_resource_rw(batch, src, false); zink_batch_reference_resource_rw(batch, dst, true); util_range_add(&dst->base.b, &dst->valid_buffer_range, dst_offset, dst_offset + size); - zink_resource_buffer_barrier(ctx, batch, src, VK_ACCESS_TRANSFER_READ_BIT, 0); - zink_resource_buffer_barrier(ctx, batch, dst, VK_ACCESS_TRANSFER_WRITE_BIT, 0); - vkCmdCopyBuffer(batch->state->cmdbuf, src->obj->buffer, dst->obj->buffer, 1, &region); + zink_resource_buffer_barrier(ctx, src, VK_ACCESS_TRANSFER_READ_BIT, 0); + zink_resource_buffer_barrier(ctx, dst, VK_ACCESS_TRANSFER_WRITE_BIT, 0); + VKCTX(CmdCopyBuffer)(batch->state->cmdbuf, src->obj->buffer, dst->obj->buffer, 1, &region); } void -zink_copy_image_buffer(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *dst, struct zink_resource *src, +zink_copy_image_buffer(struct zink_context *ctx, struct zink_resource *dst, struct zink_resource *src, unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, unsigned src_level, const struct pipe_box *src_box, enum pipe_map_flags map_flags) { struct zink_resource *img = dst->base.b.target == PIPE_BUFFER ? src : dst; struct zink_resource *buf = dst->base.b.target == PIPE_BUFFER ?
dst : src; - - if (!batch) - batch = zink_batch_no_rp(ctx); + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); bool buf2img = buf == src; if (buf2img) { - zink_resource_image_barrier(ctx, batch, img, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0); - zink_resource_buffer_barrier(ctx, batch, buf, VK_ACCESS_TRANSFER_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); + zink_resource_image_barrier(ctx, img, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0, 0); + zink_resource_buffer_barrier(ctx, buf, VK_ACCESS_TRANSFER_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); } else { - zink_resource_image_barrier(ctx, batch, img, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 0, 0); - zink_resource_buffer_barrier(ctx, batch, buf, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); + zink_resource_image_barrier(ctx, img, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 0, 0); + zink_resource_buffer_barrier(ctx, buf, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); util_range_add(&dst->base.b, &dst->valid_buffer_range, dstx, dstx + src_box->width); } @@ -2915,9 +3451,9 @@ zink_copy_image_buffer(struct zink_context *ctx, struct zink_batch *batch, struc * - vkCmdCopyBufferToImage spec */ if (buf2img) - vkCmdCopyBufferToImage(batch->state->cmdbuf, buf->obj->buffer, img->obj->image, img->layout, 1, ®ion); + VKCTX(CmdCopyBufferToImage)(batch->state->cmdbuf, buf->obj->buffer, img->obj->image, img->layout, 1, ®ion); else - vkCmdCopyImageToBuffer(batch->state->cmdbuf, img->obj->image, img->layout, buf->obj->buffer, 1, ®ion); + VKCTX(CmdCopyImageToBuffer)(batch->state->cmdbuf, img->obj->image, img->layout, buf->obj->buffer, 1, ®ion); } } @@ -3009,19 +3545,20 @@ zink_resource_copy_region(struct pipe_context *pctx, region.extent.width = src_box->width; region.extent.height = src_box->height; - struct zink_batch *batch = zink_batch_no_rp(ctx); + struct zink_batch *batch = &ctx->batch; + zink_batch_no_rp(ctx); zink_batch_reference_resource_rw(batch, src, false); zink_batch_reference_resource_rw(batch, dst, true); zink_resource_setup_transfer_layouts(ctx, src, dst); - vkCmdCopyImage(batch->state->cmdbuf, src->obj->image, src->layout, + VKCTX(CmdCopyImage)(batch->state->cmdbuf, src->obj->image, src->layout, dst->obj->image, dst->layout, 1, ®ion); } else if (dst->base.b.target == PIPE_BUFFER && src->base.b.target == PIPE_BUFFER) { - zink_copy_buffer(ctx, NULL, dst, src, dstx, src_box->x, src_box->width); + zink_copy_buffer(ctx, dst, src, dstx, src_box->x, src_box->width); } else - zink_copy_image_buffer(ctx, NULL, dst, src, dst_level, dstx, dsty, dstz, src_level, src_box, 0); + zink_copy_image_buffer(ctx, dst, src, dst_level, dstx, dsty, dstz, src_level, src_box, 0); } static struct pipe_stream_output_target * @@ -3051,7 +3588,7 @@ zink_create_stream_output_target(struct pipe_context *pctx, t->base.buffer_offset = buffer_offset; t->base.buffer_size = buffer_size; - zink_resource(t->base.buffer)->bind_history |= ZINK_RESOURCE_USAGE_STREAMOUT; + zink_resource(t->base.buffer)->so_valid = true; return &t->base; } @@ -3075,8 +3612,16 @@ zink_set_stream_output_targets(struct pipe_context *pctx, struct zink_context *ctx = zink_context(pctx); if (num_targets == 0) { - for (unsigned i = 0; i < ctx->num_so_targets; i++) + for (unsigned i = 0; i < ctx->num_so_targets; i++) { + if (ctx->so_targets[i]) { + struct zink_resource *so = zink_resource(ctx->so_targets[i]->buffer); + if (so) { + so->so_bind_count--; + update_res_bind_count(ctx, so, false, true); + } + } pipe_so_target_reference(&ctx->so_targets[i], NULL); + } 
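/* The so_bind_count bookkeeping above pairs with update_res_bind_count():
 * each stream-output target that points at a zink_resource bumps the count
 * on bind and drops it on unbind, and rebind_buffer() further down keys off
 * res->so_bind_count to decide whether a buffer still has a transform-
 * feedback binding that needs refreshing. */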
       ctx->num_so_targets = 0;
    } else {
       for (unsigned i = 0; i < num_targets; i++) {
@@ -3093,9 +3638,22 @@ zink_set_stream_output_targets(struct pipe_context *pctx,
          ctx->xfb_barrier |= zink_resource_buffer_needs_barrier(res,
                                                                 VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT,
                                                                 VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT);
+         struct zink_resource *so = zink_resource(ctx->so_targets[i]->buffer);
+         if (so) {
+            so->so_bind_count++;
+            update_res_bind_count(ctx, so, false, false);
+         }
       }
-      for (unsigned i = num_targets; i < ctx->num_so_targets; i++)
+      for (unsigned i = num_targets; i < ctx->num_so_targets; i++) {
+         if (ctx->so_targets[i]) {
+            struct zink_resource *so = zink_resource(ctx->so_targets[i]->buffer);
+            if (so) {
+               so->so_bind_count--;
+               update_res_bind_count(ctx, so, false, true);
+            }
+         }
         pipe_so_target_reference(&ctx->so_targets[i], NULL);
+      }
       ctx->num_so_targets = num_targets;
 
       /* TODO: possibly avoid rebinding on resume if resuming from same buffers? */
@@ -3108,97 +3666,208 @@ zink_rebind_framebuffer(struct zink_context *ctx, struct zink_resource *res)
 {
    if (!ctx->framebuffer)
       return;
-   for (unsigned i = 0; i < ctx->framebuffer->state.num_attachments; i++) {
-      if (!ctx->framebuffer->surfaces[i] ||
-          zink_resource(ctx->framebuffer->surfaces[i]->texture) != res)
-         continue;
-      zink_rebind_surface(ctx, &ctx->framebuffer->surfaces[i]);
-      zink_batch_no_rp(ctx);
+   bool did_rebind = false;
+   if (res->aspect & VK_IMAGE_ASPECT_COLOR_BIT) {
+      for (unsigned i = 0; i < ctx->fb_state.nr_cbufs; i++) {
+         if (!ctx->fb_state.cbufs[i] ||
+             zink_resource(ctx->fb_state.cbufs[i]->texture) != res)
+            continue;
+         zink_rebind_ctx_surface(ctx, &ctx->fb_state.cbufs[i]);
+         did_rebind = true;
+      }
+   } else {
+      if (ctx->fb_state.zsbuf && zink_resource(ctx->fb_state.zsbuf->texture) == res) {
+         zink_rebind_ctx_surface(ctx, &ctx->fb_state.zsbuf);
+         did_rebind = true;
+      }
+   }
+
+   did_rebind |= rebind_fb_state(ctx, res, false);
+
+   if (!did_rebind)
+      return;
+
+   zink_batch_no_rp(ctx);
+   if (zink_screen(ctx->base.screen)->info.have_KHR_imageless_framebuffer) {
+      struct zink_framebuffer *fb = ctx->get_framebuffer(ctx);
+      ctx->fb_changed |= ctx->framebuffer != fb;
+      ctx->framebuffer = fb;
    }
-   if (rebind_fb_state(ctx, res, false))
-      zink_batch_no_rp(ctx);
 }
 
-static void
-rebind_buffer(struct zink_context *ctx, struct zink_resource *res)
+ALWAYS_INLINE static struct zink_resource *
+rebind_ubo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot)
 {
-   const unsigned total_rebinds = res->bind_count[0] + res->bind_count[1];
-   unsigned num_rebinds = 0, num_image_rebinds_remaining[2] = {res->image_bind_count[0], res->image_bind_count[1]};
+   struct zink_resource *res = update_descriptor_state_ubo(ctx, shader, slot,
+                                                           ctx->di.descriptor_res[ZINK_DESCRIPTOR_TYPE_UBO][shader][slot]);
+   zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_UBO, slot, 1);
+   return res;
+}
+
+ALWAYS_INLINE static struct zink_resource *
+rebind_ssbo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot)
+{
+   const struct pipe_shader_buffer *ssbo = &ctx->ssbos[shader][slot];
+   struct zink_resource *res = zink_resource(ssbo->buffer);
+   if (!res)
+      return NULL;
+   util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
+                  ssbo->buffer_offset + ssbo->buffer_size);
+   update_descriptor_state_ssbo(ctx, shader, slot, res);
+   zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_SSBO, slot, 1);
+   return res;
+}
+
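The rebind_ubo()/rebind_ssbo() helpers above, like the rebind_tbo()/rebind_ibo() helpers that follow, reduce to the same two steps: re-point the cached descriptor state at the resource's (possibly replaced) buffer object, then invalidate that descriptor slot so the next draw re-emits it. The sketch below mirrors only the control flow rebind_buffer() uses to drive such helpers from a threaded-context rebind mask; every demo_* name is hypothetical and stands in for the zink-specific machinery.

#include <stdint.h>

/* One callback per binding type; each rebinds every slot of that type and
 * returns how many slots it touched. */
enum demo_bind_type { DEMO_BIND_VBO, DEMO_BIND_UBO, DEMO_BIND_SSBO, DEMO_BIND_COUNT };
typedef unsigned (*demo_rebind_fn)(void *ctx);

static unsigned
demo_rebind_buffer(void *ctx, uint32_t mask, unsigned expected,
                   const demo_rebind_fn fns[DEMO_BIND_COUNT])
{
   unsigned done = 0;
   if (!mask) /* a zero mask means "check every binding type" */
      mask = (1u << DEMO_BIND_COUNT) - 1;
   for (unsigned i = 0; i < DEMO_BIND_COUNT; i++) {
      if (mask & (1u << i))
         done += fns[i](ctx); /* rebind all slots of this binding type */
      if (expected && done >= expected)
         break; /* every known reference handled; skip remaining scans */
   }
   return done; /* callers compare against `expected` to detect missed slots */
}

With mask == 0 and expected == 0 this degenerates to a full scan of every binding type, which matches how zink_resource_rebind() below calls rebind_buffer(ctx, res, 0, 0).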
+ALWAYS_INLINE static struct zink_resource * +rebind_tbo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot) +{ + struct zink_sampler_view *sampler_view = zink_sampler_view(ctx->sampler_views[shader][slot]); + if (!sampler_view || sampler_view->base.texture->target != PIPE_BUFFER) + return NULL; + struct zink_resource *res = zink_resource(sampler_view->base.texture); + if (zink_batch_usage_exists(sampler_view->buffer_view->batch_uses)) + zink_batch_reference_bufferview(&ctx->batch, sampler_view->buffer_view); + VkBufferViewCreateInfo bvci = sampler_view->buffer_view->bvci; + bvci.buffer = res->obj->buffer; + zink_buffer_view_reference(zink_screen(ctx->base.screen), &sampler_view->buffer_view, NULL); + sampler_view->buffer_view = get_buffer_view(ctx, res, &bvci); + update_descriptor_state_sampler(ctx, shader, slot, res); + zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW, slot, 1); + return res; +} + +ALWAYS_INLINE static struct zink_resource * +rebind_ibo(struct zink_context *ctx, enum pipe_shader_type shader, unsigned slot) +{ + struct zink_image_view *image_view = &ctx->image_views[shader][slot]; + struct zink_resource *res = zink_resource(image_view->base.resource); + if (!res || res->base.b.target != PIPE_BUFFER) + return NULL; + zink_descriptor_set_refs_clear(&image_view->buffer_view->desc_set_refs, image_view->buffer_view); + if (zink_batch_usage_exists(image_view->buffer_view->batch_uses)) + zink_batch_reference_bufferview(&ctx->batch, image_view->buffer_view); + VkBufferViewCreateInfo bvci = image_view->buffer_view->bvci; + bvci.buffer = res->obj->buffer; + zink_buffer_view_reference(zink_screen(ctx->base.screen), &image_view->buffer_view, NULL); + if (!zink_resource_object_init_storage(ctx, res)) { + debug_printf("couldn't create storage image!"); + return NULL; + } + image_view->buffer_view = get_buffer_view(ctx, res, &bvci); + assert(image_view->buffer_view); + util_range_add(&res->base.b, &res->valid_buffer_range, image_view->base.u.buf.offset, + image_view->base.u.buf.offset + image_view->base.u.buf.size); + update_descriptor_state_image(ctx, shader, slot, res); + zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_IMAGE, slot, 1); + return res; +} + +static unsigned +rebind_buffer(struct zink_context *ctx, struct zink_resource *res, uint32_t rebind_mask, const unsigned expected_num_rebinds) +{ + unsigned num_rebinds = 0; bool has_write = false; - if (res->vbo_bind_mask) { + if (!zink_resource_has_binds(res)) + return 0; + + assert(!res->bindless[1]); //TODO + if ((rebind_mask & BITFIELD_BIT(TC_BINDING_STREAMOUT_BUFFER)) || (!rebind_mask && res->so_bind_count && ctx->num_so_targets)) { + for (unsigned i = 0; i < ctx->num_so_targets; i++) { + if (ctx->so_targets[i]) { + struct zink_resource *so = zink_resource(ctx->so_targets[i]->buffer); + if (so && so == res) { + ctx->dirty_so_targets = true; + num_rebinds++; + } + } + } + rebind_mask &= ~BITFIELD_BIT(TC_BINDING_STREAMOUT_BUFFER); + } + if (num_rebinds && expected_num_rebinds >= num_rebinds && !rebind_mask) + goto end; + + if ((rebind_mask & BITFIELD_BIT(TC_BINDING_VERTEX_BUFFER)) || (!rebind_mask && res->vbo_bind_mask)) { u_foreach_bit(slot, res->vbo_bind_mask) { if (ctx->vertex_buffers[slot].buffer.resource != &res->base.b) //wrong context - return; + goto end; set_vertex_buffer_clamped(ctx, slot); num_rebinds++; } + rebind_mask &= ~BITFIELD_BIT(TC_BINDING_VERTEX_BUFFER); 
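/* Each phase of rebind_buffer() follows the shape of the vertex-buffer block
 * above: handle one TC_BINDING_* class, clear its bit in rebind_mask, then
 * take the `goto end` fast path once expected_num_rebinds slots are done and
 * no mask bits remain, so the heavier UBO/SSBO/sampler/image scans below run
 * only when something is still outstanding. */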
ctx->vertex_buffers_dirty = true; } - for (unsigned shader = 0; num_rebinds < total_rebinds && shader < PIPE_SHADER_TYPES; shader++) { + if (num_rebinds && expected_num_rebinds >= num_rebinds && !rebind_mask) + goto end; + + const uint32_t ubo_mask = rebind_mask ? + rebind_mask & BITFIELD_RANGE(TC_BINDING_UBO_VS, PIPE_SHADER_TYPES) : + ((res->ubo_bind_count[0] ? BITFIELD_RANGE(TC_BINDING_UBO_VS, (PIPE_SHADER_TYPES - 1)) : 0) | + (res->ubo_bind_count[1] ? BITFIELD_BIT(TC_BINDING_UBO_CS) : 0)); + u_foreach_bit(shader, ubo_mask >> TC_BINDING_UBO_VS) { u_foreach_bit(slot, res->ubo_bind_mask[shader]) { if (&res->base.b != ctx->ubos[shader][slot].buffer) //wrong context - return; - - update_descriptor_state_ubo(ctx, shader, slot); - zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_UBO, slot, 1); + goto end; + rebind_ubo(ctx, shader, slot); num_rebinds++; } + } + rebind_mask &= ~BITFIELD_RANGE(TC_BINDING_UBO_VS, PIPE_SHADER_TYPES); + if (num_rebinds && expected_num_rebinds >= num_rebinds && !rebind_mask) + goto end; + + const unsigned ssbo_mask = rebind_mask ? + rebind_mask & BITFIELD_RANGE(TC_BINDING_SSBO_VS, PIPE_SHADER_TYPES) : + BITFIELD_RANGE(TC_BINDING_SSBO_VS, PIPE_SHADER_TYPES); + u_foreach_bit(shader, ssbo_mask >> TC_BINDING_SSBO_VS) { u_foreach_bit(slot, res->ssbo_bind_mask[shader]) { struct pipe_shader_buffer *ssbo = &ctx->ssbos[shader][slot]; if (&res->base.b != ssbo->buffer) //wrong context - return; - - has_write |= ctx->writable_ssbos[shader] & BITFIELD64_BIT(slot); - util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset, - ssbo->buffer_offset + ssbo->buffer_size); - update_descriptor_state_ssbo(ctx, shader, slot); - zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_SSBO, slot, 1); + goto end; + rebind_ssbo(ctx, shader, slot); + has_write |= (ctx->writable_ssbos[shader] & BITFIELD64_BIT(slot)) != 0; num_rebinds++; } + } + rebind_mask &= ~BITFIELD_RANGE(TC_BINDING_SSBO_VS, PIPE_SHADER_TYPES); + if (num_rebinds && expected_num_rebinds >= num_rebinds && !rebind_mask) + goto end; + const unsigned sampler_mask = rebind_mask ? 
+ rebind_mask & BITFIELD_RANGE(TC_BINDING_SAMPLERVIEW_VS, PIPE_SHADER_TYPES) : + BITFIELD_RANGE(TC_BINDING_SAMPLERVIEW_VS, PIPE_SHADER_TYPES); + u_foreach_bit(shader, sampler_mask >> TC_BINDING_SAMPLERVIEW_VS) { u_foreach_bit(slot, res->sampler_binds[shader]) { struct zink_sampler_view *sampler_view = zink_sampler_view(ctx->sampler_views[shader][slot]); if (&res->base.b != sampler_view->base.texture) //wrong context - return; - - if (zink_batch_usage_exists(sampler_view->buffer_view->batch_uses)) - zink_batch_reference_bufferview(&ctx->batch, sampler_view->buffer_view); - zink_buffer_view_reference(zink_screen(ctx->base.screen), &sampler_view->buffer_view, NULL); - sampler_view->buffer_view = get_buffer_view(ctx, res, sampler_view->base.format, - sampler_view->base.u.buf.offset, sampler_view->base.u.buf.size); - update_descriptor_state_sampler(ctx, shader, slot); - zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW, slot, 1); + goto end; + rebind_tbo(ctx, shader, slot); num_rebinds++; } - if (unlikely(num_image_rebinds_remaining[shader == PIPE_SHADER_COMPUTE])) { - for (unsigned slot = 0; num_image_rebinds_remaining[shader == PIPE_SHADER_COMPUTE] && - slot < ctx->di.num_images[shader]; slot++) { - struct zink_resource *cres = zink_get_resource_for_descriptor(ctx, ZINK_DESCRIPTOR_TYPE_IMAGE, shader, slot); - if (res != cres) - continue; + } + rebind_mask &= ~BITFIELD_RANGE(TC_BINDING_SAMPLERVIEW_VS, PIPE_SHADER_TYPES); + if (num_rebinds && expected_num_rebinds >= num_rebinds && !rebind_mask) + goto end; - struct zink_image_view *image_view = &ctx->image_views[shader][slot]; - zink_descriptor_set_refs_clear(&image_view->buffer_view->desc_set_refs, image_view->buffer_view); - if (zink_batch_usage_exists(image_view->buffer_view->batch_uses)) - zink_batch_reference_bufferview(&ctx->batch, image_view->buffer_view); - zink_buffer_view_reference(zink_screen(ctx->base.screen), &image_view->buffer_view, NULL); - if (!zink_resource_object_init_storage(ctx, res)) { - debug_printf("couldn't create storage image!"); - continue; - } - has_write |= image_view->base.access & PIPE_IMAGE_ACCESS_WRITE; - image_view->buffer_view = get_buffer_view(ctx, res, image_view->base.format, - image_view->base.u.buf.offset, image_view->base.u.buf.size); - assert(image_view->buffer_view); - util_range_add(&res->base.b, &res->valid_buffer_range, image_view->base.u.buf.offset, - image_view->base.u.buf.offset + image_view->base.u.buf.size); - update_descriptor_state_image(ctx, shader, slot); - zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, shader, ZINK_DESCRIPTOR_TYPE_IMAGE, slot, 1); - num_image_rebinds_remaining[shader == PIPE_SHADER_COMPUTE]--; - } + const unsigned image_mask = rebind_mask ? + rebind_mask & BITFIELD_RANGE(TC_BINDING_IMAGE_VS, PIPE_SHADER_TYPES) : + BITFIELD_RANGE(TC_BINDING_IMAGE_VS, PIPE_SHADER_TYPES); + unsigned num_image_rebinds_remaining = rebind_mask ? 
expected_num_rebinds - num_rebinds : res->image_bind_count[0] + res->image_bind_count[1]; + u_foreach_bit(shader, image_mask >> TC_BINDING_IMAGE_VS) { + for (unsigned slot = 0; num_image_rebinds_remaining && slot < ctx->di.num_images[shader]; slot++) { + struct zink_resource *cres = ctx->di.descriptor_res[ZINK_DESCRIPTOR_TYPE_IMAGE][shader][slot]; + if (res != cres) + continue; + + rebind_ibo(ctx, shader, slot); + const struct zink_image_view *image_view = &ctx->image_views[shader][slot]; + has_write |= (image_view->base.access & PIPE_IMAGE_ACCESS_WRITE) != 0; + num_image_rebinds_remaining--; + num_rebinds++; } } +end: zink_batch_resource_usage_set(&ctx->batch, res, has_write); + return num_rebinds; } static bool @@ -3223,7 +3892,7 @@ static void rebind_image(struct zink_context *ctx, struct zink_resource *res) { zink_rebind_framebuffer(ctx, res); - if (!res->bind_count[0] && !res->bind_count[1]) + if (!zink_resource_has_binds(res)) return; for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) { if (res->sampler_binds[i]) { @@ -3234,7 +3903,7 @@ rebind_image(struct zink_context *ctx, struct zink_resource *res) zink_rebind_surface(ctx, &psurf); sv->image_view = zink_surface(psurf); zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, i, ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW, j, 1); - update_descriptor_state_sampler(ctx, i, j); + update_descriptor_state_sampler(ctx, i, j, res); } } } @@ -3243,27 +3912,58 @@ rebind_image(struct zink_context *ctx, struct zink_resource *res) for (unsigned j = 0; j < ctx->di.num_images[i]; j++) { if (zink_resource(ctx->image_views[i][j].base.resource) == res) { zink_screen(ctx->base.screen)->context_invalidate_descriptor_state(ctx, i, ZINK_DESCRIPTOR_TYPE_IMAGE, j, 1); - update_descriptor_state_sampler(ctx, i, j); + update_descriptor_state_sampler(ctx, i, j, res); _mesa_set_add(ctx->need_barriers[i == PIPE_SHADER_COMPUTE], res); } } } } -void +bool zink_resource_rebind(struct zink_context *ctx, struct zink_resource *res) { - if (res->bind_history & ZINK_RESOURCE_USAGE_STREAMOUT) - ctx->dirty_so_targets = true; - /* force counter buffer reset */ - res->bind_history &= ~ZINK_RESOURCE_USAGE_STREAMOUT; + if (res->base.b.target == PIPE_BUFFER) { + /* force counter buffer reset */ + res->so_valid = false; + return rebind_buffer(ctx, res, 0, 0) == res->bind_count[0] + res->bind_count[1]; + } + rebind_image(ctx, res); + return false; +} - if (!res->bind_count[0] && !res->bind_count[1]) - return; - if (res->base.b.target == PIPE_BUFFER) - rebind_buffer(ctx, res); - else - rebind_image(ctx, res); +void +zink_rebind_all_buffers(struct zink_context *ctx) +{ + struct zink_batch *batch = &ctx->batch; + u_foreach_bit(slot, ctx->gfx_pipeline_state.vertex_buffers_enabled_mask) + set_vertex_buffer_clamped(ctx, slot); + ctx->vertex_buffers_dirty = ctx->gfx_pipeline_state.vertex_buffers_enabled_mask > 0; + ctx->dirty_so_targets = ctx->num_so_targets > 0; + if (ctx->num_so_targets) + zink_resource_buffer_barrier(ctx, zink_resource(ctx->dummy_xfb_buffer), + VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT); + for (unsigned shader = PIPE_SHADER_VERTEX; shader < PIPE_SHADER_TYPES; shader++) { + for (unsigned slot = 0; slot < ctx->di.num_ubos[shader]; slot++) { + struct zink_resource *res = rebind_ubo(ctx, shader, slot); + if (res) + zink_batch_resource_usage_set(batch, res, false); + } + for (unsigned slot = 0; slot < ctx->di.num_sampler_views[shader]; slot++) { + struct zink_resource *res = rebind_tbo(ctx, shader, slot); + if (res) + 
zink_batch_resource_usage_set(batch, res, false); + } + for (unsigned slot = 0; slot < ctx->di.num_ssbos[shader]; slot++) { + struct zink_resource *res = rebind_ssbo(ctx, shader, slot); + if (res) + zink_batch_resource_usage_set(batch, res, (ctx->writable_ssbos[shader] & BITFIELD64_BIT(slot)) != 0); + } + for (unsigned slot = 0; slot < ctx->di.num_images[shader]; slot++) { + struct zink_resource *res = rebind_ibo(ctx, shader, slot); + if (res) + zink_batch_resource_usage_set(batch, res, (ctx->image_views[shader][slot].base.access & PIPE_IMAGE_ACCESS_WRITE) != 0); + } + } } static void @@ -3273,16 +3973,23 @@ zink_context_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resou { struct zink_resource *d = zink_resource(dst); struct zink_resource *s = zink_resource(src); + struct zink_context *ctx = zink_context(pctx); + struct zink_screen *screen = zink_screen(pctx->screen); assert(d->internal_format == s->internal_format); - util_idalloc_mt_free(&zink_screen(pctx->screen)->buffer_ids, delete_buffer_id); - if (zink_resource_has_unflushed_usage(d)) - zink_batch_reference_resource(&zink_context(pctx)->batch, d); - zink_resource_object_reference(zink_screen(pctx->screen), &d->obj, s->obj); - d->access = s->access; - d->access_stage = s->access_stage; - d->unordered_barrier = s->unordered_barrier; - zink_resource_rebind(zink_context(pctx), d); + assert(d->obj); + assert(s->obj); + util_idalloc_mt_free(&screen->buffer_ids, delete_buffer_id); + zink_descriptor_set_refs_clear(&d->obj->desc_set_refs, d->obj); + /* add a ref just like check_resource_for_batch_ref() would've */ + if (zink_resource_has_binds(d) && zink_resource_has_usage(d)) + zink_batch_reference_resource(&ctx->batch, d); + /* don't be too creative */ + zink_resource_object_reference(screen, &d->obj, s->obj); + /* force counter buffer reset */ + d->so_valid = false; + if (num_rebinds && rebind_buffer(ctx, d, rebind_mask, num_rebinds) < num_rebinds) + ctx->buffer_rebind_counter = p_atomic_inc_return(&screen->buffer_rebind_counter); } static bool @@ -3336,6 +4043,7 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) ctx->gfx_pipeline_state.dirty = true; ctx->compute_pipeline_state.dirty = true; ctx->fb_changed = ctx->rp_changed = true; + ctx->gfx_pipeline_state.gfx_prim_mode = PIPE_PRIM_MAX; zink_init_draw_functions(ctx, screen); zink_init_grid_functions(ctx); @@ -3343,6 +4051,14 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) ctx->base.screen = pscreen; ctx->base.priv = priv; + if (screen->info.have_KHR_imageless_framebuffer) { + ctx->get_framebuffer = zink_get_framebuffer_imageless; + ctx->init_framebuffer = zink_init_framebuffer_imageless; + } else { + ctx->get_framebuffer = zink_get_framebuffer; + ctx->init_framebuffer = zink_init_framebuffer; + } + ctx->base.destroy = zink_context_destroy; ctx->base.get_device_reset_status = zink_get_device_reset_status; ctx->base.set_device_reset_callback = zink_set_device_reset_callback; @@ -3374,6 +4090,7 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) ctx->base.set_clip_state = zink_set_clip_state; ctx->base.set_blend_color = zink_set_blend_color; ctx->base.set_tess_state = zink_set_tess_state; + ctx->base.set_patch_vertices = zink_set_patch_vertices; ctx->base.set_sample_mask = zink_set_sample_mask; @@ -3412,9 +4129,9 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) ctx->need_barriers[1] = &ctx->update_barriers[1][0]; util_dynarray_init(&ctx->free_batch_states, 
ctx); - _mesa_hash_table_init(&ctx->batch_states, ctx, NULL, _mesa_key_pointer_equal); ctx->gfx_pipeline_state.have_EXT_extended_dynamic_state = screen->info.have_EXT_extended_dynamic_state; + ctx->gfx_pipeline_state.have_EXT_extended_dynamic_state2 = screen->info.have_EXT_extended_dynamic_state2; slab_create_child(&ctx->transfer_pool, &screen->transfer_pool); slab_create_child(&ctx->transfer_pool_unsync, &screen->transfer_pool); @@ -3428,16 +4145,19 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!ctx->blitter) goto fail; - ctx->program_cache = _mesa_hash_table_create(NULL, - hash_gfx_program, - equals_gfx_program); - ctx->compute_program_cache = _mesa_hash_table_create(NULL, - _mesa_hash_pointer, - _mesa_key_pointer_equal); + ctx->gfx_pipeline_state.shader_keys.last_vertex.key.vs_base.last_vertex_stage = true; + ctx->last_vertex_stage_dirty = true; + ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_VERTEX].size = sizeof(struct zink_vs_key_base); + ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_TESS_EVAL].size = sizeof(struct zink_vs_key_base); + ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_GEOMETRY].size = sizeof(struct zink_vs_key_base); + ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_FRAGMENT].size = sizeof(struct zink_fs_key); + _mesa_hash_table_init(&ctx->compute_program_cache, ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); + _mesa_hash_table_init(&ctx->framebuffer_cache, ctx, hash_framebuffer_imageless, equals_framebuffer_imageless); + _mesa_set_init(&ctx->render_pass_state_cache, ctx, hash_rp_state, equals_rp_state); ctx->render_pass_cache = _mesa_hash_table_create(NULL, hash_render_pass_state, equals_render_pass_state); - if (!ctx->program_cache || !ctx->compute_program_cache || !ctx->render_pass_cache) + if (!ctx->render_pass_cache) goto fail; const uint8_t data[] = {0}; @@ -3449,10 +4169,15 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) PIPE_BIND_STREAM_OUTPUT, PIPE_USAGE_DEFAULT, sizeof(data)); if (!ctx->dummy_xfb_buffer) goto fail; - ctx->dummy_surface = zink_surface_create_null(ctx, PIPE_TEXTURE_2D, 1, 1, 1); - if (!ctx->dummy_surface) - goto fail; - ctx->dummy_bufferview = get_buffer_view(ctx, zink_resource(ctx->dummy_vertex_buffer), PIPE_FORMAT_R8_UNORM, 0, sizeof(data)); + for (unsigned i = 0; i < ARRAY_SIZE(ctx->dummy_surface); i++) { + if (!(screen->info.props.limits.framebufferDepthSampleCounts & BITFIELD_BIT(i))) + continue; + ctx->dummy_surface[i] = zink_surface_create_null(ctx, PIPE_TEXTURE_2D, 1024, 1024, BITFIELD_BIT(i)); + if (!ctx->dummy_surface[i]) + goto fail; + } + VkBufferViewCreateInfo bvci = create_bvci(ctx, zink_resource(ctx->dummy_vertex_buffer), PIPE_FORMAT_R8_UNORM, 0, sizeof(data)); + ctx->dummy_bufferview = get_buffer_view(ctx, zink_resource(ctx->dummy_vertex_buffer), &bvci); if (!ctx->dummy_bufferview) goto fail; @@ -3465,6 +4190,27 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) goto fail; } + ctx->base.create_texture_handle = zink_create_texture_handle; + ctx->base.delete_texture_handle = zink_delete_texture_handle; + ctx->base.make_texture_handle_resident = zink_make_texture_handle_resident; + ctx->base.create_image_handle = zink_create_image_handle; + ctx->base.delete_image_handle = zink_delete_image_handle; + ctx->base.make_image_handle_resident = zink_make_image_handle_resident; + for (unsigned i = 0; i < 2; i++) { + _mesa_hash_table_init(&ctx->di.bindless[i].img_handles, ctx, _mesa_hash_pointer, 
_mesa_key_pointer_equal); + _mesa_hash_table_init(&ctx->di.bindless[i].tex_handles, ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); + + /* allocate 1024 slots and reserve slot 0 */ + util_idalloc_init(&ctx->di.bindless[i].tex_slots, ZINK_MAX_BINDLESS_HANDLES); + util_idalloc_alloc(&ctx->di.bindless[i].tex_slots); + util_idalloc_init(&ctx->di.bindless[i].img_slots, ZINK_MAX_BINDLESS_HANDLES); + util_idalloc_alloc(&ctx->di.bindless[i].img_slots); + ctx->di.bindless[i].buffer_infos = malloc(sizeof(VkImageView) * ZINK_MAX_BINDLESS_HANDLES); + ctx->di.bindless[i].img_infos = malloc(sizeof(VkDescriptorImageInfo) * ZINK_MAX_BINDLESS_HANDLES); + util_dynarray_init(&ctx->di.bindless[i].updates, NULL); + util_dynarray_init(&ctx->di.bindless[i].resident, NULL); + } + ctx->have_timelines = screen->info.have_KHR_timeline_semaphore; simple_mtx_init(&ctx->batch_mtx, mtx_plain); zink_start_batch(ctx, &ctx->batch); @@ -3477,12 +4223,14 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) { /* need to update these based on screen config for null descriptors */ for (unsigned j = 0; j < 32; j++) { - update_descriptor_state_ubo(ctx, i, j); - update_descriptor_state_sampler(ctx, i, j); - update_descriptor_state_ssbo(ctx, i, j); - update_descriptor_state_image(ctx, i, j); + update_descriptor_state_ubo(ctx, i, j, NULL); + update_descriptor_state_sampler(ctx, i, j, NULL); + update_descriptor_state_ssbo(ctx, i, j, NULL); + update_descriptor_state_image(ctx, i, j, NULL); } } + if (!screen->info.rb2_feats.nullDescriptor) + ctx->di.fbfetch.imageView = zink_csurface(ctx->dummy_surface[0])->image_view; p_atomic_inc(&screen->base.num_contexts); zink_select_draw_vbo(ctx); @@ -3494,8 +4242,13 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) struct threaded_context *tc = (struct threaded_context*)threaded_context_create(&ctx->base, &screen->transfer_pool, zink_context_replace_buffer_storage, - zink_create_tc_fence_for_tc, - zink_context_is_resource_busy, true, &ctx->tc); + &(struct threaded_context_options){ + .create_fence = zink_create_tc_fence_for_tc, + .is_resource_busy = zink_context_is_resource_busy, + .driver_calls_flush_notify = true, + .unsynchronized_get_device_reset_status = true, + }, + &ctx->tc); if (tc && (struct zink_context*)tc != ctx) { threaded_context_init_bytes_mapped_limit(tc, 4); diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_context.h b/mesa 3D driver/src/gallium/drivers/zink/zink_context.h index f4ebce713e..a0ed22a1f5 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_context.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_context.h @@ -24,11 +24,14 @@ #ifndef ZINK_CONTEXT_H #define ZINK_CONTEXT_H +#define ZINK_FBFETCH_BINDING 6 //COMPUTE+1 #define ZINK_SHADER_COUNT (PIPE_SHADER_TYPES - 1) #define ZINK_DEFAULT_MAX_DESCS 5000 #define ZINK_DEFAULT_DESC_CLAMP (ZINK_DEFAULT_MAX_DESCS * 0.9) +#define ZINK_MAX_BINDLESS_HANDLES 1024 + #include "zink_clear.h" #include "zink_pipeline.h" #include "zink_batch.h" @@ -40,7 +43,7 @@ #include "pipe/p_state.h" #include "util/u_rect.h" #include "util/u_threaded_context.h" - +#include "util/u_idalloc.h" #include "util/slab.h" #include "util/list.h" #include "util/u_dynarray.h" @@ -79,6 +82,7 @@ struct zink_sampler_state { struct zink_buffer_view { struct pipe_reference reference; + struct pipe_resource *pres; VkBufferViewCreateInfo bvci; VkBufferView buffer_view; uint32_t hash; @@ -137,6 +141,19 @@ struct zink_descriptor_surface { bool 
is_buffer; }; +struct zink_bindless_descriptor { + struct zink_descriptor_surface ds; + struct zink_sampler_state *sampler; + uint32_t handle; + uint32_t access; //PIPE_ACCESS_... +}; + +static inline struct zink_resource * +zink_descriptor_surface_resource(struct zink_descriptor_surface *ds) +{ + return ds->is_buffer ? (struct zink_resource*)ds->bufferview->pres : (struct zink_resource*)ds->surface->base.texture; +} + typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -156,6 +173,16 @@ typedef enum { ZINK_DYNAMIC_STATE, } zink_dynamic_state; +typedef enum { + ZINK_NO_DYNAMIC_STATE2, + ZINK_DYNAMIC_STATE2, +} zink_dynamic_state2; + +typedef enum { + ZINK_NO_DYNAMIC_VERTEX_INPUT, + ZINK_DYNAMIC_VERTEX_INPUT, +} zink_dynamic_vertex_input; + struct zink_context { struct pipe_context base; struct threaded_context *tc; @@ -168,12 +195,11 @@ struct zink_context { struct pipe_device_reset_callback reset; - uint32_t curr_batch; //the current batch id - simple_mtx_t batch_mtx; struct zink_fence *deferred_fence; struct zink_fence *last_fence; //the last command buffer submitted - struct hash_table batch_states; //submitted batch states + struct zink_batch_state *batch_states; //list of submitted batch states: ordered by increasing timeline id + unsigned batch_states_count; //number of states in `batch_states` struct util_dynarray free_batch_states; //unused batch states bool oom_flush; bool oom_stall; @@ -181,7 +207,7 @@ struct zink_context { unsigned shader_has_inlinable_uniforms_mask; unsigned inlinable_uniforms_valid_mask; - uint32_t inlinable_uniforms[PIPE_SHADER_TYPES][MAX_INLINABLE_UNIFORMS]; + uint32_t compute_inlinable_uniforms[MAX_INLINABLE_UNIFORMS]; struct pipe_constant_buffer ubos[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; struct pipe_shader_buffer ssbos[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS]; @@ -189,12 +215,16 @@ struct zink_context { struct zink_image_view image_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; struct pipe_framebuffer_state fb_state; + struct zink_framebuffer *(*get_framebuffer)(struct zink_context*); + void (*init_framebuffer)(struct zink_screen *screen, struct zink_framebuffer *fb, struct zink_render_pass *rp); + struct hash_table framebuffer_cache; struct zink_vertex_elements_state *element_state; struct zink_rasterizer_state *rast_state; struct zink_depth_stencil_alpha_state *dsa_state; struct hash_table desc_set_layouts[ZINK_DESCRIPTOR_TYPES]; + struct set desc_pool_keys[ZINK_DESCRIPTOR_TYPES]; bool pipeline_changed[2]; //gfx, compute struct zink_shader *gfx_stages[ZINK_SHADER_COUNT]; @@ -202,20 +232,25 @@ struct zink_context { bool shader_reads_drawid; bool shader_reads_basevertex; struct zink_gfx_pipeline_state gfx_pipeline_state; - enum pipe_prim_type gfx_prim_mode; - struct hash_table *program_cache; + /* there are 5 gfx stages, but VS and FS are assumed to be always present, + * thus only 3 stages need to be considered, giving 2^3 = 8 program caches. 
+ */ + struct hash_table program_cache[8]; + uint32_t gfx_hash; struct zink_gfx_program *curr_program; struct zink_descriptor_data *dd; struct zink_shader *compute_stage; struct zink_compute_pipeline_state compute_pipeline_state; - struct hash_table *compute_program_cache; + struct hash_table compute_program_cache; struct zink_compute_program *curr_compute; + unsigned shader_stages : ZINK_SHADER_COUNT; /* mask of bound gfx shader stages */ unsigned dirty_shader_stages : 6; /* mask of changed shader stages */ bool last_vertex_stage_dirty; + struct set render_pass_state_cache; struct hash_table *render_pass_cache; bool new_swapchain; bool fb_changed; @@ -225,6 +260,7 @@ struct zink_context { struct zink_framebuffer_clear fb_clears[PIPE_MAX_COLOR_BUFS + 1]; uint16_t clears_enabled; uint16_t rp_clears_enabled; + uint16_t fbfetch_outputs; VkBuffer vbufs[PIPE_MAX_ATTRIBS]; unsigned vbuf_offsets[PIPE_MAX_ATTRIBS]; @@ -244,8 +280,6 @@ struct zink_context { VkSampleLocationEXT vk_sample_locations[PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE]; uint8_t sample_locations[2 * 4 * 8 * 16]; - bool drawid_broken; - struct pipe_stencil_ref stencil_ref; union { @@ -266,9 +300,11 @@ struct zink_context { struct pipe_resource *dummy_vertex_buffer; struct pipe_resource *dummy_xfb_buffer; - struct pipe_surface *dummy_surface; + struct pipe_surface *dummy_surface[7]; struct zink_buffer_view *dummy_bufferview; + unsigned buffer_rebind_counter; + struct { /* descriptor info */ VkDescriptorBufferInfo ubos[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; @@ -287,13 +323,32 @@ struct zink_context { VkBufferView texel_images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; uint8_t num_images[PIPE_SHADER_TYPES]; + VkDescriptorImageInfo fbfetch; + struct zink_resource *descriptor_res[ZINK_DESCRIPTOR_TYPES][PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; struct zink_descriptor_surface sampler_surfaces[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; struct zink_descriptor_surface image_surfaces[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; + + struct { + struct util_idalloc tex_slots; + struct util_idalloc img_slots; + struct hash_table tex_handles; + struct hash_table img_handles; + VkBufferView *buffer_infos; //tex, img + VkDescriptorImageInfo *img_infos; //tex, img + struct util_dynarray updates; + struct util_dynarray resident; + } bindless[2]; //img, buffer + union { + bool bindless_dirty[2]; //tex, img + uint16_t any_bindless_dirty; + }; + bool bindless_refs_dirty; } di; struct set *need_barriers[2]; //gfx, compute struct set update_barriers[2][2]; //[gfx, compute][current, next] uint8_t barrier_set_idx[2]; + unsigned memory_barrier; uint32_t num_so_targets; struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_OUTPUTS]; @@ -302,7 +357,10 @@ struct zink_context { bool first_frame_done; bool have_timelines; + bool gfx_dirty; + bool is_device_lost; + bool primitive_restart; bool vertex_state_changed : 1; bool blend_state_changed : 1; bool rast_state_changed : 1; @@ -335,12 +393,13 @@ zink_check_batch_completion(struct zink_context *ctx, uint32_t batch_id, bool ha void zink_flush_queue(struct zink_context *ctx); - +void +zink_update_fbfetch(struct zink_context *ctx); bool zink_resource_access_is_write(VkAccessFlags flags); void -zink_resource_buffer_barrier(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *res, VkAccessFlags flags, VkPipelineStageFlags pipeline); +zink_resource_buffer_barrier(struct zink_context *ctx, struct zink_resource *res, VkAccessFlags flags, VkPipelineStageFlags 
pipeline); void zink_fake_buffer_barrier(struct zink_resource *res, VkAccessFlags flags, VkPipelineStageFlags pipeline); bool @@ -348,41 +407,33 @@ zink_resource_image_needs_barrier(struct zink_resource *res, VkImageLayout new_l bool zink_resource_image_barrier_init(VkImageMemoryBarrier *imb, struct zink_resource *res, VkImageLayout new_layout, VkAccessFlags flags, VkPipelineStageFlags pipeline); void -zink_resource_image_barrier(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *res, +zink_resource_image_barrier(struct zink_context *ctx, struct zink_resource *res, VkImageLayout new_layout, VkAccessFlags flags, VkPipelineStageFlags pipeline); bool zink_resource_needs_barrier(struct zink_resource *res, VkImageLayout layout, VkAccessFlags flags, VkPipelineStageFlags pipeline); void -zink_resource_barrier(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *res, VkImageLayout layout, VkAccessFlags flags, VkPipelineStageFlags pipeline); -void zink_update_descriptor_refs(struct zink_context *ctx, bool compute); void zink_init_vk_sample_locations(struct zink_context *ctx, VkSampleLocationsInfoEXT *loc); void -zink_begin_render_pass(struct zink_context *ctx, - struct zink_batch *batch); +zink_begin_render_pass(struct zink_context *ctx); void -zink_end_render_pass(struct zink_context *ctx, struct zink_batch *batch); +zink_end_render_pass(struct zink_context *ctx); -static inline struct zink_batch * +static inline void zink_batch_rp(struct zink_context *ctx) { - struct zink_batch *batch = &ctx->batch; - if (!batch->in_rp) { - zink_begin_render_pass(ctx, batch); - } - return batch; + if (!ctx->batch.in_rp) + zink_begin_render_pass(ctx); } -static inline struct zink_batch * +static inline void zink_batch_no_rp(struct zink_context *ctx) { - struct zink_batch *batch = &ctx->batch; - zink_end_render_pass(ctx, batch); - assert(!batch->in_rp); - return batch; + zink_end_render_pass(ctx); + assert(!ctx->batch.in_rp); } static inline VkPipelineStageFlags @@ -406,6 +457,11 @@ zink_pipeline_flags_from_pipe_stage(enum pipe_shader_type pstage) } } +void +zink_rebind_all_buffers(struct zink_context *ctx); + +void +zink_flush_memory_barrier(struct zink_context *ctx, bool is_compute); void zink_init_draw_functions(struct zink_context *ctx, struct zink_screen *screen); void @@ -447,18 +503,37 @@ zink_rect_from_box(const struct pipe_box *box) return (struct u_rect){box->x, box->x + box->width, box->y, box->y + box->height}; } -void +static inline VkComponentSwizzle +zink_component_mapping(enum pipe_swizzle swizzle) +{ + switch (swizzle) { + case PIPE_SWIZZLE_X: return VK_COMPONENT_SWIZZLE_R; + case PIPE_SWIZZLE_Y: return VK_COMPONENT_SWIZZLE_G; + case PIPE_SWIZZLE_Z: return VK_COMPONENT_SWIZZLE_B; + case PIPE_SWIZZLE_W: return VK_COMPONENT_SWIZZLE_A; + case PIPE_SWIZZLE_0: return VK_COMPONENT_SWIZZLE_ZERO; + case PIPE_SWIZZLE_1: return VK_COMPONENT_SWIZZLE_ONE; + case PIPE_SWIZZLE_NONE: return VK_COMPONENT_SWIZZLE_IDENTITY; // ??? 
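/* PIPE_SWIZZLE_NONE marks a component the state tracker leaves unspecified,
 * so identity appears to be a conservative placeholder here (hence, perhaps,
 * the author's "???") rather than a mapping anything actually depends on. */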
+ default: + unreachable("unexpected swizzle"); + } +} + +enum pipe_swizzle +zink_clamp_void_swizzle(const struct util_format_description *desc, enum pipe_swizzle swizzle); + +bool zink_resource_rebind(struct zink_context *ctx, struct zink_resource *res); void zink_rebind_framebuffer(struct zink_context *ctx, struct zink_resource *res); void -zink_copy_buffer(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *dst, struct zink_resource *src, +zink_copy_buffer(struct zink_context *ctx, struct zink_resource *dst, struct zink_resource *src, unsigned dst_offset, unsigned src_offset, unsigned size); void -zink_copy_image_buffer(struct zink_context *ctx, struct zink_batch *batch, struct zink_resource *dst, struct zink_resource *src, +zink_copy_image_buffer(struct zink_context *ctx, struct zink_resource *dst, struct zink_resource *src, unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, unsigned src_level, const struct pipe_box *src_box, enum pipe_map_flags map_flags); diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.c b/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.c index ccbb4be355..2a9c770da9 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.c @@ -44,8 +44,8 @@ struct zink_descriptor_pool { struct hash_table *desc_sets; struct hash_table *free_desc_sets; struct util_dynarray alloc_desc_sets; + const struct zink_descriptor_pool_key *key; VkDescriptorPool descpool; - struct zink_descriptor_pool_key key; unsigned num_resources; unsigned num_sets_allocated; simple_mtx_t mtx; @@ -89,6 +89,7 @@ struct zink_program_descriptor_data_cached { struct zink_descriptor_set *last_set[ZINK_DESCRIPTOR_TYPES]; unsigned num_refs[ZINK_DESCRIPTOR_TYPES]; union zink_program_descriptor_refs *refs[ZINK_DESCRIPTOR_TYPES]; + unsigned cache_misses[ZINK_DESCRIPTOR_TYPES]; }; @@ -105,6 +106,7 @@ batch_add_desc_set(struct zink_batch *batch, struct zink_descriptor_set *zds) !batch_ptr_add_usage(batch, batch->state->dd->desc_sets, zds)) return false; pipe_reference(NULL, &zds->reference); + pipe_reference(NULL, &zds->pool->reference); zink_batch_usage_set(&zds->batch_uses, batch->state); return true; } @@ -211,7 +213,7 @@ static void descriptor_set_invalidate(struct zink_descriptor_set *zds) { zds->invalid = true; - for (unsigned i = 0; i < zds->pool->key.layout->num_descriptors; i++) { + for (unsigned i = 0; i < zds->pool->key->layout->num_bindings; i++) { switch (zds->pool->type) { case ZINK_DESCRIPTOR_TYPE_UBO: case ZINK_DESCRIPTOR_TYPE_SSBO: @@ -250,13 +252,14 @@ descriptor_set_invalidate(struct zink_descriptor_set *zds) } } -#ifndef NDEBUG static void descriptor_pool_clear(struct hash_table *ht) { - _mesa_hash_table_clear(ht, NULL); + hash_table_foreach(ht, entry) { + struct zink_descriptor_set *zds = entry->data; + descriptor_set_invalidate(zds); + } } -#endif static void descriptor_pool_free(struct zink_screen *screen, struct zink_descriptor_pool *pool) @@ -264,15 +267,13 @@ descriptor_pool_free(struct zink_screen *screen, struct zink_descriptor_pool *po if (!pool) return; if (pool->descpool) - vkDestroyDescriptorPool(screen->dev, pool->descpool, NULL); + VKSCR(DestroyDescriptorPool)(screen->dev, pool->descpool, NULL); simple_mtx_lock(&pool->mtx); -#ifndef NDEBUG if (pool->desc_sets) descriptor_pool_clear(pool->desc_sets); if (pool->free_desc_sets) descriptor_pool_clear(pool->free_desc_sets); -#endif if (pool->desc_sets) _mesa_hash_table_destroy(pool->desc_sets, 
NULL); if (pool->free_desc_sets) @@ -284,27 +285,29 @@ descriptor_pool_free(struct zink_screen *screen, struct zink_descriptor_pool *po ralloc_free(pool); } +static void +descriptor_pool_delete(struct zink_context *ctx, struct zink_descriptor_pool *pool) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (!pool) + return; + _mesa_hash_table_remove_key(ctx->dd->descriptor_pools[pool->type], pool->key); + descriptor_pool_free(screen, pool); +} + static struct zink_descriptor_pool * descriptor_pool_create(struct zink_screen *screen, enum zink_descriptor_type type, - struct zink_descriptor_layout_key *layout_key, VkDescriptorPoolSize *sizes, unsigned num_type_sizes) + const struct zink_descriptor_pool_key *pool_key) { struct zink_descriptor_pool *pool = rzalloc(NULL, struct zink_descriptor_pool); if (!pool) return NULL; pipe_reference_init(&pool->reference, 1); pool->type = type; - pool->key.layout = layout_key; - pool->key.num_type_sizes = num_type_sizes; - size_t types_size = num_type_sizes * sizeof(VkDescriptorPoolSize); - pool->key.sizes = ralloc_size(pool, types_size); - if (!pool->key.sizes) { - ralloc_free(pool); - return NULL; - } - memcpy(pool->key.sizes, sizes, types_size); + pool->key = pool_key; simple_mtx_init(&pool->mtx, mtx_plain); - for (unsigned i = 0; i < layout_key->num_descriptors; i++) { - pool->num_resources += layout_key->bindings[i].descriptorCount; + for (unsigned i = 0; i < pool_key->layout->num_bindings; i++) { + pool->num_resources += pool_key->layout->bindings[i].descriptorCount; } pool->desc_sets = _mesa_hash_table_create(NULL, desc_state_hash, desc_state_equal); if (!pool->desc_sets) @@ -318,11 +321,11 @@ descriptor_pool_create(struct zink_screen *screen, enum zink_descriptor_type typ VkDescriptorPoolCreateInfo dpci = {0}; dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - dpci.pPoolSizes = sizes; - dpci.poolSizeCount = num_type_sizes; + dpci.pPoolSizes = pool_key->sizes; + dpci.poolSizeCount = pool_key->sizes[1].descriptorCount ? 
2 : 1; dpci.flags = 0; dpci.maxSets = ZINK_DEFAULT_MAX_DESCS; - if (vkCreateDescriptorPool(screen->dev, &dpci, 0, &pool->descpool) != VK_SUCCESS) { + if (VKSCR(CreateDescriptorPool)(screen->dev, &dpci, 0, &pool->descpool) != VK_SUCCESS) { debug_printf("vkCreateDescriptorPool failed\n"); goto fail; } @@ -359,14 +362,14 @@ descriptor_layout_create(struct zink_screen *screen, enum zink_descriptor_type t supp.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT; supp.pNext = NULL; supp.supported = VK_FALSE; - if (screen->vk.GetDescriptorSetLayoutSupport) { - screen->vk.GetDescriptorSetLayoutSupport(screen->dev, &dcslci, &supp); + if (VKSCR(GetDescriptorSetLayoutSupport)) { + VKSCR(GetDescriptorSetLayoutSupport)(screen->dev, &dcslci, &supp); if (supp.supported == VK_FALSE) { debug_printf("vkGetDescriptorSetLayoutSupport claims layout is unsupported\n"); return VK_NULL_HANDLE; } } - if (vkCreateDescriptorSetLayout(screen->dev, &dcslci, 0, &dsl) != VK_SUCCESS) + if (VKSCR(CreateDescriptorSetLayout)(screen->dev, &dcslci, 0, &dsl) != VK_SUCCESS) debug_printf("vkCreateDescriptorSetLayout failed\n"); return dsl; } @@ -376,8 +379,10 @@ hash_descriptor_layout(const void *key) { uint32_t hash = 0; const struct zink_descriptor_layout_key *k = key; - hash = XXH32(&k->num_descriptors, sizeof(unsigned), hash); - hash = XXH32(k->bindings, k->num_descriptors * sizeof(VkDescriptorSetLayoutBinding), hash); + hash = XXH32(&k->num_bindings, sizeof(unsigned), hash); + /* only hash first 3 members: no holes and the rest are always constant */ + for (unsigned i = 0; i < k->num_bindings; i++) + hash = XXH32(&k->bindings[i], offsetof(VkDescriptorSetLayoutBinding, stageFlags), hash); return hash; } @@ -387,8 +392,35 @@ equals_descriptor_layout(const void *a, const void *b) { const struct zink_descriptor_layout_key *a_k = a; const struct zink_descriptor_layout_key *b_k = b; - return a_k->num_descriptors == b_k->num_descriptors && - !memcmp(a_k->bindings, b_k->bindings, a_k->num_descriptors * sizeof(VkDescriptorSetLayoutBinding)); + return a_k->num_bindings == b_k->num_bindings && + !memcmp(a_k->bindings, b_k->bindings, a_k->num_bindings * sizeof(VkDescriptorSetLayoutBinding)); +} + +static struct zink_descriptor_layout * +create_layout(struct zink_context *ctx, enum zink_descriptor_type type, + VkDescriptorSetLayoutBinding *bindings, unsigned num_bindings, + struct zink_descriptor_layout_key **layout_key) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + VkDescriptorSetLayout dsl = descriptor_layout_create(screen, type, bindings, MAX2(num_bindings, 1)); + if (!dsl) + return NULL; + + struct zink_descriptor_layout_key *k = ralloc(ctx, struct zink_descriptor_layout_key); + k->num_bindings = num_bindings; + size_t bindings_size = MAX2(num_bindings, 1) * sizeof(VkDescriptorSetLayoutBinding); + k->bindings = ralloc_size(k, bindings_size); + if (!k->bindings) { + ralloc_free(k); + VKSCR(DestroyDescriptorSetLayout)(screen->dev, dsl, NULL); + return NULL; + } + memcpy(k->bindings, bindings, bindings_size); + + struct zink_descriptor_layout *layout = rzalloc(ctx, struct zink_descriptor_layout); + layout->layout = dsl; + *layout_key = k; + return layout; } struct zink_descriptor_layout * @@ -396,10 +428,9 @@ zink_descriptor_util_layout_get(struct zink_context *ctx, enum zink_descriptor_t VkDescriptorSetLayoutBinding *bindings, unsigned num_bindings, struct zink_descriptor_layout_key **layout_key) { - struct zink_screen *screen = zink_screen(ctx->base.screen); uint32_t hash = 0; struct 
zink_descriptor_layout_key key = { - .num_descriptors = num_bindings, + .num_bindings = num_bindings, .bindings = bindings, }; @@ -424,47 +455,112 @@ zink_descriptor_util_layout_get(struct zink_context *ctx, enum zink_descriptor_t } } - VkDescriptorSetLayout dsl = descriptor_layout_create(screen, type, key.bindings, MAX2(num_bindings, 1)); - if (!dsl) - return VK_NULL_HANDLE; - - struct zink_descriptor_layout_key *k = ralloc(ctx, struct zink_descriptor_layout_key); - k->num_descriptors = num_bindings; - size_t bindings_size = MAX2(num_bindings, 1) * sizeof(VkDescriptorSetLayoutBinding); - k->bindings = ralloc_size(k, bindings_size); - if (!k->bindings) { - ralloc_free(k); - vkDestroyDescriptorSetLayout(screen->dev, dsl, NULL); - return VK_NULL_HANDLE; + struct zink_descriptor_layout *layout = create_layout(ctx, type, bindings ? bindings : &null_binding, num_bindings, layout_key); + if (layout && type != ZINK_DESCRIPTOR_TYPES) { + _mesa_hash_table_insert_pre_hashed(&ctx->desc_set_layouts[type], hash, *layout_key, layout); } - memcpy(k->bindings, key.bindings, bindings_size); - - struct zink_descriptor_layout *layout = rzalloc(ctx, struct zink_descriptor_layout); - layout->layout = dsl; - if (type != ZINK_DESCRIPTOR_TYPES) { - _mesa_hash_table_insert_pre_hashed(&ctx->desc_set_layouts[type], hash, k, layout); - } - *layout_key = k; return layout; } + +static uint32_t +hash_descriptor_pool_key(const void *key) +{ + uint32_t hash = 0; + const struct zink_descriptor_pool_key *k = key; + hash = XXH32(&k->layout, sizeof(void*), hash); + const unsigned num_type_sizes = k->sizes[1].descriptorCount ? 2 : 1; + for (unsigned i = 0; i < num_type_sizes; i++) + hash = XXH32(&k->sizes[i], sizeof(VkDescriptorPoolSize), hash); + + return hash; +} + +static bool +equals_descriptor_pool_key(const void *a, const void *b) +{ + const struct zink_descriptor_pool_key *a_k = a; + const struct zink_descriptor_pool_key *b_k = b; + const unsigned a_num_type_sizes = a_k->sizes[1].descriptorCount ? 2 : 1; + const unsigned b_num_type_sizes = b_k->sizes[1].descriptorCount ? 
2 : 1; + return a_k->layout == b_k->layout && + a_num_type_sizes == b_num_type_sizes && + !memcmp(a_k->sizes, b_k->sizes, b_num_type_sizes * sizeof(VkDescriptorPoolSize)); +} + +struct zink_descriptor_pool_key * +zink_descriptor_util_pool_key_get(struct zink_context *ctx, enum zink_descriptor_type type, + struct zink_descriptor_layout_key *layout_key, + VkDescriptorPoolSize *sizes, unsigned num_type_sizes) +{ + uint32_t hash = 0; + struct zink_descriptor_pool_key key; + if (type != ZINK_DESCRIPTOR_TYPES) { + key.layout = layout_key; + key.sizes[1].descriptorCount = 0; + memcpy(key.sizes, sizes, num_type_sizes * sizeof(VkDescriptorPoolSize)); + hash = hash_descriptor_pool_key(&key); + struct set_entry *he = _mesa_set_search_pre_hashed(&ctx->desc_pool_keys[type], hash, &key); + if (he) + return (void*)he->key; + } + + struct zink_descriptor_pool_key *pool_key = rzalloc(ctx, struct zink_descriptor_pool_key); + pool_key->layout = layout_key; + memcpy(pool_key->sizes, sizes, num_type_sizes * sizeof(VkDescriptorPoolSize)); + if (type != ZINK_DESCRIPTOR_TYPES) + _mesa_set_add_pre_hashed(&ctx->desc_pool_keys[type], hash, pool_key); + return pool_key; +} + +static void +init_push_binding(VkDescriptorSetLayoutBinding *binding, unsigned i, VkDescriptorType type) +{ + binding->binding = tgsi_processor_to_shader_stage(i); + binding->descriptorType = type; + binding->descriptorCount = 1; + binding->stageFlags = zink_shader_stage(i); + binding->pImmutableSamplers = NULL; +} + +static VkDescriptorType +get_push_types(struct zink_screen *screen, enum zink_descriptor_type *dsl_type) +{ + *dsl_type = screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY && + screen->info.have_KHR_push_descriptor ? ZINK_DESCRIPTOR_TYPES : ZINK_DESCRIPTOR_TYPE_UBO; + return screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY ? + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; +} + +static struct zink_descriptor_layout * +create_gfx_layout(struct zink_context *ctx, struct zink_descriptor_layout_key **layout_key, bool fbfetch) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + VkDescriptorSetLayoutBinding bindings[PIPE_SHADER_TYPES]; + enum zink_descriptor_type dsl_type; + VkDescriptorType vktype = get_push_types(screen, &dsl_type); + for (unsigned i = 0; i < ZINK_SHADER_COUNT; i++) + init_push_binding(&bindings[i], i, vktype); + if (fbfetch) { + bindings[ZINK_SHADER_COUNT].binding = ZINK_FBFETCH_BINDING; + bindings[ZINK_SHADER_COUNT].descriptorType = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; + bindings[ZINK_SHADER_COUNT].descriptorCount = 1; + bindings[ZINK_SHADER_COUNT].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + bindings[ZINK_SHADER_COUNT].pImmutableSamplers = NULL; + } + return create_layout(ctx, dsl_type, bindings, fbfetch ? ARRAY_SIZE(bindings) : ARRAY_SIZE(bindings) - 1, layout_key); +} + bool zink_descriptor_util_push_layouts_get(struct zink_context *ctx, struct zink_descriptor_layout **dsls, struct zink_descriptor_layout_key **layout_keys) { struct zink_screen *screen = zink_screen(ctx->base.screen); - VkDescriptorSetLayoutBinding bindings[PIPE_SHADER_TYPES]; - for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) { - bindings[i].binding = tgsi_processor_to_shader_stage(i); - bindings[i].descriptorType = screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY ? 
- VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; - bindings[i].descriptorCount = 1; - bindings[i].stageFlags = zink_shader_stage(i); - bindings[i].pImmutableSamplers = NULL; - } - enum zink_descriptor_type dsl_type = screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY && - screen->info.have_KHR_push_descriptor ? ZINK_DESCRIPTOR_TYPES : ZINK_DESCRIPTOR_TYPE_UBO; - dsls[0] = zink_descriptor_util_layout_get(ctx, dsl_type, bindings, ZINK_SHADER_COUNT, &layout_keys[0]); - dsls[1] = zink_descriptor_util_layout_get(ctx, dsl_type, &bindings[PIPE_SHADER_COMPUTE], 1, &layout_keys[1]); + VkDescriptorSetLayoutBinding compute_binding; + enum zink_descriptor_type dsl_type; + VkDescriptorType vktype = get_push_types(screen, &dsl_type); + init_push_binding(&compute_binding, PIPE_SHADER_COMPUTE, vktype); + dsls[0] = create_gfx_layout(ctx, &layout_keys[0], false); + dsls[1] = create_layout(ctx, dsl_type, &compute_binding, 1, &layout_keys[1]); return dsls[0] && dsls[1]; } @@ -487,12 +583,18 @@ zink_descriptor_util_init_null_set(struct zink_context *ctx, VkDescriptorSet des zink_resource(ctx->dummy_vertex_buffer)->obj->buffer; push_info.offset = 0; push_info.range = VK_WHOLE_SIZE; - vkUpdateDescriptorSets(screen->dev, 1, &push_wd, 0, NULL); + VKSCR(UpdateDescriptorSets)(screen->dev, 1, &push_wd, 0, NULL); } VkImageLayout zink_descriptor_util_image_layout_eval(const struct zink_resource *res, bool is_compute) { + if (res->bindless[0] || res->bindless[1]) { + /* bindless needs most permissive layout */ + if (res->image_bind_count[0] || res->image_bind_count[1]) + return VK_IMAGE_LAYOUT_GENERAL; + return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + } return res->image_bind_count[is_compute] ? VK_IMAGE_LAYOUT_GENERAL : res->aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT) ? 
//Vulkan-Docs#1490 @@ -504,48 +606,23 @@ zink_descriptor_util_image_layout_eval(const struct zink_resource *res, bool is_ VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; } -static uint32_t -hash_descriptor_pool(const void *key) -{ - uint32_t hash = 0; - const struct zink_descriptor_pool_key *k = key; - hash = XXH32(&k->num_type_sizes, sizeof(unsigned), hash); - hash = XXH32(&k->layout, sizeof(k->layout), hash); - hash = XXH32(k->sizes, k->num_type_sizes * sizeof(VkDescriptorPoolSize), hash); - - return hash; -} - -static bool -equals_descriptor_pool(const void *a, const void *b) -{ - const struct zink_descriptor_pool_key *a_k = a; - const struct zink_descriptor_pool_key *b_k = b; - return a_k->num_type_sizes == b_k->num_type_sizes && - a_k->layout == b_k->layout && - !memcmp(a_k->sizes, b_k->sizes, a_k->num_type_sizes * sizeof(VkDescriptorPoolSize)); -} - static struct zink_descriptor_pool * descriptor_pool_get(struct zink_context *ctx, enum zink_descriptor_type type, - struct zink_descriptor_layout_key *layout_key, VkDescriptorPoolSize *sizes, unsigned num_type_sizes) + const struct zink_descriptor_pool_key *pool_key) { uint32_t hash = 0; if (type != ZINK_DESCRIPTOR_TYPES) { - struct zink_descriptor_pool_key key = { - .layout = layout_key, - .num_type_sizes = num_type_sizes, - .sizes = sizes, - }; - - hash = hash_descriptor_pool(&key); - struct hash_entry *he = _mesa_hash_table_search_pre_hashed(ctx->dd->descriptor_pools[type], hash, &key); - if (he) - return (void*)he->data; + hash = hash_descriptor_pool_key(pool_key); + struct hash_entry *he = _mesa_hash_table_search_pre_hashed(ctx->dd->descriptor_pools[type], hash, pool_key); + if (he) { + struct zink_descriptor_pool *pool = he->data; + pipe_reference(NULL, &pool->reference); + return pool; + } } - struct zink_descriptor_pool *pool = descriptor_pool_create(zink_screen(ctx->base.screen), type, layout_key, sizes, num_type_sizes); + struct zink_descriptor_pool *pool = descriptor_pool_create(zink_screen(ctx->base.screen), type, pool_key); if (type != ZINK_DESCRIPTOR_TYPES) - _mesa_hash_table_insert_pre_hashed(ctx->dd->descriptor_pools[type], hash, &pool->key, pool); + _mesa_hash_table_insert_pre_hashed(ctx->dd->descriptor_pools[type], hash, pool_key, pool); return pool; } @@ -571,32 +648,13 @@ zink_descriptor_util_alloc_sets(struct zink_screen *screen, VkDescriptorSetLayou layouts[i] = dsl; dsai.pSetLayouts = layouts; - if (vkAllocateDescriptorSets(screen->dev, &dsai, sets) != VK_SUCCESS) { + if (VKSCR(AllocateDescriptorSets)(screen->dev, &dsai, sets) != VK_SUCCESS) { debug_printf("ZINK: %" PRIu64 " failed to allocate descriptor set :/\n", (uint64_t)dsl); return false; } return true; } -unsigned -zink_descriptor_program_num_sizes(struct zink_program *pg, enum zink_descriptor_type type) -{ - switch (type) { - case ZINK_DESCRIPTOR_TYPE_UBO: - return 1; - case ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW: - return !!pg->dd->sizes[ZDS_INDEX_COMBINED_SAMPLER].descriptorCount + - !!pg->dd->sizes[ZDS_INDEX_UNIFORM_TEXELS].descriptorCount; - case ZINK_DESCRIPTOR_TYPE_SSBO: - return 1; - case ZINK_DESCRIPTOR_TYPE_IMAGE: - return !!pg->dd->sizes[ZDS_INDEX_STORAGE_IMAGE].descriptorCount + - !!pg->dd->sizes[ZDS_INDEX_STORAGE_TEXELS].descriptorCount; - default: break; - } - unreachable("unknown type"); -} - static struct zink_descriptor_set * allocate_desc_set(struct zink_context *ctx, struct zink_program *pg, enum zink_descriptor_type type, unsigned descs_used, bool is_compute) { @@ -604,8 +662,8 @@ allocate_desc_set(struct zink_context *ctx, struct zink_program *pg, 
enum zink_d bool push_set = type == ZINK_DESCRIPTOR_TYPES; struct zink_descriptor_pool *pool = push_set ? ctx->dd->push_pool[is_compute] : pdd_cached(pg)->pool[type]; #define DESC_BUCKET_FACTOR 10 - unsigned bucket_size = pool->key.layout->num_descriptors ? DESC_BUCKET_FACTOR : 1; - if (pool->key.layout->num_descriptors) { + unsigned bucket_size = pool->key->layout->num_bindings ? DESC_BUCKET_FACTOR : 1; + if (pool->key->layout->num_bindings) { for (unsigned desc_factor = DESC_BUCKET_FACTOR; desc_factor < descs_used; desc_factor *= DESC_BUCKET_FACTOR) bucket_size = desc_factor; } @@ -648,13 +706,13 @@ allocate_desc_set(struct zink_context *ctx, struct zink_program *pg, enum zink_d #endif switch (type) { case ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW: - zds->sampler_states = (struct zink_sampler_state**)&samplers[i * pool->key.layout->num_descriptors]; + zds->sampler_states = (struct zink_sampler_state**)&samplers[i * pool->key->layout->num_bindings]; FALLTHROUGH; case ZINK_DESCRIPTOR_TYPE_IMAGE: - zds->surfaces = &surfaces[i * pool->key.layout->num_descriptors]; + zds->surfaces = &surfaces[i * pool->key->layout->num_bindings]; break; default: - zds->res_objs = (struct zink_resource_object**)&res_objs[i * pool->key.layout->num_descriptors]; + zds->res_objs = (struct zink_resource_object**)&res_objs[i * pool->key->layout->num_bindings]; break; } zds->desc_set = desc_set[i]; @@ -675,8 +733,10 @@ populate_zds_key(struct zink_context *ctx, enum zink_descriptor_type type, bool key->exists[0] = true; if (type == ZINK_DESCRIPTOR_TYPES) key->state[0] = ctx->dd->push_state[is_compute]; - else + else { + assert(ctx->dd->descriptor_states[is_compute].valid[type]); key->state[0] = ctx->dd->descriptor_states[is_compute].state[type]; + } } else if (type == ZINK_DESCRIPTOR_TYPES) { /* gfx only */ for (unsigned i = 0; i < ZINK_SHADER_COUNT; i++) { @@ -720,7 +780,7 @@ zink_descriptor_set_get(struct zink_context *ctx, unsigned descs_used = 1; assert(type <= ZINK_DESCRIPTOR_TYPES); - assert(pool->key.layout->num_descriptors); + assert(pool->key->layout->num_bindings); uint32_t hash = push_set ? ctx->dd->push_state[is_compute] : ctx->dd->descriptor_states[is_compute].state[type]; @@ -838,7 +898,7 @@ zink_descriptor_set_recycle(struct zink_descriptor_set *zds) if (refcount != 1) return; /* this is a null set */ - if (!pool->key.layout->num_descriptors) + if (!pool->key->layout->num_bindings) return; simple_mtx_lock(&pool->mtx); if (zds->punted) @@ -925,7 +985,7 @@ zink_descriptor_set_refs_clear(struct zink_descriptor_refs *refs, void *ptr) } static inline void -zink_descriptor_pool_reference(struct zink_screen *screen, +zink_descriptor_pool_reference(struct zink_context *ctx, struct zink_descriptor_pool **dst, struct zink_descriptor_pool *src) { @@ -933,7 +993,7 @@ zink_descriptor_pool_reference(struct zink_screen *screen, if (pipe_reference_described(old_dst ? 
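/* old_dst may be NULL on first assignment; otherwise drop its ref */ 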
&old_dst->reference : NULL, &src->reference, (debug_reference_descriptor)debug_describe_zink_descriptor_pool)) - descriptor_pool_free(screen, old_dst); + descriptor_pool_delete(ctx, old_dst); if (dst) *dst = src; } @@ -1014,19 +1074,14 @@ zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg) return true; for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { - if (!pg->dd->layout_key[i]) + if (!pg->dd->pool_key[i]) continue; - unsigned idx = zink_descriptor_type_to_size_idx(i); - VkDescriptorPoolSize *size = &pg->dd->sizes[idx]; - /* this is a sampler/image set with no images only texels */ - if (!size->descriptorCount) - size++; - unsigned num_sizes = zink_descriptor_program_num_sizes(pg, i); - struct zink_descriptor_pool *pool = descriptor_pool_get(ctx, i, pg->dd->layout_key[i], size, num_sizes); + const struct zink_descriptor_pool_key *pool_key = pg->dd->pool_key[i]; + struct zink_descriptor_pool *pool = descriptor_pool_get(ctx, i, pool_key); if (!pool) return false; - zink_descriptor_pool_reference(screen, &pdd_cached(pg)->pool[i], pool); + pdd_cached(pg)->pool[i] = pool; if (screen->info.have_KHR_descriptor_update_template && screen->descriptor_mode != ZINK_DESCRIPTOR_MODE_NOTEMPLATES) @@ -1037,25 +1092,21 @@ zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg) } void -zink_descriptor_program_deinit(struct zink_screen *screen, struct zink_program *pg) +zink_descriptor_program_deinit(struct zink_context *ctx, struct zink_program *pg) { if (!pg->dd) return; for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) - zink_descriptor_pool_reference(screen, &pdd_cached(pg)->pool[i], NULL); + zink_descriptor_pool_reference(ctx, &pdd_cached(pg)->pool[i], NULL); - zink_descriptor_program_deinit_lazy(screen, pg); + zink_descriptor_program_deinit_lazy(ctx, pg); } static void zink_descriptor_pool_deinit(struct zink_context *ctx) { - struct zink_screen *screen = zink_screen(ctx->base.screen); for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { - hash_table_foreach(ctx->dd->descriptor_pools[i], entry) { - struct zink_descriptor_pool *pool = (void*)entry->data; - zink_descriptor_pool_reference(screen, &pool, NULL); - } + /* do not free: programs own these pools */ _mesa_hash_table_destroy(ctx->dd->descriptor_pools[i], NULL); } } @@ -1064,17 +1115,23 @@ static bool zink_descriptor_pool_init(struct zink_context *ctx) { for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { - ctx->dd->descriptor_pools[i] = _mesa_hash_table_create(ctx, hash_descriptor_pool, equals_descriptor_pool); + ctx->dd->descriptor_pools[i] = _mesa_hash_table_create(ctx, hash_descriptor_pool_key, equals_descriptor_pool_key); if (!ctx->dd->descriptor_pools[i]) return false; } struct zink_screen *screen = zink_screen(ctx->base.screen); - VkDescriptorPoolSize sizes; - sizes.type = screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; - sizes.descriptorCount = ZINK_SHADER_COUNT * ZINK_DEFAULT_MAX_DESCS; - ctx->dd->push_pool[0] = descriptor_pool_get(ctx, 0, ctx->dd->push_layout_keys[0], &sizes, 1); - sizes.descriptorCount = ZINK_DEFAULT_MAX_DESCS; - ctx->dd->push_pool[1] = descriptor_pool_get(ctx, 0, ctx->dd->push_layout_keys[1], &sizes, 1); + VkDescriptorPoolSize sizes[2]; + sizes[0].type = screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY ? 
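/* lazy mode binds push-set UBOs as plain uniform buffers; cached mode uses dynamic UBOs */ 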
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; + sizes[0].descriptorCount = ZINK_SHADER_COUNT * ZINK_DEFAULT_MAX_DESCS; + sizes[1].type = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; + sizes[1].descriptorCount = ZINK_DEFAULT_MAX_DESCS; + /* these are freed by ralloc */ + struct zink_descriptor_pool_key *pool_key; + pool_key = zink_descriptor_util_pool_key_get(ctx, ZINK_DESCRIPTOR_TYPES, ctx->dd->push_layout_keys[0], sizes, ctx->dd->has_fbfetch ? 2 : 1); + ctx->dd->push_pool[0] = descriptor_pool_get(ctx, 0, pool_key); + sizes[0].descriptorCount = ZINK_DEFAULT_MAX_DESCS; + pool_key = zink_descriptor_util_pool_key_get(ctx, ZINK_DESCRIPTOR_TYPES, ctx->dd->push_layout_keys[1], sizes, 1); + ctx->dd->push_pool[1] = descriptor_pool_get(ctx, 0, pool_key); return ctx->dd->push_pool[0] && ctx->dd->push_pool[1]; } @@ -1157,7 +1214,8 @@ init_write_descriptor(struct zink_shader *shader, struct zink_descriptor_set *zd wd->dstBinding = shader ? shader->bindings[type][idx].binding : idx; wd->dstArrayElement = 0; wd->descriptorCount = shader ? shader->bindings[type][idx].size : 1; - wd->descriptorType = shader ? shader->bindings[type][idx].type : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; + wd->descriptorType = shader ? shader->bindings[type][idx].type : + idx == ZINK_FBFETCH_BINDING ? VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; wd->dstSet = zds->desc_set; return num_wds + 1; } @@ -1167,11 +1225,13 @@ update_push_ubo_descriptors(struct zink_context *ctx, struct zink_descriptor_set bool is_compute, bool cache_hit, uint32_t *dynamic_offsets) { struct zink_screen *screen = zink_screen(ctx->base.screen); - VkWriteDescriptorSet wds[ZINK_SHADER_COUNT]; + VkWriteDescriptorSet wds[ZINK_SHADER_COUNT + 1]; VkDescriptorBufferInfo buffer_infos[ZINK_SHADER_COUNT]; struct zink_shader **stages; + bool fbfetch = false; unsigned num_stages = is_compute ? 1 : ZINK_SHADER_COUNT; + struct zink_program *pg = is_compute ? &ctx->curr_compute->base : &ctx->curr_program->base; if (is_compute) stages = &ctx->curr_compute->shader; else @@ -1190,20 +1250,37 @@ update_push_ubo_descriptors(struct zink_context *ctx, struct zink_descriptor_set * because of this, we have to populate the dynamic offsets by their shader stage to ensure they * match what the driver expects */ - dynamic_offsets[dynamic_idx] = info->offset; + const bool used = (pg->dd->push_usage & BITFIELD_BIT(pstage)) == BITFIELD_BIT(pstage); + dynamic_offsets[dynamic_idx] = used ? 
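/* stages that don't use the push set still occupy a dynamic-offset slot, so zero it */ 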
info->offset : 0; if (!cache_hit) { - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, ZINK_DESCRIPTOR_TYPE_UBO, pstage, 0); init_write_descriptor(NULL, zds, ZINK_DESCRIPTOR_TYPE_UBO, tgsi_processor_to_shader_stage(pstage), &wds[i], 0); - desc_set_res_add(zds, res, i, cache_hit); + if (used) { + desc_set_res_add(zds, ctx->di.descriptor_res[ZINK_DESCRIPTOR_TYPE_UBO][pstage][0], i, cache_hit); + buffer_infos[i].buffer = info->buffer; + buffer_infos[i].range = info->range; + } else { + desc_set_res_add(zds, NULL, i, cache_hit); + if (unlikely(!screen->info.rb2_feats.nullDescriptor)) + buffer_infos[i].buffer = zink_resource(ctx->dummy_vertex_buffer)->obj->buffer; + else + buffer_infos[i].buffer = VK_NULL_HANDLE; + buffer_infos[i].range = VK_WHOLE_SIZE; + } /* these are dynamic UBO descriptors, so we have to always set 0 as the descriptor offset */ - buffer_infos[i] = *info; buffer_infos[i].offset = 0; wds[i].pBufferInfo = &buffer_infos[i]; } } + if (unlikely(!cache_hit && !is_compute && ctx->fbfetch_outputs)) { + struct zink_resource *res = zink_resource(ctx->fb_state.cbufs[0]->texture); + init_write_descriptor(NULL, zds, 0, MESA_SHADER_STAGES, &wds[ZINK_SHADER_COUNT], 0); + desc_set_res_add(zds, res, ZINK_SHADER_COUNT, cache_hit); + wds[ZINK_SHADER_COUNT].pImageInfo = &ctx->di.fbfetch; + fbfetch = true; + } if (!cache_hit) - vkUpdateDescriptorSets(screen->dev, num_stages, wds, 0, NULL); + VKSCR(UpdateDescriptorSets)(screen->dev, num_stages + !!fbfetch, wds, 0, NULL); return num_stages; } @@ -1228,7 +1305,7 @@ set_descriptor_set_refs(struct zink_context *ctx, struct zink_descriptor_set *zd } static void -update_descriptors_internal(struct zink_context *ctx, struct zink_descriptor_set **zds, struct zink_program *pg, bool *cache_hit) +update_descriptors_internal(struct zink_context *ctx, enum zink_descriptor_type type, struct zink_descriptor_set *zds, struct zink_program *pg, bool cache_hit) { struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_shader **stages; @@ -1239,143 +1316,174 @@ update_descriptors_internal(struct zink_context *ctx, struct zink_descriptor_set else stages = &ctx->gfx_stages[0]; - for (unsigned h = 0; h < ZINK_DESCRIPTOR_TYPES; h++) { - if (cache_hit[h] || !zds[h]) - continue; + if (cache_hit || !zds) + return; - if (screen->info.have_KHR_descriptor_update_template && - screen->descriptor_mode != ZINK_DESCRIPTOR_MODE_NOTEMPLATES) { - set_descriptor_set_refs(ctx, zds[h], pg, cache_hit[h]); - zink_descriptor_set_update_lazy(ctx, pg, h, zds[h]->desc_set); - continue; - } - - unsigned num_resources = 0; - ASSERTED unsigned num_bindings = zds[h]->pool->num_resources; - VkWriteDescriptorSet wds[ZINK_MAX_DESCRIPTORS_PER_TYPE]; - unsigned num_wds = 0; - - for (int i = 0; i < num_stages; i++) { - struct zink_shader *shader = stages[i]; - if (!shader) - continue; - enum pipe_shader_type stage = pipe_shader_type_from_mesa(shader->nir->info.stage); - for (int j = 0; j < shader->num_bindings[h]; j++) { - int index = shader->bindings[h][j].index; - switch (h) { - case ZINK_DESCRIPTOR_TYPE_UBO: - if (!index) - continue; - FALLTHROUGH; - case ZINK_DESCRIPTOR_TYPE_SSBO: { - VkDescriptorBufferInfo *info; - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, h, stage, index); - if (h == ZINK_DESCRIPTOR_TYPE_UBO) - info = &ctx->di.ubos[stage][index]; - else - info = &ctx->di.ssbos[stage][index]; - assert(num_resources < num_bindings); - desc_set_res_add(zds[h], res, num_resources++, cache_hit[h]); - wds[num_wds].pBufferInfo = info; - } - 
break; - case ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW: - case ZINK_DESCRIPTOR_TYPE_IMAGE: { - VkDescriptorImageInfo *image_info; - VkBufferView *buffer_info; - if (h == ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW) { - image_info = &ctx->di.textures[stage][index]; - buffer_info = &ctx->di.tbos[stage][index]; - } else { - image_info = &ctx->di.images[stage][index]; - buffer_info = &ctx->di.texel_images[stage][index]; - } - bool is_buffer = zink_shader_descriptor_is_buffer(shader, h, j); - for (unsigned k = 0; k < shader->bindings[h][j].size; k++) { - assert(num_resources < num_bindings); - if (h == ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW) { - struct zink_sampler_state *sampler = NULL; - if (!is_buffer && image_info->imageView) - sampler = ctx->sampler_states[stage][index + k];; - - desc_set_sampler_add(ctx, zds[h], &ctx->di.sampler_surfaces[stage][index + k], sampler, num_resources++, cache_hit[h]); - } else { - struct zink_image_view *image_view = &ctx->image_views[stage][index + k]; - desc_set_image_add(ctx, zds[h], image_view, num_resources++, is_buffer, cache_hit[h]); - } - } - if (is_buffer) - wds[num_wds].pTexelBufferView = buffer_info; - else - wds[num_wds].pImageInfo = image_info; - } - break; - default: - unreachable("unknown descriptor type"); - } - num_wds = init_write_descriptor(shader, zds[h], h, j, &wds[num_wds], num_wds); - } - } - if (num_wds) - vkUpdateDescriptorSets(screen->dev, num_wds, wds, 0, NULL); + if (screen->info.have_KHR_descriptor_update_template && + screen->descriptor_mode != ZINK_DESCRIPTOR_MODE_NOTEMPLATES) { + set_descriptor_set_refs(ctx, zds, pg, cache_hit); + zink_descriptor_set_update_lazy(ctx, pg, type, zds->desc_set); + return; } + + unsigned num_resources = 0; + ASSERTED unsigned num_bindings = zds->pool->num_resources; + VkWriteDescriptorSet wds[ZINK_MAX_DESCRIPTORS_PER_TYPE]; + unsigned num_wds = 0; + + for (int i = 0; i < num_stages; i++) { + struct zink_shader *shader = stages[i]; + if (!shader) + continue; + enum pipe_shader_type stage = pipe_shader_type_from_mesa(shader->nir->info.stage); + for (int j = 0; j < shader->num_bindings[type]; j++) { + int index = shader->bindings[type][j].index; + switch (type) { + case ZINK_DESCRIPTOR_TYPE_UBO: + if (!index) + continue; + FALLTHROUGH; + case ZINK_DESCRIPTOR_TYPE_SSBO: { + VkDescriptorBufferInfo *info; + struct zink_resource *res = ctx->di.descriptor_res[type][stage][index]; + if (type == ZINK_DESCRIPTOR_TYPE_UBO) + info = &ctx->di.ubos[stage][index]; + else + info = &ctx->di.ssbos[stage][index]; + assert(num_resources < num_bindings); + desc_set_res_add(zds, res, num_resources++, cache_hit); + wds[num_wds].pBufferInfo = info; + } + break; + case ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW: + case ZINK_DESCRIPTOR_TYPE_IMAGE: { + VkDescriptorImageInfo *image_info; + VkBufferView *buffer_info; + if (type == ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW) { + image_info = &ctx->di.textures[stage][index]; + buffer_info = &ctx->di.tbos[stage][index]; + } else { + image_info = &ctx->di.images[stage][index]; + buffer_info = &ctx->di.texel_images[stage][index]; + } + bool is_buffer = zink_shader_descriptor_is_buffer(shader, type, j); + for (unsigned k = 0; k < shader->bindings[type][j].size; k++) { + assert(num_resources < num_bindings); + if (type == ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW) { + struct zink_sampler_state *sampler = NULL; + if (!is_buffer && image_info->imageView) + sampler = ctx->sampler_states[stage][index + k]; + + desc_set_sampler_add(ctx, zds, &ctx->di.sampler_surfaces[stage][index + k], sampler, num_resources++, cache_hit); + } else { + 
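/* image/texel-buffer descriptor: track the image view, no sampler state involved */ 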
struct zink_image_view *image_view = &ctx->image_views[stage][index + k]; + desc_set_image_add(ctx, zds, image_view, num_resources++, is_buffer, cache_hit); + } + } + if (is_buffer) + wds[num_wds].pTexelBufferView = buffer_info; + else + wds[num_wds].pImageInfo = image_info; + } + break; + default: + unreachable("unknown descriptor type"); + } + num_wds = init_write_descriptor(shader, zds, type, j, &wds[num_wds], num_wds); + } + } + if (num_wds) + VKSCR(UpdateDescriptorSets)(screen->dev, num_wds, wds, 0, NULL); } static void zink_context_update_descriptor_states(struct zink_context *ctx, struct zink_program *pg); +#define MAX_CACHE_MISSES 50 + void zink_descriptors_update(struct zink_context *ctx, bool is_compute) { struct zink_program *pg = is_compute ? (struct zink_program *)ctx->curr_compute : (struct zink_program *)ctx->curr_program; zink_context_update_descriptor_states(ctx, pg); - bool cache_hit[ZINK_DESCRIPTOR_TYPES + 1]; - VkDescriptorSet sets[ZINK_DESCRIPTOR_TYPES + 1]; - struct zink_descriptor_set *zds[ZINK_DESCRIPTOR_TYPES + 1]; - /* push set is indexed in vulkan as 0 but isn't in the general pool array */ - ctx->dd->changed[is_compute][ZINK_DESCRIPTOR_TYPES] |= ctx->dd->pg[is_compute] != pg; - if (pg->dd->push_usage) - zds[ZINK_DESCRIPTOR_TYPES] = zink_descriptor_set_get(ctx, ZINK_DESCRIPTOR_TYPES, is_compute, &cache_hit[ZINK_DESCRIPTOR_TYPES]); - else { - zds[ZINK_DESCRIPTOR_TYPES] = NULL; - cache_hit[ZINK_DESCRIPTOR_TYPES] = false; - } - ctx->dd->changed[is_compute][ZINK_DESCRIPTOR_TYPES] = false; - sets[0] = zds[ZINK_DESCRIPTOR_TYPES] ? zds[ZINK_DESCRIPTOR_TYPES]->desc_set : ctx->dd->dummy_set; - for (int h = 0; h < ZINK_DESCRIPTOR_TYPES; h++) { - ctx->dd->changed[is_compute][h] |= ctx->dd->pg[is_compute] != pg; - if (pg->dsl[h + 1]) { - /* null set has null pool */ - if (pdd_cached(pg)->pool[h]) - zds[h] = zink_descriptor_set_get(ctx, h, is_compute, &cache_hit[h]); - else - zds[h] = NULL; - /* reuse dummy set for bind */ - sets[h + 1] = zds[h] ? zds[h]->desc_set : ctx->dd->dummy_set; - } else { - zds[h] = NULL; - } - if (!zds[h]) - cache_hit[h] = false; - ctx->dd->changed[is_compute][h] = false; - } + bool cache_hit; + VkDescriptorSet desc_set; + struct zink_descriptor_set *zds; + struct zink_batch *batch = &ctx->batch; - zink_batch_reference_program(batch, pg); + VkPipelineBindPoint bp = is_compute ? VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS; - uint32_t dynamic_offsets[PIPE_MAX_CONSTANT_BUFFERS]; - unsigned dynamic_offset_idx = 0; + { + uint32_t dynamic_offsets[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned dynamic_offset_idx = 0; - if (pg->dd->push_usage) // push set - dynamic_offset_idx = update_push_ubo_descriptors(ctx, zds[ZINK_DESCRIPTOR_TYPES], - is_compute, cache_hit[ZINK_DESCRIPTOR_TYPES], dynamic_offsets); + /* push set is indexed in vulkan as 0 but isn't in the general pool array */ + ctx->dd->changed[is_compute][ZINK_DESCRIPTOR_TYPES] |= ctx->dd->pg[is_compute] != pg; + if (pg->dd->push_usage) { + zds = zink_descriptor_set_get(ctx, ZINK_DESCRIPTOR_TYPES, is_compute, &cache_hit); + } else { + zds = NULL; + cache_hit = false; + } + ctx->dd->changed[is_compute][ZINK_DESCRIPTOR_TYPES] = false; + desc_set = zds ? 
zds->desc_set : ctx->dd->dummy_set; - update_descriptors_internal(ctx, zds, pg, cache_hit); + if (pg->dd->push_usage) // push set + dynamic_offset_idx = update_push_ubo_descriptors(ctx, zds, + is_compute, cache_hit, dynamic_offsets); + VKCTX(CmdBindDescriptorSets)(batch->state->cmdbuf, bp, + pg->layout, 0, 1, &desc_set, + dynamic_offset_idx, dynamic_offsets); + } - vkCmdBindDescriptorSets(batch->state->cmdbuf, is_compute ? VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS, - pg->layout, 0, pg->num_dsl, sets, - dynamic_offset_idx, dynamic_offsets); + { + for (int h = 0; h < ZINK_DESCRIPTOR_TYPES; h++) { + if (pdd_cached(pg)->cache_misses[h] < MAX_CACHE_MISSES) { + ctx->dd->changed[is_compute][h] |= ctx->dd->pg[is_compute] != pg; + if (pg->dsl[h + 1]) { + /* null set has null pool */ + if (pdd_cached(pg)->pool[h]) { + zds = zink_descriptor_set_get(ctx, h, is_compute, &cache_hit); + if (cache_hit) { + pdd_cached(pg)->cache_misses[h] = 0; + } else if (likely(zink_screen(ctx->base.screen)->descriptor_mode != ZINK_DESCRIPTOR_MODE_NOFALLBACK)) { + if (++pdd_cached(pg)->cache_misses[h] == MAX_CACHE_MISSES) { + const char *set_names[] = { + "UBO", + "TEXTURES", + "SSBO", + "IMAGES", + }; + debug_printf("zink: descriptor cache exploded for prog %p set %s: getting lazy (not a bug, just lettin you know)\n", pg, set_names[h]); + } + } + } else + zds = NULL; + /* reuse dummy set for bind */ + desc_set = zds ? zds->desc_set : ctx->dd->dummy_set; + update_descriptors_internal(ctx, h, zds, pg, cache_hit); + + VKCTX(CmdBindDescriptorSets)(batch->state->cmdbuf, bp, + pg->layout, h + 1, 1, &desc_set, + 0, NULL); + if (pdd_cached(pg)->cache_misses[h] == MAX_CACHE_MISSES) + zink_descriptor_pool_reference(ctx, &pdd_cached(pg)->pool[h], NULL); + } + } else { + zink_descriptors_update_lazy_masked(ctx, is_compute, BITFIELD_BIT(h), 0); + } + ctx->dd->changed[is_compute][h] = false; + } + } ctx->dd->pg[is_compute] = pg; + + if (pg->dd->bindless && unlikely(!ctx->dd->bindless_bound)) { + VKCTX(CmdBindDescriptorSets)(batch->state->cmdbuf, bp, + pg->layout, ZINK_DESCRIPTOR_BINDLESS, 1, &ctx->dd->bindless_set, + 0, NULL); + ctx->dd->bindless_bound = true; + } } void @@ -1398,6 +1506,10 @@ zink_batch_descriptor_reset(struct zink_screen *screen, struct zink_batch_state */ pipe_reference(&zds->reference, NULL); zink_descriptor_set_recycle(zds); + if (zds->reference.count == 1) { + struct zink_descriptor_pool *pool = zds->pool; + zink_descriptor_pool_reference(bs->ctx, &pool, NULL); + } _mesa_set_remove(bs->dd->desc_sets, entry); } zink_batch_descriptor_reset_lazy(screen, bs); @@ -1412,43 +1524,24 @@ zink_batch_descriptor_init(struct zink_screen *screen, struct zink_batch_state * return !!bs->dd->desc_sets; } -struct zink_resource * -zink_get_resource_for_descriptor(struct zink_context *ctx, enum zink_descriptor_type type, enum pipe_shader_type shader, int idx) -{ - switch (type) { - case ZINK_DESCRIPTOR_TYPE_UBO: - return zink_resource(ctx->ubos[shader][idx].buffer); - case ZINK_DESCRIPTOR_TYPE_SSBO: - return zink_resource(ctx->ssbos[shader][idx].buffer); - case ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW: - return ctx->sampler_views[shader][idx] ? 
zink_resource(ctx->sampler_views[shader][idx]->texture) : NULL; - case ZINK_DESCRIPTOR_TYPE_IMAGE: - return zink_resource(ctx->image_views[shader][idx].base.resource); - default: - break; - } - unreachable("unknown descriptor type!"); - return NULL; -} - static uint32_t calc_descriptor_state_hash_ubo(struct zink_context *ctx, enum pipe_shader_type shader, int idx, uint32_t hash, bool need_offset) { - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, ZINK_DESCRIPTOR_TYPE_UBO, shader, idx); + struct zink_resource *res = ctx->di.descriptor_res[ZINK_DESCRIPTOR_TYPE_UBO][shader][idx]; struct zink_resource_object *obj = res ? res->obj : NULL; hash = XXH32(&obj, sizeof(void*), hash); - void *hash_data = &ctx->ubos[shader][idx].buffer_size; + void *hash_data = &ctx->di.ubos[shader][idx].range; size_t data_size = sizeof(unsigned); hash = XXH32(hash_data, data_size, hash); if (need_offset) - hash = XXH32(&ctx->ubos[shader][idx].buffer_offset, sizeof(unsigned), hash); + hash = XXH32(&ctx->di.ubos[shader][idx].offset, sizeof(unsigned), hash); return hash; } static uint32_t calc_descriptor_state_hash_ssbo(struct zink_context *ctx, struct zink_shader *zs, enum pipe_shader_type shader, int i, int idx, uint32_t hash) { - struct zink_resource *res = zink_get_resource_for_descriptor(ctx, ZINK_DESCRIPTOR_TYPE_SSBO, shader, idx); + struct zink_resource *res = ctx->di.descriptor_res[ZINK_DESCRIPTOR_TYPE_SSBO][shader][idx]; struct zink_resource_object *obj = res ? res->obj : NULL; hash = XXH32(&obj, sizeof(void*), hash); if (obj) { @@ -1598,7 +1691,8 @@ zink_context_update_descriptor_states(struct zink_context *ctx, struct zink_prog ctx->dd->last_push_usage[pg->is_compute] = pg->dd->push_usage; } for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { - if (pdd_cached(pg)->pool[i] && !ctx->dd->descriptor_states[pg->is_compute].valid[i]) + if (pdd_cached(pg)->pool[i] && pdd_cached(pg)->cache_misses[i] < MAX_CACHE_MISSES && + !ctx->dd->descriptor_states[pg->is_compute].valid[i]) update_descriptor_state(ctx, i, pg->is_compute); } } @@ -1606,6 +1700,7 @@ zink_context_update_descriptor_states(struct zink_context *ctx, struct zink_prog void zink_context_invalidate_descriptor_state(struct zink_context *ctx, enum pipe_shader_type shader, enum zink_descriptor_type type, unsigned start, unsigned count) { + zink_context_invalidate_descriptor_state_lazy(ctx, shader, type, start, count); if (type == ZINK_DESCRIPTOR_TYPE_UBO && !start) { /* ubo 0 is the push set */ ctx->dd->push_state[shader == PIPE_SHADER_COMPUTE] = 0; @@ -1645,9 +1740,12 @@ zink_descriptors_deinit(struct zink_context *ctx) bool zink_descriptor_layouts_init(struct zink_context *ctx) { - for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) + for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { if (!_mesa_hash_table_init(&ctx->desc_set_layouts[i], ctx, hash_descriptor_layout, equals_descriptor_layout)) return false; + if (!_mesa_set_init(&ctx->desc_pool_keys[i], ctx, hash_descriptor_pool_key, equals_descriptor_pool_key)) + return false; + } return true; } @@ -1658,11 +1756,135 @@ zink_descriptor_layouts_deinit(struct zink_context *ctx) for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { hash_table_foreach(&ctx->desc_set_layouts[i], he) { struct zink_descriptor_layout *layout = he->data; - vkDestroyDescriptorSetLayout(screen->dev, layout->layout, NULL); + VKSCR(DestroyDescriptorSetLayout)(screen->dev, layout->layout, NULL); if (layout->desc_template) - screen->vk.DestroyDescriptorUpdateTemplate(screen->dev, layout->desc_template, NULL); + 
VKSCR(DestroyDescriptorUpdateTemplate)(screen->dev, layout->desc_template, NULL); ralloc_free(layout); _mesa_hash_table_remove(&ctx->desc_set_layouts[i], he); } } } + + +void +zink_descriptor_util_init_fbfetch(struct zink_context *ctx) +{ + if (ctx->dd->has_fbfetch) + return; + + struct zink_screen *screen = zink_screen(ctx->base.screen); + VKSCR(DestroyDescriptorSetLayout)(screen->dev, ctx->dd->push_dsl[0]->layout, NULL); + ralloc_free(ctx->dd->push_dsl[0]); + ralloc_free(ctx->dd->push_layout_keys[0]); + ctx->dd->push_dsl[0] = create_gfx_layout(ctx, &ctx->dd->push_layout_keys[0], true); + ctx->dd->has_fbfetch = true; + if (screen->descriptor_mode != ZINK_DESCRIPTOR_MODE_LAZY) + zink_descriptor_pool_init(ctx); +} + +ALWAYS_INLINE static VkDescriptorType +type_from_bindless_index(unsigned idx) +{ + switch (idx) { + case 0: return VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + case 1: return VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; + case 2: return VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + case 3: return VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER; + default: + unreachable("unknown index"); + } +} + +void +zink_descriptors_init_bindless(struct zink_context *ctx) +{ + if (ctx->dd->bindless_set) + return; + + struct zink_screen *screen = zink_screen(ctx->base.screen); + VkDescriptorSetLayoutBinding bindings[4]; + const unsigned num_bindings = 4; + VkDescriptorSetLayoutCreateInfo dcslci = {0}; + dcslci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + dcslci.pNext = NULL; + VkDescriptorSetLayoutBindingFlagsCreateInfo fci = {0}; + VkDescriptorBindingFlags flags[4]; + dcslci.pNext = &fci; + dcslci.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT; + fci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO; + fci.bindingCount = num_bindings; + fci.pBindingFlags = flags; + for (unsigned i = 0; i < num_bindings; i++) { + flags[i] = VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT | VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT | VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT; + } + for (unsigned i = 0; i < num_bindings; i++) { + bindings[i].binding = i; + bindings[i].descriptorType = type_from_bindless_index(i); + bindings[i].descriptorCount = ZINK_MAX_BINDLESS_HANDLES; + bindings[i].stageFlags = VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_COMPUTE_BIT; + bindings[i].pImmutableSamplers = NULL; + } + + dcslci.bindingCount = num_bindings; + dcslci.pBindings = bindings; + if (VKSCR(CreateDescriptorSetLayout)(screen->dev, &dcslci, 0, &ctx->dd->bindless_layout) != VK_SUCCESS) { + debug_printf("vkCreateDescriptorSetLayout failed\n"); + return; + } + + VkDescriptorPoolCreateInfo dpci = {0}; + VkDescriptorPoolSize sizes[4]; + for (unsigned i = 0; i < 4; i++) { + sizes[i].type = type_from_bindless_index(i); + sizes[i].descriptorCount = ZINK_MAX_BINDLESS_HANDLES; + } + dpci.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + dpci.pPoolSizes = sizes; + dpci.poolSizeCount = 4; + dpci.flags = VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT; + dpci.maxSets = 1; + if (VKSCR(CreateDescriptorPool)(screen->dev, &dpci, 0, &ctx->dd->bindless_pool) != VK_SUCCESS) { + debug_printf("vkCreateDescriptorPool failed\n"); + return; + } + + zink_descriptor_util_alloc_sets(screen, ctx->dd->bindless_layout, ctx->dd->bindless_pool, &ctx->dd->bindless_set, 1); +} + +void +zink_descriptors_deinit_bindless(struct zink_context *ctx) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + if (ctx->dd->bindless_layout) + VKSCR(DestroyDescriptorSetLayout)(screen->dev, 
ctx->dd->bindless_layout, NULL); + if (ctx->dd->bindless_pool) + VKSCR(DestroyDescriptorPool)(screen->dev, ctx->dd->bindless_pool, NULL); +} + +void +zink_descriptors_update_bindless(struct zink_context *ctx) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + for (unsigned i = 0; i < 2; i++) { + if (!ctx->di.bindless_dirty[i]) + continue; + while (util_dynarray_contains(&ctx->di.bindless[i].updates, uint32_t)) { + uint32_t handle = util_dynarray_pop(&ctx->di.bindless[i].updates, uint32_t); + bool is_buffer = ZINK_BINDLESS_IS_BUFFER(handle); + VkWriteDescriptorSet wd; + wd.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + wd.pNext = NULL; + wd.dstSet = ctx->dd->bindless_set; + wd.dstBinding = is_buffer ? i * 2 + 1 : i * 2; + wd.dstArrayElement = is_buffer ? handle - ZINK_MAX_BINDLESS_HANDLES : handle; + wd.descriptorCount = 1; + wd.descriptorType = type_from_bindless_index(wd.dstBinding); + if (is_buffer) + wd.pTexelBufferView = &ctx->di.bindless[i].buffer_infos[wd.dstArrayElement]; + else + wd.pImageInfo = &ctx->di.bindless[i].img_infos[handle]; + VKSCR(UpdateDescriptorSets)(screen->dev, 1, &wd, 0, NULL); + } + } + ctx->di.any_bindless_dirty = 0; +} diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.h b/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.h index 09e7254634..91fcf3a6d7 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors.h @@ -46,9 +46,12 @@ enum zink_descriptor_type { ZINK_DESCRIPTOR_TYPE_SSBO, ZINK_DESCRIPTOR_TYPE_IMAGE, ZINK_DESCRIPTOR_TYPES, + ZINK_DESCRIPTOR_BINDLESS, }; -#define ZINK_MAX_DESCRIPTORS_PER_TYPE 32 +#define ZINK_MAX_DESCRIPTORS_PER_TYPE (32 * ZINK_SHADER_COUNT) + +#define ZINK_BINDLESS_IS_BUFFER(HANDLE) (HANDLE >= ZINK_MAX_BINDLESS_HANDLES) struct zink_descriptor_refs { struct util_dynarray refs; @@ -88,9 +91,8 @@ struct zink_descriptor_state_key { }; struct zink_descriptor_layout_key { - unsigned num_descriptors; + unsigned num_bindings; VkDescriptorSetLayoutBinding *bindings; - unsigned use_count; }; struct zink_descriptor_layout { @@ -99,9 +101,9 @@ struct zink_descriptor_layout { }; struct zink_descriptor_pool_key { + unsigned use_count; struct zink_descriptor_layout_key *layout; - unsigned num_type_sizes; - VkDescriptorPoolSize *sizes; + VkDescriptorPoolSize sizes[2]; }; struct zink_descriptor_reference { @@ -109,7 +111,6 @@ struct zink_descriptor_reference { bool *invalid; }; - struct zink_descriptor_data { struct zink_descriptor_state gfx_descriptor_states[ZINK_SHADER_COUNT]; // keep incremental hashes here struct zink_descriptor_state descriptor_states[2]; // gfx, compute @@ -129,15 +130,21 @@ struct zink_descriptor_data { struct zink_descriptor_layout *dummy_dsl; VkDescriptorSet dummy_set; + VkDescriptorSetLayout bindless_layout; + VkDescriptorPool bindless_pool; + VkDescriptorSet bindless_set; + bool bindless_bound; + bool changed[2][ZINK_DESCRIPTOR_TYPES + 1]; + bool has_fbfetch; struct zink_program *pg[2]; //gfx, compute }; struct zink_program_descriptor_data { uint8_t push_usage; - VkDescriptorPoolSize sizes[6]; //zink_descriptor_size_index - struct zink_descriptor_layout_key *layout_key[ZINK_DESCRIPTOR_TYPES]; //push set doesn't need one + bool bindless; uint8_t binding_usage; + struct zink_descriptor_pool_key *pool_key[ZINK_DESCRIPTOR_TYPES]; //push set doesn't need one struct zink_descriptor_layout *layouts[ZINK_DESCRIPTOR_TYPES + 1]; VkDescriptorUpdateTemplateKHR push_template; }; @@ -184,8 +191,6 @@ 
zink_descriptor_type_to_size_idx(enum zink_descriptor_type type) } unreachable("unknown type"); } -unsigned -zink_descriptor_program_num_sizes(struct zink_program *pg, enum zink_descriptor_type type); bool zink_descriptor_layouts_init(struct zink_context *ctx); @@ -202,15 +207,24 @@ struct zink_descriptor_layout * zink_descriptor_util_layout_get(struct zink_context *ctx, enum zink_descriptor_type type, VkDescriptorSetLayoutBinding *bindings, unsigned num_bindings, struct zink_descriptor_layout_key **layout_key); +struct zink_descriptor_pool_key * +zink_descriptor_util_pool_key_get(struct zink_context *ctx, enum zink_descriptor_type type, + struct zink_descriptor_layout_key *layout_key, + VkDescriptorPoolSize *sizes, unsigned num_type_sizes); +void +zink_descriptor_util_init_fbfetch(struct zink_context *ctx); bool zink_descriptor_util_push_layouts_get(struct zink_context *ctx, struct zink_descriptor_layout **dsls, struct zink_descriptor_layout_key **layout_keys); void zink_descriptor_util_init_null_set(struct zink_context *ctx, VkDescriptorSet desc_set); -struct zink_resource * -zink_get_resource_for_descriptor(struct zink_context *ctx, enum zink_descriptor_type type, enum pipe_shader_type shader, int idx); VkImageLayout zink_descriptor_util_image_layout_eval(const struct zink_resource *res, bool is_compute); - +void +zink_descriptors_init_bindless(struct zink_context *ctx); +void +zink_descriptors_deinit_bindless(struct zink_context *ctx); +void +zink_descriptors_update_bindless(struct zink_context *ctx); /* these two can't be called in lazy mode */ void zink_descriptor_set_refs_clear(struct zink_descriptor_refs *refs, void *ptr); @@ -225,7 +239,7 @@ bool zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg); void -zink_descriptor_program_deinit(struct zink_screen *screen, struct zink_program *pg); +zink_descriptor_program_deinit(struct zink_context *ctx, struct zink_program *pg); void zink_descriptors_update(struct zink_context *ctx, bool is_compute); @@ -259,7 +273,7 @@ bool zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program *pg); void -zink_descriptor_program_deinit_lazy(struct zink_screen *screen, struct zink_program *pg); +zink_descriptor_program_deinit_lazy(struct zink_context *ctx, struct zink_program *pg); void zink_descriptors_update_lazy(struct zink_context *ctx, bool is_compute); @@ -283,6 +297,8 @@ zink_descriptors_deinit_lazy(struct zink_context *ctx); void zink_descriptor_set_update_lazy(struct zink_context *ctx, struct zink_program *pg, enum zink_descriptor_type type, VkDescriptorSet set); +void +zink_descriptors_update_lazy_masked(struct zink_context *ctx, bool is_compute, uint8_t changed_sets, uint8_t bind_sets); #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors_lazy.c b/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors_lazy.c index a1f88b6fa4..dd0b29f908 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors_lazy.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_descriptors_lazy.c @@ -34,16 +34,19 @@ #include "zink_resource.h" #include "zink_screen.h" +#define MAX_LAZY_DESCRIPTORS (ZINK_DEFAULT_MAX_DESCS / 10) + struct zink_descriptor_data_lazy { struct zink_descriptor_data base; - VkDescriptorUpdateTemplateEntry push_entries[PIPE_SHADER_TYPES]; + VkDescriptorUpdateTemplateEntry push_entries[PIPE_SHADER_TYPES]; //gfx+fbfetch + VkDescriptorUpdateTemplateEntry compute_push_entry; bool push_state_changed[2]; //gfx, compute uint8_t state_changed[2]; 
//gfx, compute }; struct zink_descriptor_pool { VkDescriptorPool pool; - VkDescriptorSet sets[ZINK_DEFAULT_MAX_DESCS]; + VkDescriptorSet sets[MAX_LAZY_DESCRIPTORS]; unsigned set_idx; unsigned sets_alloc; }; @@ -54,8 +57,11 @@ struct zink_batch_descriptor_data_lazy { struct hash_table pools[ZINK_DESCRIPTOR_TYPES]; struct zink_descriptor_pool *push_pool[2]; struct zink_program *pg[2]; //gfx, compute + uint32_t compat_id[2]; VkDescriptorSetLayout dsl[2][ZINK_DESCRIPTOR_TYPES]; + VkDescriptorSet sets[2][ZINK_DESCRIPTOR_TYPES + 1]; unsigned push_usage[2]; + bool has_fbfetch; }; ALWAYS_INLINE static struct zink_descriptor_data_lazy * @@ -121,6 +127,25 @@ init_template_entry(struct zink_shader *shader, enum zink_descriptor_type type, (*entry_idx)++; } +static uint16_t +descriptor_program_num_sizes(VkDescriptorPoolSize *sizes, enum zink_descriptor_type type) +{ + switch (type) { + case ZINK_DESCRIPTOR_TYPE_UBO: + return 1; + case ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW: + return !!sizes[ZDS_INDEX_COMBINED_SAMPLER].descriptorCount + + !!sizes[ZDS_INDEX_UNIFORM_TEXELS].descriptorCount; + case ZINK_DESCRIPTOR_TYPE_SSBO: + return 1; + case ZINK_DESCRIPTOR_TYPE_IMAGE: + return !!sizes[ZDS_INDEX_STORAGE_IMAGE].descriptorCount + + !!sizes[ZDS_INDEX_STORAGE_TEXELS].descriptorCount; + default: break; + } + unreachable("unknown type"); +} + bool zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program *pg) { @@ -129,19 +154,26 @@ zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program VkDescriptorUpdateTemplateEntry entries[ZINK_DESCRIPTOR_TYPES][PIPE_SHADER_TYPES * 32]; unsigned num_bindings[ZINK_DESCRIPTOR_TYPES] = {0}; uint8_t has_bindings = 0; + unsigned push_count = 0; + uint16_t num_type_sizes[ZINK_DESCRIPTOR_TYPES]; + VkDescriptorPoolSize sizes[6] = {0}; //zink_descriptor_size_index struct zink_shader **stages; if (pg->is_compute) stages = &((struct zink_compute_program*)pg)->shader; - else + else { stages = ((struct zink_gfx_program*)pg)->shaders; + if (stages[PIPE_SHADER_FRAGMENT]->nir->info.fs.uses_fbfetch_output) { + zink_descriptor_util_init_fbfetch(ctx); + push_count = 1; + } + } if (!pg->dd) pg->dd = (void*)rzalloc(pg, struct zink_program_descriptor_data); if (!pg->dd) return false; - unsigned push_count = 0; unsigned entry_idx[ZINK_DESCRIPTOR_TYPES] = {0}; unsigned num_shaders = pg->is_compute ? 
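/* compute programs have exactly one shader stage; gfx uses the full stage array */ 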
1 : ZINK_SHADER_COUNT; @@ -172,8 +204,8 @@ zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program binding->pImmutableSamplers = NULL; enum zink_descriptor_size_index idx = zink_vktype_to_size_idx(shader->bindings[j][k].type); - pg->dd->sizes[idx].descriptorCount += shader->bindings[j][k].size; - pg->dd->sizes[idx].type = shader->bindings[j][k].type; + sizes[idx].descriptorCount += shader->bindings[j][k].size; + sizes[idx].type = shader->bindings[j][k].type; switch (shader->bindings[j][k].type) { case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: @@ -192,19 +224,25 @@ zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program num_bindings[j]++; has_bindings |= BITFIELD_BIT(j); } + num_type_sizes[j] = descriptor_program_num_sizes(sizes, j); } + pg->dd->bindless |= shader->bindless; } + if (pg->dd->bindless) + zink_descriptors_init_bindless(ctx); pg->dd->binding_usage = has_bindings; if (!has_bindings && !push_count) { ralloc_free(pg->dd); pg->dd = NULL; - pg->layout = zink_pipeline_layout_create(screen, pg); + pg->layout = zink_pipeline_layout_create(screen, pg, &pg->compat_id); return !!pg->layout; } pg->dsl[pg->num_dsl++] = push_count ? ctx->dd->push_dsl[pg->is_compute]->layout : ctx->dd->dummy_dsl->layout; if (has_bindings) { + for (unsigned i = 0; i < ARRAY_SIZE(sizes); i++) + sizes[i].descriptorCount *= screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY ? MAX_LAZY_DESCRIPTORS : ZINK_DEFAULT_MAX_DESCS; u_foreach_bit(type, has_bindings) { for (unsigned i = 0; i < type; i++) { /* push set is always 0 */ @@ -214,16 +252,33 @@ zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program pg->dd->binding_usage |= BITFIELD_BIT(i); } } - pg->dd->layouts[pg->num_dsl] = zink_descriptor_util_layout_get(ctx, type, bindings[type], num_bindings[type], &pg->dd->layout_key[type]); - pg->dd->layout_key[type]->use_count++; + struct zink_descriptor_layout_key *key; + pg->dd->layouts[pg->num_dsl] = zink_descriptor_util_layout_get(ctx, type, bindings[type], num_bindings[type], &key); + enum zink_descriptor_size_index idx = zink_descriptor_type_to_size_idx(type); + VkDescriptorPoolSize *sz = &sizes[idx]; + if (!sz->descriptorCount) + sz++; + pg->dd->pool_key[type] = zink_descriptor_util_pool_key_get(ctx, type, key, sz, num_type_sizes[type]); + pg->dd->pool_key[type]->use_count++; pg->dsl[pg->num_dsl] = pg->dd->layouts[pg->num_dsl]->layout; pg->num_dsl++; } - for (unsigned i = 0; i < ARRAY_SIZE(pg->dd->sizes); i++) - pg->dd->sizes[i].descriptorCount *= ZINK_DEFAULT_MAX_DESCS; + } + /* TODO: make this dynamic? */ + if (pg->dd->bindless) { + pg->num_dsl = ZINK_DESCRIPTOR_BINDLESS + 1; + pg->dsl[ZINK_DESCRIPTOR_BINDLESS] = ctx->dd->bindless_layout; + for (unsigned i = 0; i < ZINK_DESCRIPTOR_BINDLESS; i++) { + if (!pg->dsl[i]) { + /* inject a null dsl */ + pg->dsl[i] = ctx->dd->dummy_dsl->layout; + if (i != ZINK_DESCRIPTOR_TYPES) + pg->dd->binding_usage |= BITFIELD_BIT(i); + } + } } - pg->layout = zink_pipeline_layout_create(screen, pg); + pg->layout = zink_pipeline_layout_create(screen, pg, &pg->compat_id); if (!pg->layout) return false; if (!screen->info.have_KHR_descriptor_update_template || screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_NOTEMPLATES) @@ -238,18 +293,19 @@ zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program /* number of descriptors in template */ unsigned wd_count[ZINK_DESCRIPTOR_TYPES + 1]; if (push_count) - wd_count[0] = pg->is_compute ? 
1 : ZINK_SHADER_COUNT; + wd_count[0] = pg->is_compute ? 1 : (ZINK_SHADER_COUNT + !!ctx->dd->has_fbfetch); for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) - wd_count[i + 1] = pg->dd->layout_key[i] ? pg->dd->layout_key[i]->num_descriptors : 0; + wd_count[i + 1] = pg->dd->pool_key[i] ? pg->dd->pool_key[i]->layout->num_bindings : 0; VkDescriptorUpdateTemplateEntry *push_entries[2] = { dd_lazy(ctx)->push_entries, - &dd_lazy(ctx)->push_entries[PIPE_SHADER_COMPUTE], + &dd_lazy(ctx)->compute_push_entry, }; for (unsigned i = 0; i < pg->num_dsl; i++) { bool is_push = i == 0; /* no need for empty templates */ if (pg->dsl[i] == ctx->dd->dummy_dsl->layout || + pg->dsl[i] == ctx->dd->bindless_layout || (!is_push && pg->dd->layouts[i]->desc_template)) continue; template[i].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO; @@ -265,7 +321,7 @@ zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program template[i].pipelineLayout = pg->layout; template[i].set = i; VkDescriptorUpdateTemplateKHR t; - if (screen->vk.CreateDescriptorUpdateTemplate(screen->dev, &template[i], NULL, &t) != VK_SUCCESS) + if (VKSCR(CreateDescriptorUpdateTemplate)(screen->dev, &template[i], NULL, &t) != VK_SUCCESS) return false; if (is_push) pg->dd->push_template = t; @@ -276,19 +332,20 @@ zink_descriptor_program_init_lazy(struct zink_context *ctx, struct zink_program } void -zink_descriptor_program_deinit_lazy(struct zink_screen *screen, struct zink_program *pg) +zink_descriptor_program_deinit_lazy(struct zink_context *ctx, struct zink_program *pg) { + struct zink_screen *screen = zink_screen(ctx->base.screen); for (unsigned i = 0; pg->num_dsl && i < ZINK_DESCRIPTOR_TYPES; i++) { - if (pg->dd->layout_key[i]) - pg->dd->layout_key[i]->use_count--; + if (pg->dd->pool_key[i]) + pg->dd->pool_key[i]->use_count--; } if (pg->dd && pg->dd->push_template) - screen->vk.DestroyDescriptorUpdateTemplate(screen->dev, pg->dd->push_template, NULL); + VKSCR(DestroyDescriptorUpdateTemplate)(screen->dev, pg->dd->push_template, NULL); ralloc_free(pg->dd); } static VkDescriptorPool -create_pool(struct zink_screen *screen, unsigned num_type_sizes, VkDescriptorPoolSize *sizes, unsigned flags) +create_pool(struct zink_screen *screen, unsigned num_type_sizes, const VkDescriptorPoolSize *sizes, unsigned flags) { VkDescriptorPool pool; VkDescriptorPoolCreateInfo dpci = {0}; @@ -296,8 +353,8 @@ create_pool(struct zink_screen *screen, unsigned num_type_sizes, VkDescriptorPoo dpci.pPoolSizes = sizes; dpci.poolSizeCount = num_type_sizes; dpci.flags = flags; - dpci.maxSets = ZINK_DEFAULT_MAX_DESCS; - if (vkCreateDescriptorPool(screen->dev, &dpci, 0, &pool) != VK_SUCCESS) { + dpci.maxSets = MAX_LAZY_DESCRIPTORS; + if (VKSCR(CreateDescriptorPool)(screen->dev, &dpci, 0, &pool) != VK_SUCCESS) { debug_printf("vkCreateDescriptorPool failed\n"); return VK_NULL_HANDLE; } @@ -314,12 +371,11 @@ check_pool_alloc(struct zink_context *ctx, struct zink_descriptor_pool *pool, st struct zink_screen *screen = zink_screen(ctx->base.screen); /* allocate up to $current * 10, e.g., 10 -> 100 or 100 -> 1000 */ if (pool->set_idx == pool->sets_alloc) { - unsigned sets_to_alloc = MIN2(MAX2(pool->sets_alloc * 10, 10), ZINK_DEFAULT_MAX_DESCS) - pool->sets_alloc; + unsigned sets_to_alloc = MIN2(MIN2(MAX2(pool->sets_alloc * 10, 10), MAX_LAZY_DESCRIPTORS) - pool->sets_alloc, 100); if (!sets_to_alloc) { /* overflowed pool: queue for deletion on next reset */ util_dynarray_append(&bdd->overflowed_pools, struct zink_descriptor_pool*, pool); 
_mesa_hash_table_remove(&bdd->pools[type], he); - ctx->oom_flush = true; return get_descriptor_pool_lazy(ctx, pg, type, bdd, is_compute); } if (!zink_descriptor_util_alloc_sets(screen, pg->dsl[type + 1], @@ -331,16 +387,19 @@ check_pool_alloc(struct zink_context *ctx, struct zink_descriptor_pool *pool, st } static struct zink_descriptor_pool * -create_push_pool(struct zink_screen *screen, struct zink_batch_descriptor_data_lazy *bdd, bool is_compute) +create_push_pool(struct zink_screen *screen, struct zink_batch_descriptor_data_lazy *bdd, bool is_compute, bool has_fbfetch) { struct zink_descriptor_pool *pool = rzalloc(bdd, struct zink_descriptor_pool); - VkDescriptorPoolSize sizes; - sizes.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + VkDescriptorPoolSize sizes[2]; + sizes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; if (is_compute) - sizes.descriptorCount = ZINK_DEFAULT_MAX_DESCS; - else - sizes.descriptorCount = ZINK_SHADER_COUNT * ZINK_DEFAULT_MAX_DESCS; - pool->pool = create_pool(screen, 1, &sizes, 0); + sizes[0].descriptorCount = MAX_LAZY_DESCRIPTORS; + else { + sizes[0].descriptorCount = ZINK_SHADER_COUNT * MAX_LAZY_DESCRIPTORS; + sizes[1].type = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; + sizes[1].descriptorCount = MAX_LAZY_DESCRIPTORS; + } + pool->pool = create_pool(screen, !is_compute && has_fbfetch ? 2 : 1, sizes, 0); return pool; } @@ -349,13 +408,12 @@ check_push_pool_alloc(struct zink_context *ctx, struct zink_descriptor_pool *poo { struct zink_screen *screen = zink_screen(ctx->base.screen); /* allocate up to $current * 10, e.g., 10 -> 100 or 100 -> 1000 */ - if (pool->set_idx == pool->sets_alloc) { - unsigned sets_to_alloc = MIN2(MAX2(pool->sets_alloc * 10, 10), ZINK_DEFAULT_MAX_DESCS) - pool->sets_alloc; - if (!sets_to_alloc) { + if (pool->set_idx == pool->sets_alloc || unlikely(ctx->dd->has_fbfetch != bdd->has_fbfetch)) { + unsigned sets_to_alloc = MIN2(MIN2(MAX2(pool->sets_alloc * 10, 10), MAX_LAZY_DESCRIPTORS) - pool->sets_alloc, 100); + if (!sets_to_alloc || unlikely(ctx->dd->has_fbfetch != bdd->has_fbfetch)) { /* overflowed pool: queue for deletion on next reset */ util_dynarray_append(&bdd->overflowed_pools, struct zink_descriptor_pool*, pool); - bdd->push_pool[is_compute] = create_push_pool(screen, bdd, is_compute); - ctx->oom_flush = true; + bdd->push_pool[is_compute] = create_push_pool(screen, bdd, is_compute, ctx->dd->has_fbfetch); return check_push_pool_alloc(ctx, bdd->push_pool[is_compute], bdd, is_compute); } if (!zink_descriptor_util_alloc_sets(screen, ctx->dd->push_dsl[is_compute]->layout, @@ -370,7 +428,8 @@ static struct zink_descriptor_pool * get_descriptor_pool_lazy(struct zink_context *ctx, struct zink_program *pg, enum zink_descriptor_type type, struct zink_batch_descriptor_data_lazy *bdd, bool is_compute) { struct zink_screen *screen = zink_screen(ctx->base.screen); - struct hash_entry *he = _mesa_hash_table_search(&bdd->pools[type], pg->dd->layout_key[type]); + const struct zink_descriptor_pool_key *pool_key = pg->dd->pool_key[type]; + struct hash_entry *he = _mesa_hash_table_search(&bdd->pools[type], pool_key); struct zink_descriptor_pool *pool; if (he) { pool = he->data; @@ -379,17 +438,13 @@ get_descriptor_pool_lazy(struct zink_context *ctx, struct zink_program *pg, enum pool = rzalloc(bdd, struct zink_descriptor_pool); if (!pool) return NULL; - unsigned idx = zink_descriptor_type_to_size_idx(type); - VkDescriptorPoolSize *size = &pg->dd->sizes[idx]; - /* this is a sampler/image set with no images only texels */ - if (!size->descriptorCount) - size++; - 
pool->pool = create_pool(screen, zink_descriptor_program_num_sizes(pg, type), size, 0); + const unsigned num_type_sizes = pool_key->sizes[1].descriptorCount ? 2 : 1; + pool->pool = create_pool(screen, num_type_sizes, pool_key->sizes, 0); if (!pool->pool) { ralloc_free(pool); return NULL; } - _mesa_hash_table_insert(&bdd->pools[type], pg->dd->layout_key[type], pool); + _mesa_hash_table_insert(&bdd->pools[type], pool_key, pool); return check_pool_alloc(ctx, pool, he, pg, type, bdd, is_compute); } @@ -405,22 +460,15 @@ get_descriptor_set_lazy(struct zink_descriptor_pool *pool) static bool populate_sets(struct zink_context *ctx, struct zink_batch_descriptor_data_lazy *bdd, - struct zink_program *pg, uint8_t *changed_sets, bool need_push, VkDescriptorSet *sets) + struct zink_program *pg, uint8_t *changed_sets, VkDescriptorSet *sets) { - if (need_push && !zink_screen(ctx->base.screen)->info.have_KHR_push_descriptor) { - struct zink_descriptor_pool *pool = check_push_pool_alloc(ctx, bdd->push_pool[pg->is_compute], bdd, pg->is_compute); - sets[0] = get_descriptor_set_lazy(pool); - if (!sets[0]) - return false; - } else - sets[0] = VK_NULL_HANDLE; u_foreach_bit(type, *changed_sets) { - if (pg->dd->layout_key[type]) { + if (pg->dd->pool_key[type]) { struct zink_descriptor_pool *pool = get_descriptor_pool_lazy(ctx, pg, type, bdd, pg->is_compute); - sets[type + 1] = get_descriptor_set_lazy(pool); + sets[type] = get_descriptor_set_lazy(pool); } else - sets[type + 1] = ctx->dd->dummy_set; - if (!sets[type + 1]) + sets[type] = ctx->dd->dummy_set; + if (!sets[type]) return false; } return true; @@ -430,17 +478,59 @@ void zink_descriptor_set_update_lazy(struct zink_context *ctx, struct zink_program *pg, enum zink_descriptor_type type, VkDescriptorSet set) { struct zink_screen *screen = zink_screen(ctx->base.screen); - screen->vk.UpdateDescriptorSetWithTemplate(screen->dev, set, pg->dd->layouts[type + 1]->desc_template, ctx); + VKCTX(UpdateDescriptorSetWithTemplate)(screen->dev, set, pg->dd->layouts[type + 1]->desc_template, ctx); +} + +void +zink_descriptors_update_lazy_masked(struct zink_context *ctx, bool is_compute, uint8_t changed_sets, uint8_t bind_sets) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + struct zink_batch_state *bs = ctx->batch.state; + struct zink_batch_descriptor_data_lazy *bdd = bdd_lazy(bs); + struct zink_program *pg = is_compute ? &ctx->curr_compute->base : &ctx->curr_program->base; + VkDescriptorSet desc_sets[ZINK_DESCRIPTOR_TYPES]; + if (!pg->dd->binding_usage || (!changed_sets && !bind_sets)) + return; + + if (!populate_sets(ctx, bdd, pg, &changed_sets, desc_sets)) { + debug_printf("ZINK: couldn't get descriptor sets!\n"); + return; + } + /* no flushing allowed */ + assert(ctx->batch.state == bs); + + u_foreach_bit(type, changed_sets) { + assert(type + 1 < pg->num_dsl); + if (pg->dd->pool_key[type]) { + VKSCR(UpdateDescriptorSetWithTemplate)(screen->dev, desc_sets[type], pg->dd->layouts[type + 1]->desc_template, ctx); + VKSCR(CmdBindDescriptorSets)(bs->cmdbuf, + is_compute ? VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS, + /* set index incremented by 1 to account for push set */ + pg->layout, type + 1, 1, &desc_sets[type], + 0, NULL); + bdd->sets[is_compute][type + 1] = desc_sets[type]; + } + } + u_foreach_bit(type, bind_sets & ~changed_sets) { + if (!pg->dd->pool_key[type]) + bdd->sets[is_compute][type + 1] = ctx->dd->dummy_set; + assert(bdd->sets[is_compute][type + 1]); + VKSCR(CmdBindDescriptorSets)(bs->cmdbuf, + is_compute ? 
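/* bind point must match the program type */ 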
VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS, + /* set index incremented by 1 to account for push set */ + pg->layout, type + 1, 1, &bdd->sets[is_compute][type + 1], + 0, NULL); + } } void zink_descriptors_update_lazy(struct zink_context *ctx, bool is_compute) { - struct zink_screen *screen = zink_screen(ctx->base.screen); - struct zink_batch *batch = &ctx->batch; struct zink_batch_state *bs = ctx->batch.state; struct zink_batch_descriptor_data_lazy *bdd = bdd_lazy(bs); struct zink_program *pg = is_compute ? &ctx->curr_compute->base : &ctx->curr_program->base; + struct zink_screen *screen = zink_screen(ctx->base.screen); + bool have_KHR_push_descriptor = screen->info.have_KHR_push_descriptor; bool batch_changed = !bdd->pg[is_compute]; if (batch_changed) { @@ -464,57 +554,61 @@ zink_descriptors_update_lazy(struct zink_context *ctx, bool is_compute) dd_lazy(ctx)->push_state_changed[is_compute] |= bdd->push_usage[is_compute] != pg->dd->push_usage; bdd->push_usage[is_compute] = pg->dd->push_usage; } - bdd->pg[is_compute] = pg; - VkDescriptorSet desc_sets[5]; uint8_t changed_sets = pg->dd->binding_usage & dd_lazy(ctx)->state_changed[is_compute]; bool need_push = pg->dd->push_usage && (dd_lazy(ctx)->push_state_changed[is_compute] || batch_changed); - if (!populate_sets(ctx, bdd, pg, &changed_sets, need_push, desc_sets)) { - debug_printf("ZINK: couldn't get descriptor sets!\n"); - return; - } - /* no flushing allowed */ - assert(ctx->batch.state == bs); - bs = ctx->batch.state; - - if (pg->dd->binding_usage && changed_sets) { - u_foreach_bit(type, changed_sets) { - if (pg->dd->layout_key[type]) - screen->vk.UpdateDescriptorSetWithTemplate(screen->dev, desc_sets[type + 1], pg->dd->layouts[type + 1]->desc_template, ctx); - assert(type + 1 < pg->num_dsl); - vkCmdBindDescriptorSets(bs->cmdbuf, - is_compute ? VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS, - /* set index incremented by 1 to account for push set */ - pg->layout, type + 1, 1, &desc_sets[type + 1], - 0, NULL); + VkDescriptorSet push_set = VK_NULL_HANDLE; + if (need_push && !have_KHR_push_descriptor) { + struct zink_descriptor_pool *pool = check_push_pool_alloc(ctx, bdd->push_pool[pg->is_compute], bdd, pg->is_compute); + push_set = get_descriptor_set_lazy(pool); + if (!push_set) { + mesa_loge("ZINK: failed to get push descriptor set!"); + /* just jam something in to avoid a hang */ + push_set = ctx->dd->dummy_set; } - dd_lazy(ctx)->state_changed[is_compute] = false; } - - if (pg->dd->push_usage && dd_lazy(ctx)->push_state_changed[is_compute]) { - if (screen->info.have_KHR_push_descriptor) - screen->vk.CmdPushDescriptorSetWithTemplateKHR(batch->state->cmdbuf, pg->dd->push_template, - pg->layout, 0, ctx); - else { - assert(desc_sets[0]); - screen->vk.UpdateDescriptorSetWithTemplate(screen->dev, desc_sets[0], pg->dd->push_template, ctx); - vkCmdBindDescriptorSets(batch->state->cmdbuf, + /* + * when binding a pipeline, the pipeline can correctly access any previously bound + * descriptor sets which were bound with compatible pipeline layouts + * VK 14.2.2 + */ + uint8_t bind_sets = bdd->pg[is_compute] && bdd->compat_id[is_compute] == pg->compat_id ? 
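/* compatible layout: previously bound sets remain valid, nothing to rebind */ 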
0 : pg->dd->binding_usage; + if (pg->dd->push_usage && (dd_lazy(ctx)->push_state_changed[is_compute] || bind_sets)) { + if (have_KHR_push_descriptor) { + if (dd_lazy(ctx)->push_state_changed[is_compute]) + VKCTX(CmdPushDescriptorSetWithTemplateKHR)(bs->cmdbuf, pg->dd->push_template, + pg->layout, 0, ctx); + } else { + if (dd_lazy(ctx)->push_state_changed[is_compute]) { + VKCTX(UpdateDescriptorSetWithTemplate)(screen->dev, push_set, pg->dd->push_template, ctx); + bdd->sets[is_compute][0] = push_set; + } + assert(push_set || bdd->sets[is_compute][0]); + VKCTX(CmdBindDescriptorSets)(bs->cmdbuf, is_compute ? VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS, - pg->layout, 0, 1, &desc_sets[0], + pg->layout, 0, 1, push_set ? &push_set : &bdd->sets[is_compute][0], 0, NULL); } dd_lazy(ctx)->push_state_changed[is_compute] = false; - } else if (dd_lazy(ctx)->push_state_changed[is_compute]) { - vkCmdBindDescriptorSets(bs->cmdbuf, + } else if (dd_lazy(ctx)->push_state_changed[is_compute] || bind_sets) { + VKCTX(CmdBindDescriptorSets)(bs->cmdbuf, is_compute ? VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS, pg->layout, 0, 1, &ctx->dd->dummy_set, 0, NULL); dd_lazy(ctx)->push_state_changed[is_compute] = false; } - /* set again in case of flushing */ + zink_descriptors_update_lazy_masked(ctx, is_compute, changed_sets, bind_sets); + if (pg->dd->bindless && unlikely(!ctx->dd->bindless_bound)) { + VKCTX(CmdBindDescriptorSets)(ctx->batch.state->cmdbuf, is_compute ? VK_PIPELINE_BIND_POINT_COMPUTE : VK_PIPELINE_BIND_POINT_GRAPHICS, + pg->layout, ZINK_DESCRIPTOR_BINDLESS, 1, &ctx->dd->bindless_set, + 0, NULL); + ctx->dd->bindless_bound = true; + } bdd->pg[is_compute] = pg; ctx->dd->pg[is_compute] = pg; + bdd->compat_id[is_compute] = pg->compat_id; + dd_lazy(ctx)->state_changed[is_compute] = false; } void @@ -536,13 +630,13 @@ zink_batch_descriptor_deinit_lazy(struct zink_screen *screen, struct zink_batch_ for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { hash_table_foreach(&bdd->pools[i], entry) { struct zink_descriptor_pool *pool = (void*)entry->data; - vkDestroyDescriptorPool(screen->dev, pool->pool, NULL); + VKSCR(DestroyDescriptorPool)(screen->dev, pool->pool, NULL); } } if (bdd->push_pool[0]) - vkDestroyDescriptorPool(screen->dev, bdd->push_pool[0]->pool, NULL); + VKSCR(DestroyDescriptorPool)(screen->dev, bdd->push_pool[0]->pool, NULL); if (bdd->push_pool[1]) - vkDestroyDescriptorPool(screen->dev, bdd->push_pool[1]->pool, NULL); + VKSCR(DestroyDescriptorPool)(screen->dev, bdd->push_pool[1]->pool, NULL); } ralloc_free(bs->dd); } @@ -550,7 +644,7 @@ zink_batch_descriptor_deinit_lazy(struct zink_screen *screen, struct zink_batch_ static void pool_destroy(struct zink_screen *screen, struct zink_descriptor_pool *pool) { - vkDestroyDescriptorPool(screen->dev, pool->pool, NULL); + VKSCR(DestroyDescriptorPool)(screen->dev, pool->pool, NULL); ralloc_free(pool); } @@ -562,7 +656,7 @@ zink_batch_descriptor_reset_lazy(struct zink_screen *screen, struct zink_batch_s struct zink_batch_descriptor_data_lazy *bdd = bdd_lazy(bs); for (unsigned i = 0; i < ZINK_DESCRIPTOR_TYPES; i++) { hash_table_foreach(&bdd->pools[i], entry) { - const struct zink_descriptor_layout_key *key = entry->key; + const struct zink_descriptor_pool_key *key = entry->key; struct zink_descriptor_pool *pool = (void*)entry->data; if (key->use_count) pool->set_idx = 0; @@ -598,12 +692,22 @@ zink_batch_descriptor_init_lazy(struct zink_screen *screen, struct zink_batch_st } util_dynarray_init(&bdd->overflowed_pools, 
bs->dd); if (!screen->info.have_KHR_push_descriptor) { - bdd->push_pool[0] = create_push_pool(screen, bdd, false); - bdd->push_pool[1] = create_push_pool(screen, bdd, true); + bdd->push_pool[0] = create_push_pool(screen, bdd, false, false); + bdd->push_pool[1] = create_push_pool(screen, bdd, true, false); } return true; } +static void +init_push_template_entry(VkDescriptorUpdateTemplateEntry *entry, unsigned i) +{ + entry->dstBinding = tgsi_processor_to_shader_stage(i); + entry->descriptorCount = 1; + entry->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + entry->offset = offsetof(struct zink_context, di.ubos[i][0]); + entry->stride = sizeof(VkDescriptorBufferInfo); +} + bool zink_descriptors_init_lazy(struct zink_context *ctx) { @@ -615,14 +719,17 @@ zink_descriptors_init_lazy(struct zink_context *ctx) if (screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_NOTEMPLATES) printf("ZINK: CACHED/NOTEMPLATES DESCRIPTORS\n"); else if (screen->info.have_KHR_descriptor_update_template) { - for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) { + for (unsigned i = 0; i < ZINK_SHADER_COUNT; i++) { VkDescriptorUpdateTemplateEntry *entry = &dd_lazy(ctx)->push_entries[i]; - entry->dstBinding = tgsi_processor_to_shader_stage(i); - entry->descriptorCount = 1; - entry->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - entry->offset = offsetof(struct zink_context, di.ubos[i][0]); - entry->stride = sizeof(VkDescriptorBufferInfo); + init_push_template_entry(entry, i); } + init_push_template_entry(&dd_lazy(ctx)->compute_push_entry, PIPE_SHADER_COMPUTE); + VkDescriptorUpdateTemplateEntry *entry = &dd_lazy(ctx)->push_entries[ZINK_SHADER_COUNT]; //fbfetch + entry->dstBinding = ZINK_FBFETCH_BINDING; + entry->descriptorCount = 1; + entry->descriptorType = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; + entry->offset = offsetof(struct zink_context, di.fbfetch); + entry->stride = sizeof(VkDescriptorImageInfo); if (screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY) printf("ZINK: USING LAZY DESCRIPTORS\n"); } @@ -638,6 +745,7 @@ zink_descriptors_init_lazy(struct zink_context *ctx) zink_descriptor_util_alloc_sets(screen, ctx->dd->dummy_dsl->layout, ctx->dd->dummy_pool, &ctx->dd->dummy_set, 1); zink_descriptor_util_init_null_set(ctx, ctx->dd->dummy_set); + return true; } @@ -647,12 +755,11 @@ zink_descriptors_deinit_lazy(struct zink_context *ctx) if (ctx->dd) { struct zink_screen *screen = zink_screen(ctx->base.screen); if (ctx->dd->dummy_pool) - vkDestroyDescriptorPool(screen->dev, ctx->dd->dummy_pool, NULL); - if (screen->descriptor_mode == ZINK_DESCRIPTOR_MODE_LAZY && - screen->info.have_KHR_push_descriptor) { - vkDestroyDescriptorSetLayout(screen->dev, ctx->dd->push_dsl[0]->layout, NULL); - vkDestroyDescriptorSetLayout(screen->dev, ctx->dd->push_dsl[1]->layout, NULL); - } + VKSCR(DestroyDescriptorPool)(screen->dev, ctx->dd->dummy_pool, NULL); + if (ctx->dd->push_dsl[0]) + VKSCR(DestroyDescriptorSetLayout)(screen->dev, ctx->dd->push_dsl[0]->layout, NULL); + if (ctx->dd->push_dsl[1]) + VKSCR(DestroyDescriptorSetLayout)(screen->dev, ctx->dd->push_dsl[1]->layout, NULL); } ralloc_free(ctx->dd); } diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_device_info.py b/mesa 3D driver/src/gallium/drivers/zink/zink_device_info.py index 7863002bd1..c371563f7b 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_device_info.py +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_device_info.py @@ -65,6 +65,8 @@ EXTENSIONS = [ Extension("VK_KHR_maintenance3"), Extension("VK_KHR_external_memory"), 
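# note (illustrative; naming inferred from the generated code later in this
# file): an entry with features=True is expected to expand into a
# have_{VENDOR}_{name} flag plus an {alias}_feats struct chained into the
# VkPhysicalDeviceFeatures2 pNext list, e.g.
#   Extension("VK_KHR_timeline_semaphore", alias="timeline", features=True)
# should yield info->have_KHR_timeline_semaphore and info->timeline_feats.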
Extension("VK_KHR_external_memory_fd"), + Extension("VK_EXT_external_memory_dma_buf"), + Extension("VK_EXT_queue_family_foreign"), Extension("VK_EXT_provoking_vertex", alias="pv", features=True, @@ -73,6 +75,14 @@ EXTENSIONS = [ Extension("VK_EXT_shader_viewport_index_layer"), Extension("VK_KHR_get_memory_requirements2"), Extension("VK_EXT_post_depth_coverage"), + Extension("VK_KHR_8bit_storage", + alias="storage_8bit", + features=True, + conditions=["$feats.storageBuffer8BitAccess"]), + Extension("VK_KHR_16bit_storage", + alias="storage_16bit", + features=True, + conditions=["$feats.storageBuffer16BitAccess"]), Extension("VK_KHR_driver_properties", alias="driver", properties=True), @@ -104,6 +114,10 @@ EXTENSIONS = [ alias="index_uint8", features=True, conditions=["$feats.indexTypeUint8"]), + Extension("VK_KHR_imageless_framebuffer", + alias="imgless", + features=True, + conditions=["$feats.imagelessFramebuffer"]), Extension("VK_EXT_robustness2", alias="rb2", properties=True, @@ -137,6 +151,10 @@ EXTENSIONS = [ alias="dynamic_state", features=True, conditions=["$feats.extendedDynamicState"]), + Extension("VK_EXT_extended_dynamic_state2", + alias="dynamic_state2", + features=True, + conditions=["$feats.extendedDynamicState2"]), Extension("VK_EXT_pipeline_creation_cache_control", alias="pipeline_cache_control", features=True, @@ -149,7 +167,7 @@ EXTENSIONS = [ properties=True, features=True, guard=True), - Extension("VK_KHR_timeline_semaphore"), + Extension("VK_KHR_timeline_semaphore", alias="timeline", features=True), Extension("VK_EXT_4444_formats", alias="format_4444", features=True), @@ -179,8 +197,17 @@ EXTENSIONS = [ alias="vertex_input", features=True, conditions=["$feats.vertexInputDynamicState"]), + Extension("VK_EXT_primitive_topology_list_restart", + alias="list_restart", + features=True, + conditions=["$feats.primitiveTopologyListRestart"]), Extension("VK_KHR_dedicated_allocation", alias="dedicated"), + Extension("VK_EXT_descriptor_indexing", + alias="desc_indexing", + features=True, + properties=True, + conditions=["$feats.descriptorBindingPartiallyBound"]), ] # constructor: Versions(device_version(major, minor, patch), struct_version(major, minor)) @@ -190,9 +217,7 @@ EXTENSIONS = [ # # - struct_version: Vulkan version, as tuple, to use with structures and macros VERSIONS = [ - # VkPhysicalDeviceVulkan11Properties and VkPhysicalDeviceVulkan11Features is new from Vk 1.2, not Vk 1.1 - # https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#_new_structures - Version((1,2,0), (1,1)), + Version((1,1,0), (1,1)), Version((1,2,0), (1,2)), ] @@ -201,6 +226,7 @@ VERSIONS = [ REPLACEMENTS = { "ROBUSTNESS2": "ROBUSTNESS_2", "PROPERTIES_PROPERTIES": "PROPERTIES", + "EXTENDED_DYNAMIC_STATE2": "EXTENDED_DYNAMIC_STATE_2", } @@ -326,30 +352,10 @@ zink_get_physical_device_info(struct zink_screen *screen) %for ext in extensions: <%helpers:guard ext="${ext}"> if (!strcmp(extensions[i].extensionName, "${ext.name}")) { - %if ext.core_since: - %for version in versions: - %if ext.core_since.struct_version == version.struct_version: - if (${version.version()} >= screen->vk_version) { - %if not (ext.has_features or ext.has_properties): - info->have_${ext.name_with_vendor()} = true; - %else: - support_${ext.name_with_vendor()} = true; - %endif - } else { - %if not (ext.has_features or ext.has_properties): - info->have_${ext.name_with_vendor()} = true; - %else: - support_${ext.name_with_vendor()} = true; - %endif - } - %endif - %endfor - %else: %if not (ext.has_features or 
ext.has_properties): info->have_${ext.name_with_vendor()} = true; %else: support_${ext.name_with_vendor()} = true; - %endif %endif } @@ -366,7 +372,12 @@ zink_get_physical_device_info(struct zink_screen *screen) info->feats.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; %for version in versions: +%if version.device_version < (1,2,0): + if (VK_MAKE_VERSION(1,2,0) <= screen->vk_version) { + /* VkPhysicalDeviceVulkan11Features was added in 1.2, not 1.1 as one would think */ +%else: if (${version.version()} <= screen->vk_version) { +%endif info->feats${version.struct()}.sType = ${version.stype("FEATURES")}; info->feats${version.struct()}.pNext = info->feats.pNext; info->feats.pNext = &info->feats${version.struct()}; @@ -397,7 +408,12 @@ zink_get_physical_device_info(struct zink_screen *screen) props.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; %for version in versions: +%if version.device_version < (1,2,0): + if (VK_MAKE_VERSION(1,2,0) <= screen->vk_version) { + /* VkPhysicalDeviceVulkan11Properties was added in 1.2, not 1.1 as one would think */ +%else: if (${version.version()} <= screen->vk_version) { +%endif info->props${version.struct()}.sType = ${version.stype("PROPERTIES")}; info->props${version.struct()}.pNext = props.pNext; props.pNext = &info->props${version.struct()}; diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_draw.cpp b/mesa 3D driver/src/gallium/drivers/zink/zink_draw.cpp index 15ea1a4527..c1bbe5d49e 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_draw.cpp +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_draw.cpp @@ -34,10 +34,10 @@ zink_emit_xfb_counter_barrier(struct zink_context *ctx) continue; struct zink_resource *res = zink_resource(t->counter_buffer); if (t->counter_buffer_valid) - zink_resource_buffer_barrier(ctx, NULL, res, VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT, + zink_resource_buffer_barrier(ctx, res, VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT); else - zink_resource_buffer_barrier(ctx, NULL, res, VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, + zink_resource_buffer_barrier(ctx, res, VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT); } ctx->xfb_barrier = false; @@ -57,7 +57,7 @@ zink_emit_xfb_vertex_input_barrier(struct zink_context *ctx, struct zink_resourc * * - 20.3.1. 
Drawing Transform Feedback */ - zink_resource_buffer_barrier(ctx, NULL, res, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + zink_resource_buffer_barrier(ctx, res, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT); } @@ -65,7 +65,6 @@ static void zink_emit_stream_output_targets(struct pipe_context *pctx) { struct zink_context *ctx = zink_context(pctx); - struct zink_screen *screen = zink_screen(pctx->screen); struct zink_batch *batch = &ctx->batch; VkBuffer buffers[PIPE_MAX_SO_OUTPUTS] = {0}; VkDeviceSize buffer_offsets[PIPE_MAX_SO_OUTPUTS] = {0}; @@ -81,21 +80,19 @@ zink_emit_stream_output_targets(struct pipe_context *pctx) continue; } struct zink_resource *res = zink_resource(t->base.buffer); - if (!(res->bind_history & ZINK_RESOURCE_USAGE_STREAMOUT)) + if (!res->so_valid) /* resource has been rebound */ t->counter_buffer_valid = false; buffers[i] = res->obj->buffer; - zink_resource_buffer_barrier(ctx, NULL, zink_resource(t->base.buffer), - VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT); zink_batch_reference_resource_rw(batch, res, true); buffer_offsets[i] = t->base.buffer_offset; buffer_sizes[i] = t->base.buffer_size; - res->bind_history |= ZINK_RESOURCE_USAGE_STREAMOUT; + res->so_valid = true; util_range_add(t->base.buffer, &res->valid_buffer_range, t->base.buffer_offset, t->base.buffer_offset + t->base.buffer_size); } - screen->vk.CmdBindTransformFeedbackBuffersEXT(batch->state->cmdbuf, 0, ctx->num_so_targets, + VKCTX(CmdBindTransformFeedbackBuffersEXT)(batch->state->cmdbuf, 0, ctx->num_so_targets, buffers, buffer_offsets, buffer_sizes); ctx->dirty_so_targets = false; @@ -105,7 +102,7 @@ ALWAYS_INLINE static void check_buffer_barrier(struct zink_context *ctx, struct pipe_resource *pres, VkAccessFlags flags, VkPipelineStageFlags pipeline) { struct zink_resource *res = zink_resource(pres); - zink_resource_buffer_barrier(ctx, NULL, res, flags, pipeline); + zink_resource_buffer_barrier(ctx, res, flags, pipeline); } ALWAYS_INLINE static void @@ -123,7 +120,7 @@ barrier_draw_buffers(struct zink_context *ctx, const struct pipe_draw_info *dinf } } -template <bool HAS_DYNAMIC_STATE> +template <bool HAS_DYNAMIC_STATE, bool HAS_VERTEX_INPUT> static void zink_bind_vertex_buffers(struct zink_batch *batch, struct zink_context *ctx) { @@ -143,87 +140,81 @@ zink_bind_vertex_buffers(struct zink_batch *batch, struct zink_context *ctx) if (vb->buffer.resource) { buffers[i] = ctx->vbufs[buffer_id]; assert(buffers[i]); - if (screen->info.have_EXT_vertex_input_dynamic_state) + if (HAS_VERTEX_INPUT) elems->hw_state.dynbindings[i].stride = vb->stride; buffer_offsets[i] = ctx->vbuf_offsets[buffer_id]; buffer_strides[i] = vb->stride; + zink_batch_resource_usage_set(&ctx->batch, zink_resource(vb->buffer.resource), false); } else { buffers[i] = zink_resource(ctx->dummy_vertex_buffer)->obj->buffer; buffer_offsets[i] = 0; buffer_strides[i] = 0; - if (screen->info.have_EXT_vertex_input_dynamic_state) + if (HAS_VERTEX_INPUT) elems->hw_state.dynbindings[i].stride = 0; } } - if (HAS_DYNAMIC_STATE) - screen->vk.CmdBindVertexBuffers2EXT(batch->state->cmdbuf, 0, + if (HAS_DYNAMIC_STATE && !HAS_VERTEX_INPUT) + VKCTX(CmdBindVertexBuffers2EXT)(batch->state->cmdbuf, 0, elems->hw_state.num_bindings, buffers, buffer_offsets, NULL, buffer_strides); else - vkCmdBindVertexBuffers(batch->state->cmdbuf, 0, + VKSCR(CmdBindVertexBuffers)(batch->state->cmdbuf, 0, elems->hw_state.num_bindings, buffers, buffer_offsets); - if (screen->info.have_EXT_vertex_input_dynamic_state) - screen->vk.CmdSetVertexInputEXT(batch->state->cmdbuf, + if
(HAS_VERTEX_INPUT) + VKCTX(CmdSetVertexInputEXT)(batch->state->cmdbuf, elems->hw_state.num_bindings, elems->hw_state.dynbindings, elems->hw_state.num_attribs, elems->hw_state.dynattribs); - ctx->vertex_state_changed = false; ctx->vertex_buffers_dirty = false; } -static void -update_compute_program(struct zink_context *ctx) -{ - const unsigned bits = 1 << PIPE_SHADER_COMPUTE; - if (ctx->dirty_shader_stages & bits) { - struct zink_compute_program *comp = zink_create_compute_program(ctx, ctx->compute_stage); - _mesa_hash_table_insert(ctx->compute_program_cache, comp->shader, comp); - ctx->compute_pipeline_state.dirty = true; - ctx->curr_compute = comp; - ctx->dirty_shader_stages &= bits; - zink_batch_reference_program(&ctx->batch, &ctx->curr_compute->base); - } -} - static void update_gfx_program(struct zink_context *ctx) { if (ctx->last_vertex_stage_dirty) { - if (ctx->gfx_stages[PIPE_SHADER_GEOMETRY]) - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_GEOMETRY); - else if (ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]) - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_TESS_EVAL); - else - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_VERTEX); + enum pipe_shader_type pstage = pipe_shader_type_from_mesa(ctx->last_vertex_stage->nir->info.stage); + ctx->dirty_shader_stages |= BITFIELD_BIT(pstage); + memcpy(&ctx->gfx_pipeline_state.shader_keys.key[pstage].key.vs_base, + &ctx->gfx_pipeline_state.shader_keys.last_vertex.key.vs_base, + sizeof(struct zink_vs_key_base)); ctx->last_vertex_stage_dirty = false; } - unsigned bits = u_bit_consecutive(PIPE_SHADER_VERTEX, 5); - if (ctx->dirty_shader_stages & bits) { + unsigned bits = BITFIELD_MASK(PIPE_SHADER_COMPUTE); + if (ctx->gfx_dirty) { struct zink_gfx_program *prog = NULL; - struct hash_entry *entry = _mesa_hash_table_search(ctx->program_cache, - ctx->gfx_stages); - if (entry) - zink_update_gfx_program(ctx, (struct zink_gfx_program*)entry->data); - else { - prog = zink_create_gfx_program(ctx, ctx->gfx_stages); - entry = _mesa_hash_table_insert(ctx->program_cache, prog->shaders, prog); + + struct hash_table *ht = &ctx->program_cache[ctx->shader_stages >> 2]; + const uint32_t hash = ctx->gfx_hash; + struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); + if (entry) { + prog = (struct zink_gfx_program*)entry->data; + u_foreach_bit(stage, prog->stages_present & ~ctx->dirty_shader_stages) + ctx->gfx_pipeline_state.modules[stage] = prog->modules[stage]->shader; + } else { + ctx->dirty_shader_stages |= bits; + prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.vertices_per_patch + 1); + _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); } - prog = (struct zink_gfx_program*)(entry ? 
entry->data : NULL); - if (prog && prog != ctx->curr_program) { - ctx->gfx_pipeline_state.combined_dirty = true; + zink_update_gfx_program(ctx, prog); + if (prog && prog != ctx->curr_program) zink_batch_reference_program(&ctx->batch, &prog->base); - } + if (ctx->curr_program) + ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; ctx->curr_program = prog; - ctx->dirty_shader_stages &= ~bits; + ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + ctx->gfx_dirty = false; + } else if (ctx->dirty_shader_stages & bits) { + zink_update_gfx_program(ctx, ctx->curr_program); } + ctx->dirty_shader_stages &= ~bits; } static bool line_width_needed(enum pipe_prim_type reduced_prim, - VkPolygonMode polygon_mode) + unsigned polygon_mode) { switch (reduced_prim) { case PIPE_PRIM_POINTS: @@ -243,7 +234,7 @@ line_width_needed(enum pipe_prim_type reduced_prim, ALWAYS_INLINE static void update_drawid(struct zink_context *ctx, unsigned draw_id) { - vkCmdPushConstants(ctx->batch.state->cmdbuf, ctx->curr_program->base.layout, VK_SHADER_STAGE_VERTEX_BIT, + VKCTX(CmdPushConstants)(ctx->batch.state->cmdbuf, ctx->curr_program->base.layout, VK_SHADER_STAGE_VERTEX_BIT, offsetof(struct zink_gfx_push_constant, draw_id), sizeof(unsigned), &draw_id); } @@ -260,7 +251,7 @@ draw_indexed_need_index_buffer_unref(struct zink_context *ctx, if (dinfo->increment_draw_id && needs_drawid) { for (unsigned i = 0; i < num_draws; i++) { update_drawid(ctx, draw_id); - vkCmdDrawIndexed(cmdbuf, + VKCTX(CmdDrawIndexed)(cmdbuf, draws[i].count, dinfo->instance_count, 0, draws[i].index_bias, dinfo->start_instance); draw_id++; @@ -269,7 +260,7 @@ draw_indexed_need_index_buffer_unref(struct zink_context *ctx, if (needs_drawid) update_drawid(ctx, draw_id); for (unsigned i = 0; i < num_draws; i++) - vkCmdDrawIndexed(cmdbuf, + VKCTX(CmdDrawIndexed)(cmdbuf, draws[i].count, dinfo->instance_count, 0, draws[i].index_bias, dinfo->start_instance); @@ -289,7 +280,7 @@ draw_indexed(struct zink_context *ctx, if (dinfo->increment_draw_id && needs_drawid) { for (unsigned i = 0; i < num_draws; i++) { update_drawid(ctx, draw_id); - vkCmdDrawIndexed(cmdbuf, + VKCTX(CmdDrawIndexed)(cmdbuf, draws[i].count, dinfo->instance_count, draws[i].start, draws[i].index_bias, dinfo->start_instance); draw_id++; @@ -298,13 +289,13 @@ draw_indexed(struct zink_context *ctx, if (needs_drawid) update_drawid(ctx, draw_id); if (HAS_MULTIDRAW) { - zink_screen(ctx->base.screen)->vk.CmdDrawMultiIndexedEXT(cmdbuf, num_draws, (const VkMultiDrawIndexedInfoEXT*)draws, - dinfo->instance_count, - dinfo->start_instance, sizeof(struct pipe_draw_start_count_bias), - dinfo->index_bias_varies ? NULL : &draws[0].index_bias); + VKCTX(CmdDrawMultiIndexedEXT)(cmdbuf, num_draws, (const VkMultiDrawIndexedInfoEXT*)draws, + dinfo->instance_count, + dinfo->start_instance, sizeof(struct pipe_draw_start_count_bias), + dinfo->index_bias_varies ? 
NULL : &draws[0].index_bias); } else { for (unsigned i = 0; i < num_draws; i++) - vkCmdDrawIndexed(cmdbuf, + VKCTX(CmdDrawIndexed)(cmdbuf, draws[i].count, dinfo->instance_count, draws[i].start, draws[i].index_bias, dinfo->start_instance); } @@ -324,24 +315,35 @@ draw(struct zink_context *ctx, if (dinfo->increment_draw_id && needs_drawid) { for (unsigned i = 0; i < num_draws; i++) { update_drawid(ctx, draw_id); - vkCmdDraw(cmdbuf, draws[i].count, dinfo->instance_count, draws[i].start, dinfo->start_instance); + VKCTX(CmdDraw)(cmdbuf, draws[i].count, dinfo->instance_count, draws[i].start, dinfo->start_instance); draw_id++; } } else { if (needs_drawid) update_drawid(ctx, draw_id); if (HAS_MULTIDRAW) - zink_screen(ctx->base.screen)->vk.CmdDrawMultiEXT(cmdbuf, num_draws, (const VkMultiDrawInfoEXT*)draws, - dinfo->instance_count, dinfo->start_instance, - sizeof(struct pipe_draw_start_count_bias)); + VKCTX(CmdDrawMultiEXT)(cmdbuf, num_draws, (const VkMultiDrawInfoEXT*)draws, + dinfo->instance_count, dinfo->start_instance, + sizeof(struct pipe_draw_start_count_bias)); else { for (unsigned i = 0; i < num_draws; i++) - vkCmdDraw(cmdbuf, draws[i].count, dinfo->instance_count, draws[i].start, dinfo->start_instance); + VKCTX(CmdDraw)(cmdbuf, draws[i].count, dinfo->instance_count, draws[i].start, dinfo->start_instance); } } } +ALWAYS_INLINE static VkPipelineStageFlags +find_pipeline_bits(uint32_t *mask) +{ + for (unsigned i = 0; i < ZINK_SHADER_COUNT; i++) { + if (mask[i]) { + return zink_pipeline_flags_from_pipe_stage((enum pipe_shader_type)i); + } + } + return 0; +} + static void update_barriers(struct zink_context *ctx, bool is_compute) { @@ -369,24 +371,29 @@ update_barriers(struct zink_context *ctx, bool is_compute) pipeline |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; bind_count -= util_bitcount(res->vbo_bind_mask); } + bind_count -= res->so_bind_count; } if (bind_count) access |= VK_ACCESS_SHADER_READ_BIT; } if (is_compute) pipeline = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - else { - u_foreach_bit(stage, res->bind_history) { - if ((1 << stage) != ZINK_RESOURCE_USAGE_STREAMOUT) - pipeline |= zink_pipeline_flags_from_pipe_stage((enum pipe_shader_type)stage); - } + else if (!pipeline) { + if (res->ubo_bind_count[0]) + pipeline |= find_pipeline_bits(res->ubo_bind_mask); + if (!pipeline) + pipeline |= find_pipeline_bits(res->ssbo_bind_mask); + if (!pipeline) + pipeline |= find_pipeline_bits(res->sampler_binds); + if (!pipeline) //must be a shader image + pipeline = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; } if (res->base.b.target == PIPE_BUFFER) - zink_resource_buffer_barrier(ctx, NULL, res, access, pipeline); + zink_resource_buffer_barrier(ctx, res, access, pipeline); else { VkImageLayout layout = zink_descriptor_util_image_layout_eval(res, is_compute); if (layout != res->layout) - zink_resource_image_barrier(ctx, NULL, res, layout, access, pipeline); + zink_resource_image_barrier(ctx, res, layout, access, pipeline); } /* always barrier on draw if this resource has either multiple image write binds or * image write binds and image read binds @@ -400,7 +407,49 @@ } } -template <bool HAS_MULTIDRAW, bool HAS_DYNAMIC_STATE, bool BATCH_CHANGED> +template <bool BATCH_CHANGED> +static bool +update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum pipe_prim_type mode) +{ + VkPipeline prev_pipeline = ctx->gfx_pipeline_state.pipeline; + update_gfx_program(ctx); + VkPipeline pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode); + bool pipeline_changed = prev_pipeline != pipeline;
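/* note: BATCH_CHANGED forces the bind below because a new batch state means a
 * fresh cmdbuf with no pipeline bound, even when the pipeline handle itself is
 * unchanged; otherwise only an actual handle change triggers a rebind */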
+ if (BATCH_CHANGED || pipeline_changed) + VKCTX(CmdBindPipeline)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + return pipeline_changed; +} + +static bool +hack_conditional_render(struct pipe_context *pctx, + const struct pipe_draw_info *dinfo, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *dindirect, + const struct pipe_draw_start_count_bias *draws, + unsigned num_draws) +{ + struct zink_context *ctx = zink_context(pctx); + struct zink_batch_state *bs = ctx->batch.state; + static bool warned; + if (!warned) { + fprintf(stderr, "ZINK: warning, this is cpu-based conditional rendering, say bye-bye to fps\n"); + warned = true; + } + if (!zink_check_conditional_render(ctx)) + return false; + if (bs != ctx->batch.state) { + bool prev = ctx->render_condition_active; + ctx->render_condition_active = false; + zink_select_draw_vbo(ctx); + pctx->draw_vbo(pctx, dinfo, drawid_offset, dindirect, draws, num_draws); + ctx->render_condition_active = prev; + return false; + } + return true; +} + +template <bool HAS_MULTIDRAW, bool HAS_DYNAMIC_STATE, bool HAS_DYNAMIC_STATE2, bool HAS_VERTEX_INPUT, bool BATCH_CHANGED> void zink_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *dinfo, @@ -420,36 +469,25 @@ zink_draw_vbo(struct pipe_context *pctx, VkBuffer counter_buffers[PIPE_MAX_SO_OUTPUTS]; VkDeviceSize counter_buffer_offsets[PIPE_MAX_SO_OUTPUTS]; bool need_index_buffer_unref = false; - bool mode_changed = ctx->gfx_pipeline_state.mode != dinfo->mode; + bool mode_changed = ctx->gfx_pipeline_state.gfx_prim_mode != dinfo->mode; bool reads_drawid = ctx->shader_reads_drawid; bool reads_basevertex = ctx->shader_reads_basevertex; unsigned work_count = ctx->batch.work_count; - enum pipe_prim_type mode = dinfo->mode; + enum pipe_prim_type mode = (enum pipe_prim_type)dinfo->mode; + if (unlikely(!screen->info.have_EXT_conditional_rendering)) { + if (!hack_conditional_render(pctx, dinfo, drawid_offset, dindirect, draws, num_draws)) + return; + } + + if (ctx->memory_barrier) + zink_flush_memory_barrier(ctx, false); update_barriers(ctx, false); - if (ctx->gfx_pipeline_state.vertices_per_patch != dinfo->vertices_per_patch) - ctx->gfx_pipeline_state.dirty = true; - bool drawid_broken = ctx->drawid_broken; - ctx->drawid_broken = false; - if (reads_drawid && (!dindirect || !dindirect->buffer)) - ctx->drawid_broken = (drawid_offset != 0 || - (!HAS_MULTIDRAW && num_draws > 1) || - (HAS_MULTIDRAW && num_draws > 1 && !dinfo->increment_draw_id)); - if (drawid_broken != ctx->drawid_broken) - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_VERTEX); - ctx->gfx_pipeline_state.vertices_per_patch = dinfo->vertices_per_patch; - if (ctx->rast_state->base.point_quad_rasterization && - ctx->gfx_prim_mode != mode) { - if (ctx->gfx_prim_mode == PIPE_PRIM_POINTS || mode == PIPE_PRIM_POINTS) - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_FRAGMENT); + if (unlikely(ctx->buffer_rebind_counter < screen->buffer_rebind_counter)) { + ctx->buffer_rebind_counter = screen->buffer_rebind_counter; + zink_rebind_all_buffers(ctx); } - ctx->gfx_prim_mode = mode; - update_gfx_program(ctx); - - if (ctx->gfx_pipeline_state.primitive_restart != dinfo->primitive_restart) - ctx->gfx_pipeline_state.dirty = true; - ctx->gfx_pipeline_state.primitive_restart = dinfo->primitive_restart; unsigned index_offset = 0; unsigned index_size = dinfo->index_size; @@ -467,24 +505,24 @@ zink_draw_vbo(struct pipe_context *pctx, } assert(index_size <= 4 && index_size != 3); assert(index_size != 1 || screen->info.have_EXT_index_type_uint8); - const VkIndexType index_type[3] = { - VK_INDEX_TYPE_UINT8_EXT, - VK_INDEX_TYPE_UINT16, -
VK_INDEX_TYPE_UINT32, - }; - struct zink_resource *res = zink_resource(index_buffer); - vkCmdBindIndexBuffer(batch->state->cmdbuf, res->obj->buffer, index_offset, index_type[index_size >> 1]); } - if (zink_program_has_descriptors(&ctx->curr_program->base)) - screen->descriptors_update(ctx, false); - bool have_streamout = !!ctx->num_so_targets; if (have_streamout) { if (ctx->xfb_barrier) zink_emit_xfb_counter_barrier(ctx); - if (ctx->dirty_so_targets) - zink_emit_stream_output_targets(pctx); + if (ctx->dirty_so_targets) { + /* have to loop here and below because barriers must be emitted out of renderpass, + * but xfb buffers can't be bound before the renderpass is active to avoid + * breaking from recursion + */ + for (unsigned i = 0; i < ctx->num_so_targets; i++) { + struct zink_so_target *t = (struct zink_so_target *)ctx->so_targets[i]; + if (t) + zink_resource_buffer_barrier(ctx, zink_resource(t->base.buffer), + VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT); + } + } } if (so_target) @@ -495,15 +533,55 @@ zink_draw_vbo(struct pipe_context *pctx, if (BATCH_CHANGED) zink_update_descriptor_refs(ctx, false); - batch = zink_batch_rp(ctx); + zink_batch_rp(ctx); - VkPipeline prev_pipeline = ctx->gfx_pipeline_state.pipeline; - VkPipeline pipeline = zink_get_gfx_pipeline(ctx, ctx->curr_program, - &ctx->gfx_pipeline_state, - mode); - bool pipeline_changed = prev_pipeline != pipeline; - if (BATCH_CHANGED || pipeline_changed) - vkCmdBindPipeline(batch->state->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); + /* these must be after renderpass start to avoid issues with recursion */ + uint8_t vertices_per_patch = ctx->gfx_pipeline_state.patch_vertices ? ctx->gfx_pipeline_state.patch_vertices - 1 : 0; + if (ctx->gfx_pipeline_state.vertices_per_patch != vertices_per_patch) + ctx->gfx_pipeline_state.dirty = true; + bool drawid_broken = false; + if (reads_drawid && (!dindirect || !dindirect->buffer)) + drawid_broken = (drawid_offset != 0 || + (!HAS_MULTIDRAW && num_draws > 1) || + (HAS_MULTIDRAW && num_draws > 1 && !dinfo->increment_draw_id)); + if (drawid_broken != zink_get_last_vertex_key(ctx)->push_drawid) + zink_set_last_vertex_key(ctx)->push_drawid = drawid_broken; + ctx->gfx_pipeline_state.vertices_per_patch = vertices_per_patch; + if (mode_changed) { + bool points_changed = false; + if (mode == PIPE_PRIM_POINTS) { + ctx->gfx_pipeline_state.has_points++; + points_changed = true; + } else if (ctx->gfx_pipeline_state.gfx_prim_mode == PIPE_PRIM_POINTS) { + ctx->gfx_pipeline_state.has_points--; + points_changed = true; + } + if (points_changed && ctx->rast_state->base.point_quad_rasterization) + zink_set_fs_point_coord_key(ctx); + } + ctx->gfx_pipeline_state.gfx_prim_mode = mode; + + if (index_size) { + const VkIndexType index_type[3] = { + VK_INDEX_TYPE_UINT8_EXT, + VK_INDEX_TYPE_UINT16, + VK_INDEX_TYPE_UINT32, + }; + struct zink_resource *res = zink_resource(index_buffer); + VKCTX(CmdBindIndexBuffer)(batch->state->cmdbuf, res->obj->buffer, index_offset, index_type[index_size >> 1]); + } + if (!HAS_DYNAMIC_STATE2) { + if (ctx->gfx_pipeline_state.primitive_restart != dinfo->primitive_restart) + ctx->gfx_pipeline_state.dirty = true; + ctx->gfx_pipeline_state.primitive_restart = dinfo->primitive_restart; + } + + if (have_streamout && ctx->dirty_so_targets) + zink_emit_stream_output_targets(pctx); + + bool pipeline_changed = false; + if (!HAS_DYNAMIC_STATE) + pipeline_changed = update_gfx_pipeline(ctx, batch->state, mode); if (BATCH_CHANGED || 
ctx->vp_state_changed || (!HAS_DYNAMIC_STATE && pipeline_changed)) { VkViewport viewports[PIPE_MAX_VIEWPORTS]; @@ -521,9 +599,9 @@ zink_draw_vbo(struct pipe_context *pctx, viewports[i] = viewport; } if (HAS_DYNAMIC_STATE) - screen->vk.CmdSetViewportWithCountEXT(batch->state->cmdbuf, ctx->vp_state.num_viewports, viewports); + VKCTX(CmdSetViewportWithCountEXT)(batch->state->cmdbuf, ctx->vp_state.num_viewports, viewports); else - vkCmdSetViewport(batch->state->cmdbuf, 0, ctx->vp_state.num_viewports, viewports); + VKCTX(CmdSetViewport)(batch->state->cmdbuf, 0, ctx->vp_state.num_viewports, viewports); } if (BATCH_CHANGED || ctx->scissor_changed || ctx->vp_state_changed || (!HAS_DYNAMIC_STATE && pipeline_changed)) { VkRect2D scissors[PIPE_MAX_VIEWPORTS]; @@ -543,39 +621,39 @@ zink_draw_vbo(struct pipe_context *pctx, } } if (HAS_DYNAMIC_STATE) - screen->vk.CmdSetScissorWithCountEXT(batch->state->cmdbuf, ctx->vp_state.num_viewports, scissors); + VKCTX(CmdSetScissorWithCountEXT)(batch->state->cmdbuf, ctx->vp_state.num_viewports, scissors); else - vkCmdSetScissor(batch->state->cmdbuf, 0, ctx->vp_state.num_viewports, scissors); + VKCTX(CmdSetScissor)(batch->state->cmdbuf, 0, ctx->vp_state.num_viewports, scissors); } ctx->vp_state_changed = false; ctx->scissor_changed = false; if (BATCH_CHANGED || ctx->stencil_ref_changed) { - vkCmdSetStencilReference(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, + VKCTX(CmdSetStencilReference)(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, ctx->stencil_ref.ref_value[0]); - vkCmdSetStencilReference(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, + VKCTX(CmdSetStencilReference)(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, ctx->stencil_ref.ref_value[1]); ctx->stencil_ref_changed = false; } if (HAS_DYNAMIC_STATE && (BATCH_CHANGED || ctx->dsa_state_changed)) { - screen->vk.CmdSetDepthBoundsTestEnableEXT(batch->state->cmdbuf, dsa_state->hw_state.depth_bounds_test); + VKCTX(CmdSetDepthBoundsTestEnableEXT)(batch->state->cmdbuf, dsa_state->hw_state.depth_bounds_test); if (dsa_state->hw_state.depth_bounds_test) - vkCmdSetDepthBounds(batch->state->cmdbuf, + VKCTX(CmdSetDepthBounds)(batch->state->cmdbuf, dsa_state->hw_state.min_depth_bounds, dsa_state->hw_state.max_depth_bounds); - screen->vk.CmdSetDepthTestEnableEXT(batch->state->cmdbuf, dsa_state->hw_state.depth_test); + VKCTX(CmdSetDepthTestEnableEXT)(batch->state->cmdbuf, dsa_state->hw_state.depth_test); if (dsa_state->hw_state.depth_test) - screen->vk.CmdSetDepthCompareOpEXT(batch->state->cmdbuf, dsa_state->hw_state.depth_compare_op); - screen->vk.CmdSetDepthWriteEnableEXT(batch->state->cmdbuf, dsa_state->hw_state.depth_write); - screen->vk.CmdSetStencilTestEnableEXT(batch->state->cmdbuf, dsa_state->hw_state.stencil_test); + VKCTX(CmdSetDepthCompareOpEXT)(batch->state->cmdbuf, dsa_state->hw_state.depth_compare_op); + VKCTX(CmdSetDepthWriteEnableEXT)(batch->state->cmdbuf, dsa_state->hw_state.depth_write); + VKCTX(CmdSetStencilTestEnableEXT)(batch->state->cmdbuf, dsa_state->hw_state.stencil_test); if (dsa_state->hw_state.stencil_test) { - screen->vk.CmdSetStencilOpEXT(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, + VKCTX(CmdSetStencilOpEXT)(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, dsa_state->hw_state.stencil_front.failOp, dsa_state->hw_state.stencil_front.passOp, dsa_state->hw_state.stencil_front.depthFailOp, dsa_state->hw_state.stencil_front.compareOp); - screen->vk.CmdSetStencilOpEXT(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, + VKCTX(CmdSetStencilOpEXT)(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, 
dsa_state->hw_state.stencil_back.failOp, dsa_state->hw_state.stencil_back.passOp, dsa_state->hw_state.stencil_back.depthFailOp, @@ -583,13 +661,13 @@ zink_draw_vbo(struct pipe_context *pctx, } if (dsa_state->base.stencil[0].enabled) { if (dsa_state->base.stencil[1].enabled) { - vkCmdSetStencilWriteMask(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, dsa_state->hw_state.stencil_front.writeMask); - vkCmdSetStencilWriteMask(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, dsa_state->hw_state.stencil_back.writeMask); - vkCmdSetStencilCompareMask(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, dsa_state->hw_state.stencil_front.compareMask); - vkCmdSetStencilCompareMask(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, dsa_state->hw_state.stencil_back.compareMask); + VKCTX(CmdSetStencilWriteMask)(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, dsa_state->hw_state.stencil_front.writeMask); + VKCTX(CmdSetStencilWriteMask)(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, dsa_state->hw_state.stencil_back.writeMask); + VKCTX(CmdSetStencilCompareMask)(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_BIT, dsa_state->hw_state.stencil_front.compareMask); + VKCTX(CmdSetStencilCompareMask)(batch->state->cmdbuf, VK_STENCIL_FACE_BACK_BIT, dsa_state->hw_state.stencil_back.compareMask); } else { - vkCmdSetStencilWriteMask(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_AND_BACK, dsa_state->hw_state.stencil_front.writeMask); - vkCmdSetStencilCompareMask(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_AND_BACK, dsa_state->hw_state.stencil_front.compareMask); + VKCTX(CmdSetStencilWriteMask)(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_AND_BACK, dsa_state->hw_state.stencil_front.writeMask); + VKCTX(CmdSetStencilCompareMask)(batch->state->cmdbuf, VK_STENCIL_FACE_FRONT_AND_BACK, dsa_state->hw_state.stencil_front.compareMask); } } } @@ -597,13 +675,15 @@ zink_draw_vbo(struct pipe_context *pctx, bool rast_state_changed = ctx->rast_state_changed; if (HAS_DYNAMIC_STATE && (BATCH_CHANGED || rast_state_changed)) - screen->vk.CmdSetFrontFaceEXT(batch->state->cmdbuf, ctx->gfx_pipeline_state.front_face); + VKCTX(CmdSetFrontFaceEXT)(batch->state->cmdbuf, ctx->gfx_pipeline_state.dyn_state1.front_face); if ((BATCH_CHANGED || rast_state_changed) && screen->info.have_EXT_line_rasterization && rast_state->base.line_stipple_enable) - screen->vk.CmdSetLineStippleEXT(batch->state->cmdbuf, rast_state->base.line_stipple_factor, rast_state->base.line_stipple_pattern); + VKCTX(CmdSetLineStippleEXT)(batch->state->cmdbuf, rast_state->base.line_stipple_factor, rast_state->base.line_stipple_pattern); if (BATCH_CHANGED || ctx->rast_state_changed || mode_changed) { - enum pipe_prim_type reduced_prim = u_reduced_prim(mode); + enum pipe_prim_type reduced_prim = ctx->last_vertex_stage->reduced_prim; + if (reduced_prim == PIPE_PRIM_MAX) + reduced_prim = u_reduced_prim(mode); bool depth_bias = false; switch (reduced_prim) { @@ -625,14 +705,14 @@ zink_draw_vbo(struct pipe_context *pctx, if (line_width_needed(reduced_prim, rast_state->hw_state.polygon_mode)) { if (screen->info.feats.features.wideLines || rast_state->line_width == 1.0f) - vkCmdSetLineWidth(batch->state->cmdbuf, rast_state->line_width); + VKCTX(CmdSetLineWidth)(batch->state->cmdbuf, rast_state->line_width); else debug_printf("BUG: wide lines not supported, needs fallback!"); } if (depth_bias) - vkCmdSetDepthBias(batch->state->cmdbuf, rast_state->offset_units, rast_state->offset_clamp, rast_state->offset_scale); + VKCTX(CmdSetDepthBias)(batch->state->cmdbuf, rast_state->offset_units, 
rast_state->offset_clamp, rast_state->offset_scale); else - vkCmdSetDepthBias(batch->state->cmdbuf, 0.0f, 0.0f, 0.0f); + VKCTX(CmdSetDepthBias)(batch->state->cmdbuf, 0.0f, 0.0f, 0.0f); } ctx->rast_state_changed = false; @@ -640,33 +720,55 @@ if (ctx->sample_locations_changed) { VkSampleLocationsInfoEXT loc; zink_init_vk_sample_locations(ctx, &loc); - screen->vk.CmdSetSampleLocationsEXT(batch->state->cmdbuf, &loc); + VKCTX(CmdSetSampleLocationsEXT)(batch->state->cmdbuf, &loc); } ctx->sample_locations_changed = false; } if ((BATCH_CHANGED || ctx->blend_state_changed) && ctx->gfx_pipeline_state.blend_state->need_blend_constants) { - vkCmdSetBlendConstants(batch->state->cmdbuf, ctx->blend_constants); + VKCTX(CmdSetBlendConstants)(batch->state->cmdbuf, ctx->blend_constants); } ctx->blend_state_changed = false; if (BATCH_CHANGED || ctx->vertex_buffers_dirty) - zink_bind_vertex_buffers<HAS_DYNAMIC_STATE>(batch, ctx); + zink_bind_vertex_buffers<HAS_DYNAMIC_STATE, HAS_VERTEX_INPUT>(batch, ctx); + + zink_query_update_gs_states(ctx); + + if (BATCH_CHANGED) { + ctx->pipeline_changed[0] = false; + zink_select_draw_vbo(ctx); + } + + if (HAS_DYNAMIC_STATE) { + update_gfx_pipeline<BATCH_CHANGED>(ctx, batch->state, mode); + if (BATCH_CHANGED || mode_changed) + VKCTX(CmdSetPrimitiveTopologyEXT)(batch->state->cmdbuf, zink_primitive_topology(mode)); + } + + if (HAS_DYNAMIC_STATE2 && (BATCH_CHANGED || ctx->primitive_restart != dinfo->primitive_restart)) { + VKCTX(CmdSetPrimitiveRestartEnableEXT)(batch->state->cmdbuf, dinfo->primitive_restart); + ctx->primitive_restart = dinfo->primitive_restart; + } + + if (zink_program_has_descriptors(&ctx->curr_program->base)) + screen->descriptors_update(ctx, false); + + if (ctx->di.any_bindless_dirty && ctx->curr_program->base.dd->bindless) + zink_descriptors_update_bindless(ctx); if (reads_basevertex) { unsigned draw_mode_is_indexed = index_size > 0; - vkCmdPushConstants(batch->state->cmdbuf, ctx->curr_program->base.layout, VK_SHADER_STAGE_VERTEX_BIT, + VKCTX(CmdPushConstants)(batch->state->cmdbuf, ctx->curr_program->base.layout, VK_SHADER_STAGE_VERTEX_BIT, offsetof(struct zink_gfx_push_constant, draw_mode_is_indexed), sizeof(unsigned), &draw_mode_is_indexed); } if (ctx->curr_program->shaders[PIPE_SHADER_TESS_CTRL] && ctx->curr_program->shaders[PIPE_SHADER_TESS_CTRL]->is_generated) - vkCmdPushConstants(batch->state->cmdbuf, ctx->curr_program->base.layout, VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, + VKCTX(CmdPushConstants)(batch->state->cmdbuf, ctx->curr_program->base.layout, VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, offsetof(struct zink_gfx_push_constant, default_inner_level), sizeof(float) * 6, &ctx->tess_levels[0]); - zink_query_update_gs_states(ctx); - if (have_streamout) { for (unsigned i = 0; i < ctx->num_so_targets; i++) { struct zink_so_target *t = zink_so_target(ctx->so_targets[i]); @@ -681,15 +783,10 @@ zink_draw_vbo(struct pipe_context *pctx, } } } - screen->vk.CmdBeginTransformFeedbackEXT(batch->state->cmdbuf, 0, ctx->num_so_targets, counter_buffers, counter_buffer_offsets); + VKCTX(CmdBeginTransformFeedbackEXT)(batch->state->cmdbuf, 0, ctx->num_so_targets, counter_buffers, counter_buffer_offsets); } - if (BATCH_CHANGED) { - ctx->pipeline_changed[0] = false; - zink_select_draw_vbo(ctx); - } - - bool needs_drawid = reads_drawid && ctx->drawid_broken; + bool needs_drawid = reads_drawid && zink_get_last_vertex_key(ctx)->push_drawid; work_count += num_draws; if (index_size > 0) { if (dindirect && dindirect->buffer) { @@ -701,11 +798,11 @@
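/* note on a pattern throughout this patch: direct vkFoo() calls and
 * screen->vk.Foo lookups are rewritten as VKCTX(Foo)/VKSCR(Foo); these are
 * presumably thin macros over the entrypoint table loaded into screen->vk
 * (something like #define VKSCR(fn) screen->vk.fn), so commands dispatch
 * through device-level function pointers rather than the loader trampolines */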
if (dindirect->indirect_draw_count) { struct zink_resource *indirect_draw_count = zink_resource(dindirect->indirect_draw_count); zink_batch_reference_resource_rw(batch, indirect_draw_count, false); - screen->vk.CmdDrawIndexedIndirectCount(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, - indirect_draw_count->obj->buffer, dindirect->indirect_draw_count_offset, - dindirect->draw_count, dindirect->stride); + VKCTX(CmdDrawIndexedIndirectCount)(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, + indirect_draw_count->obj->buffer, dindirect->indirect_draw_count_offset, + dindirect->draw_count, dindirect->stride); } else - vkCmdDrawIndexedIndirect(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, dindirect->draw_count, dindirect->stride); + VKCTX(CmdDrawIndexedIndirect)(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, dindirect->draw_count, dindirect->stride); } else { if (need_index_buffer_unref) draw_indexed_need_index_buffer_unref(ctx, dinfo, draws, num_draws, drawid_offset, needs_drawid); @@ -718,7 +815,7 @@ zink_draw_vbo(struct pipe_context *pctx, update_drawid(ctx, drawid_offset); zink_batch_reference_resource_rw(batch, zink_resource(so_target->base.buffer), false); zink_batch_reference_resource_rw(batch, zink_resource(so_target->counter_buffer), true); - screen->vk.CmdDrawIndirectByteCountEXT(batch->state->cmdbuf, dinfo->instance_count, dinfo->start_instance, + VKCTX(CmdDrawIndirectByteCountEXT)(batch->state->cmdbuf, dinfo->instance_count, dinfo->start_instance, zink_resource(so_target->counter_buffer)->obj->buffer, so_target->counter_buffer_offset, 0, MIN2(so_target->stride, screen->info.tf_props.maxTransformFeedbackBufferDataStride)); } else if (dindirect && dindirect->buffer) { @@ -730,11 +827,11 @@ zink_draw_vbo(struct pipe_context *pctx, if (dindirect->indirect_draw_count) { struct zink_resource *indirect_draw_count = zink_resource(dindirect->indirect_draw_count); zink_batch_reference_resource_rw(batch, indirect_draw_count, false); - screen->vk.CmdDrawIndirectCount(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, + VKCTX(CmdDrawIndirectCount)(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, indirect_draw_count->obj->buffer, dindirect->indirect_draw_count_offset, dindirect->draw_count, dindirect->stride); } else - vkCmdDrawIndirect(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, dindirect->draw_count, dindirect->stride); + VKCTX(CmdDrawIndirect)(batch->state->cmdbuf, indirect->obj->buffer, dindirect->offset, dindirect->draw_count, dindirect->stride); } else { draw<HAS_MULTIDRAW>(ctx, dinfo, draws, num_draws, drawid_offset, needs_drawid); } @@ -749,9 +846,10 @@ zink_draw_vbo(struct pipe_context *pctx, t->counter_buffer_valid = true; } } - screen->vk.CmdEndTransformFeedbackEXT(batch->state->cmdbuf, 0, ctx->num_so_targets, counter_buffers, counter_buffer_offsets); + VKCTX(CmdEndTransformFeedbackEXT)(batch->state->cmdbuf, 0, ctx->num_so_targets, counter_buffers, counter_buffer_offsets); } batch->has_work = true; + batch->last_was_compute = false; ctx->batch.work_count = work_count; /* flush if there's >30k draws */ if (unlikely(work_count >= 30000) || ctx->oom_flush) pctx->flush(pctx, NULL, 0); @@ -767,11 +865,13 @@ zink_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) struct zink_batch *batch = &ctx->batch; update_barriers(ctx, true); - - update_compute_program(ctx); + if (ctx->memory_barrier) + zink_flush_memory_barrier(ctx, true);
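/* note: screen->descriptors_update is a per-screen hook selected by
 * screen->descriptor_mode; in lazy mode this is presumably the
 * zink_descriptors_update_lazy() path updated earlier in this patch */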
if (zink_program_has_descriptors(&ctx->curr_compute->base)) screen->descriptors_update(ctx, true); + if (ctx->di.any_bindless_dirty && ctx->curr_compute->base.dd->bindless) + zink_descriptors_update_bindless(ctx); zink_program_update_compute_pipeline_state(ctx, ctx->curr_compute, info->block); VkPipeline prev_pipeline = ctx->compute_pipeline_state.pipeline; @@ -784,55 +884,83 @@ zink_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) } if (prev_pipeline != pipeline || BATCH_CHANGED) - vkCmdBindPipeline(batch->state->cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + VKCTX(CmdBindPipeline)(batch->state->cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); if (BATCH_CHANGED) { ctx->pipeline_changed[1] = false; zink_select_launch_grid(ctx); } if (BITSET_TEST(ctx->compute_stage->nir->info.system_values_read, SYSTEM_VALUE_WORK_DIM)) - vkCmdPushConstants(batch->state->cmdbuf, ctx->curr_compute->base.layout, VK_SHADER_STAGE_COMPUTE_BIT, + VKCTX(CmdPushConstants)(batch->state->cmdbuf, ctx->curr_compute->base.layout, VK_SHADER_STAGE_COMPUTE_BIT, offsetof(struct zink_cs_push_constant, work_dim), sizeof(uint32_t), &info->work_dim); batch->work_count++; + zink_batch_no_rp(ctx); if (info->indirect) { - vkCmdDispatchIndirect(batch->state->cmdbuf, zink_resource(info->indirect)->obj->buffer, info->indirect_offset); + /* + VK_ACCESS_INDIRECT_COMMAND_READ_BIT specifies read access to indirect command data read as + part of an indirect build, trace, drawing or dispatching command. Such access occurs in the + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT pipeline stage. + + - Chapter 7. Synchronization and Cache Control + */ + check_buffer_barrier(ctx, info->indirect, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT); + VKCTX(CmdDispatchIndirect)(batch->state->cmdbuf, zink_resource(info->indirect)->obj->buffer, info->indirect_offset); zink_batch_reference_resource_rw(batch, zink_resource(info->indirect), false); } else - vkCmdDispatch(batch->state->cmdbuf, info->grid[0], info->grid[1], info->grid[2]); + VKCTX(CmdDispatch)(batch->state->cmdbuf, info->grid[0], info->grid[1], info->grid[2]); batch->has_work = true; + batch->last_was_compute = true; /* flush if there's >30k computes */ if (unlikely(ctx->batch.work_count >= 30000) || ctx->oom_flush) pctx->flush(pctx, NULL, 0); } -template <bool HAS_MULTIDRAW, bool HAS_DYNAMIC_STATE, bool BATCH_CHANGED> +template <bool HAS_MULTIDRAW, bool HAS_DYNAMIC_STATE, bool HAS_DYNAMIC_STATE2, bool HAS_VERTEX_INPUT, bool BATCH_CHANGED> static void -init_batch_changed_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2]) +init_batch_changed_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2][2][2]) { - draw_vbo_array[HAS_MULTIDRAW][HAS_DYNAMIC_STATE][BATCH_CHANGED] = - zink_draw_vbo<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, BATCH_CHANGED>; + draw_vbo_array[HAS_MULTIDRAW][HAS_DYNAMIC_STATE][HAS_DYNAMIC_STATE2][HAS_VERTEX_INPUT][BATCH_CHANGED] = + zink_draw_vbo<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, HAS_DYNAMIC_STATE2, HAS_VERTEX_INPUT, BATCH_CHANGED>; +} + +template <bool HAS_MULTIDRAW, bool HAS_DYNAMIC_STATE, bool HAS_DYNAMIC_STATE2, bool HAS_VERTEX_INPUT> +static void +init_vertex_input_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2][2][2]) +{ + init_batch_changed_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, HAS_DYNAMIC_STATE2, HAS_VERTEX_INPUT, false>(ctx, draw_vbo_array); + init_batch_changed_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, HAS_DYNAMIC_STATE2, HAS_VERTEX_INPUT, true>(ctx, draw_vbo_array); +} + +template <bool HAS_MULTIDRAW, bool HAS_DYNAMIC_STATE, bool HAS_DYNAMIC_STATE2> +static void +init_dynamic_state2_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2][2][2]) +{ + init_vertex_input_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, HAS_DYNAMIC_STATE2, false>(ctx, draw_vbo_array); + init_vertex_input_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, HAS_DYNAMIC_STATE2, true>(ctx, draw_vbo_array); } template <bool HAS_MULTIDRAW, bool HAS_DYNAMIC_STATE> static void -init_dynamic_state_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2]) +init_dynamic_state_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2][2][2]) { - init_batch_changed_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, false>(ctx, draw_vbo_array); - init_batch_changed_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, true>(ctx, draw_vbo_array); + init_dynamic_state2_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, false>(ctx, draw_vbo_array);
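/* note: this template chain fans out to all 2x2x2x2x2 = 32 instantiations of
 * zink_draw_vbo; zink_init_draw_functions() below then copies the slice
 * matching the screen's multidraw/dynamic-state/vertex-input support, leaving
 * only the BATCH_CHANGED variant to be selected at draw time */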
+ init_dynamic_state2_functions<HAS_MULTIDRAW, HAS_DYNAMIC_STATE, true>(ctx, draw_vbo_array); } template <bool HAS_MULTIDRAW> static void -init_multidraw_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2]) +init_multidraw_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2][2][2]) { init_dynamic_state_functions<HAS_MULTIDRAW, false>(ctx, draw_vbo_array); init_dynamic_state_functions<HAS_MULTIDRAW, true>(ctx, draw_vbo_array); } static void -init_all_draw_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2]) +init_all_draw_functions(struct zink_context *ctx, pipe_draw_vbo_func draw_vbo_array[2][2][2][2][2]) { init_multidraw_functions<false>(ctx, draw_vbo_array); init_multidraw_functions<true>(ctx, draw_vbo_array); @@ -869,20 +997,79 @@ zink_invalid_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info unreachable("compute shader not bound"); } +template <unsigned STAGE_MASK> +static uint32_t +hash_gfx_program(const void *key) +{ + const struct zink_shader **shaders = (const struct zink_shader**)key; + uint32_t base_hash = shaders[PIPE_SHADER_VERTEX]->hash ^ shaders[PIPE_SHADER_FRAGMENT]->hash; + if (STAGE_MASK == 0) //VS+FS + return base_hash; + if (STAGE_MASK == 1) //VS+GS+FS + return base_hash ^ shaders[PIPE_SHADER_GEOMETRY]->hash; + /*VS+TCS+FS isn't a thing */ + /*VS+TCS+GS+FS isn't a thing */ + if (STAGE_MASK == 4) //VS+TES+FS + return base_hash ^ shaders[PIPE_SHADER_TESS_EVAL]->hash; + if (STAGE_MASK == 5) //VS+TES+GS+FS + return base_hash ^ shaders[PIPE_SHADER_GEOMETRY]->hash ^ shaders[PIPE_SHADER_TESS_EVAL]->hash; + if (STAGE_MASK == 6) //VS+TCS+TES+FS + return base_hash ^ shaders[PIPE_SHADER_TESS_CTRL]->hash ^ shaders[PIPE_SHADER_TESS_EVAL]->hash; + + /* all stages */ + return base_hash ^ shaders[PIPE_SHADER_GEOMETRY]->hash ^ shaders[PIPE_SHADER_TESS_CTRL]->hash ^ shaders[PIPE_SHADER_TESS_EVAL]->hash; +} + +template <unsigned STAGE_MASK> +static bool +equals_gfx_program(const void *a, const void *b) +{ + const void **sa = (const void**)a; + const void **sb = (const void**)b; + if (STAGE_MASK == 0) //VS+FS + return !memcmp(a, b, sizeof(void*) * 2); + if (STAGE_MASK == 1) //VS+GS+FS + return !memcmp(a, b, sizeof(void*) * 3); + /*VS+TCS+FS isn't a thing */ + /*VS+TCS+GS+FS isn't a thing */ + if (STAGE_MASK == 4) //VS+TES+FS + return sa[PIPE_SHADER_TESS_EVAL] == sb[PIPE_SHADER_TESS_EVAL] && !memcmp(a, b, sizeof(void*) * 2); + if (STAGE_MASK == 5) //VS+TES+GS+FS + return sa[PIPE_SHADER_TESS_EVAL] == sb[PIPE_SHADER_TESS_EVAL] && !memcmp(a, b, sizeof(void*) * 3); + if (STAGE_MASK == 6) //VS+TCS+TES+FS + return !memcmp(&sa[PIPE_SHADER_TESS_CTRL], &sb[PIPE_SHADER_TESS_CTRL], sizeof(void*) * 2) && + !memcmp(a, b, sizeof(void*) * 2); + + /* all stages */ + return !memcmp(a, b, sizeof(void*) * ZINK_SHADER_COUNT); +} + extern "C" void zink_init_draw_functions(struct zink_context *ctx, struct zink_screen *screen) { - pipe_draw_vbo_func draw_vbo_array[2][2][2]; //multidraw, dynamic state, batch changed + pipe_draw_vbo_func draw_vbo_array[2][2][2][2] //multidraw, dynamic state, dynamic state2, dynamic vertex input, + [2]; //batch changed init_all_draw_functions(ctx, draw_vbo_array); memcpy(ctx->draw_vbo, &draw_vbo_array[screen->info.have_EXT_multi_draw] - [screen->info.have_EXT_extended_dynamic_state], + [screen->info.have_EXT_extended_dynamic_state] + [screen->info.have_EXT_extended_dynamic_state2] + [screen->info.have_EXT_vertex_input_dynamic_state], sizeof(ctx->draw_vbo)); /* Bind a fake draw_vbo, so that draw_vbo isn't NULL, which would skip * initialization of callbacks in upper layers (such as
u_threaded_context). */ ctx->base.draw_vbo = zink_invalid_draw_vbo; + + _mesa_hash_table_init(&ctx->program_cache[0], ctx, hash_gfx_program<0>, equals_gfx_program<0>); + _mesa_hash_table_init(&ctx->program_cache[1], ctx, hash_gfx_program<1>, equals_gfx_program<1>); + _mesa_hash_table_init(&ctx->program_cache[2], ctx, hash_gfx_program<2>, equals_gfx_program<2>); + _mesa_hash_table_init(&ctx->program_cache[3], ctx, hash_gfx_program<3>, equals_gfx_program<3>); + _mesa_hash_table_init(&ctx->program_cache[4], ctx, hash_gfx_program<4>, equals_gfx_program<4>); + _mesa_hash_table_init(&ctx->program_cache[5], ctx, hash_gfx_program<5>, equals_gfx_program<5>); + _mesa_hash_table_init(&ctx->program_cache[6], ctx, hash_gfx_program<6>, equals_gfx_program<6>); + _mesa_hash_table_init(&ctx->program_cache[7], ctx, hash_gfx_program<7>, equals_gfx_program<7>); } void diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_fence.c b/mesa 3D driver/src/gallium/drivers/zink/zink_fence.c index e0fadd86da..b2118618bc 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_fence.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_fence.c @@ -34,9 +34,7 @@ static void destroy_fence(struct zink_screen *screen, struct zink_tc_fence *mfence) { - struct zink_batch_state *bs = zink_batch_state(mfence->fence); mfence->fence = NULL; - zink_batch_state_reference(screen, &bs, NULL); tc_unflushed_batch_token_reference(&mfence->tc_token, NULL); FREE(mfence); } @@ -100,9 +98,6 @@ tc_fence_finish(struct zink_context *ctx, struct zink_tc_fence *mfence, uint64_t threaded_context_flush(&ctx->base, mfence->tc_token, *timeout_ns == 0); } - if (!timeout_ns) - return false; - /* this is a tc mfence, so we're just waiting on the queue mfence to complete * after being signaled by the real mfence */ @@ -136,9 +131,9 @@ zink_vkfence_wait(struct zink_screen *screen, struct zink_fence *fence, uint64_t VkResult ret; if (timeout_ns) - ret = vkWaitForFences(screen->dev, 1, &fence->fence, VK_TRUE, timeout_ns); + ret = VKSCR(WaitForFences)(screen->dev, 1, &fence->fence, VK_TRUE, timeout_ns); else - ret = vkGetFenceStatus(screen->dev, fence->fence); + ret = VKSCR(GetFenceStatus)(screen->dev, fence->fence); success = zink_screen_handle_vkresult(screen, ret); if (success) { @@ -206,7 +201,7 @@ zink_fence_server_sync(struct pipe_context *pctx, struct pipe_fence_handle *pfen { struct zink_tc_fence *mfence = zink_tc_fence(pfence); - if (pctx && mfence->deferred_ctx == pctx) + if (mfence->deferred_ctx == pctx) return; if (mfence->deferred_ctx) { diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_format.c b/mesa 3D driver/src/gallium/drivers/zink/zink_format.c index 3e1bea6572..358f409922 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_format.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_format.c @@ -1,3 +1,4 @@ +#include "util/format/u_format.h" #include "zink_format.h" static const VkFormat formats[PIPE_FORMAT_COUNT] = { @@ -147,8 +148,79 @@ static const VkFormat formats[PIPE_FORMAT_COUNT] = { [PIPE_FORMAT_BPTC_RGB_UFLOAT] = VK_FORMAT_BC6H_UFLOAT_BLOCK, }; +enum pipe_format +zink_decompose_vertex_format(enum pipe_format format) +{ + const struct util_format_description *desc = util_format_description(format); + unsigned first_non_void = util_format_get_first_non_void_channel(format); + enum pipe_format new_format; + assert(first_non_void == 0); + if (!desc->is_array) + return PIPE_FORMAT_NONE; + if (desc->is_unorm) { + enum pipe_format unorm_formats[] = { + PIPE_FORMAT_R8_UNORM, + PIPE_FORMAT_R16_UNORM, + 
PIPE_FORMAT_R32_UNORM }; return unorm_formats[desc->channel[first_non_void].size >> 4]; } else if (desc->is_snorm) { enum pipe_format snorm_formats[] = { PIPE_FORMAT_R8_SNORM, PIPE_FORMAT_R16_SNORM, PIPE_FORMAT_R32_SNORM }; return snorm_formats[desc->channel[first_non_void].size >> 4]; } else { enum pipe_format uint_formats[][3] = { {PIPE_FORMAT_R8_USCALED, PIPE_FORMAT_R16_USCALED, PIPE_FORMAT_R32_USCALED}, {PIPE_FORMAT_R8_UINT, PIPE_FORMAT_R16_UINT, PIPE_FORMAT_R32_UINT}, }; enum pipe_format sint_formats[][3] = { {PIPE_FORMAT_R8_SSCALED, PIPE_FORMAT_R16_SSCALED, PIPE_FORMAT_R32_SSCALED}, {PIPE_FORMAT_R8_SINT, PIPE_FORMAT_R16_SINT, PIPE_FORMAT_R32_SINT}, }; switch (desc->channel[first_non_void].type) { case UTIL_FORMAT_TYPE_UNSIGNED: return uint_formats[desc->channel[first_non_void].pure_integer][desc->channel[first_non_void].size >> 4]; case UTIL_FORMAT_TYPE_SIGNED: return sint_formats[desc->channel[first_non_void].pure_integer][desc->channel[first_non_void].size >> 4]; case UTIL_FORMAT_TYPE_FLOAT: return desc->channel[first_non_void].size == 16 ? PIPE_FORMAT_R16_FLOAT : PIPE_FORMAT_R32_FLOAT; break; default: return PIPE_FORMAT_NONE; } } return new_format; } VkFormat zink_pipe_format_to_vk_format(enum pipe_format format) { return formats[format]; } + + bool zink_format_is_voidable_rgba_variant(enum pipe_format format) { + const struct util_format_description *desc = util_format_description(format); + unsigned chan; + + if(desc->block.width != 1 || + desc->block.height != 1 || + (desc->block.bits != 32 && desc->block.bits != 64)) + return false; + + if (desc->nr_channels != 4) + return false; + + unsigned size = desc->channel[0].size; + for(chan = 0; chan < 4; ++chan) { + if(desc->channel[chan].size != size) + return false; + } + + return true; +} diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_format.h b/mesa 3D driver/src/gallium/drivers/zink/zink_format.h index 966ee9bd9a..3324265177 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_format.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_format.h @@ -26,9 +26,15 @@ #include "pipe/p_format.h" +#include <stdbool.h> #include <vulkan/vulkan.h> +enum pipe_format +zink_decompose_vertex_format(enum pipe_format format); + VkFormat zink_pipe_format_to_vk_format(enum pipe_format format); +bool +zink_format_is_voidable_rgba_variant(enum pipe_format format); #endif diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.c b/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.c index b409fab4af..192450dd37 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.c @@ -28,6 +28,7 @@ #include "zink_screen.h" #include "zink_surface.h" +#include "util/u_framebuffer.h" #include "util/u_memory.h" #include "util/u_string.h" @@ -37,20 +38,163 @@ zink_destroy_framebuffer(struct zink_screen *screen, { hash_table_foreach(&fb->objects, he) { #if defined(_WIN64) || defined(__x86_64__) - vkDestroyFramebuffer(screen->dev, he->data, NULL); + VKSCR(DestroyFramebuffer)(screen->dev, he->data, NULL); #else VkFramebuffer *ptr = he->data; - vkDestroyFramebuffer(screen->dev, *ptr, NULL); + VKSCR(DestroyFramebuffer)(screen->dev, *ptr, NULL); #endif } - if (fb->null_surface) - pipe_resource_reference(&fb->null_surface->texture, NULL); - zink_surface_reference(screen, (struct zink_surface**)&fb->null_surface, NULL); - ralloc_free(fb); } +void +zink_init_framebuffer_imageless(struct zink_screen *screen, struct
zink_framebuffer *fb, struct zink_render_pass *rp) +{ + VkFramebuffer ret; + + if (fb->rp == rp) + return; + + uint32_t hash = _mesa_hash_pointer(rp); + + struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&fb->objects, hash, rp); + if (he) { +#if defined(_WIN64) || defined(__x86_64__) + ret = (VkFramebuffer)he->data; +#else + VkFramebuffer *ptr = he->data; + ret = *ptr; +#endif + goto out; + } + + assert(rp->state.num_cbufs + rp->state.have_zsbuf + rp->state.num_cresolves + rp->state.num_zsresolves == fb->state.num_attachments); + + VkFramebufferCreateInfo fci; + fci.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + fci.flags = VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT; + fci.renderPass = rp->render_pass; + fci.attachmentCount = fb->state.num_attachments; + fci.pAttachments = NULL; + fci.width = fb->state.width; + fci.height = fb->state.height; + fci.layers = fb->state.layers + 1; + + VkFramebufferAttachmentsCreateInfo attachments; + attachments.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENTS_CREATE_INFO; + attachments.pNext = NULL; + attachments.attachmentImageInfoCount = fb->state.num_attachments; + attachments.pAttachmentImageInfos = fb->infos; + fci.pNext = &attachments; + + if (VKSCR(CreateFramebuffer)(screen->dev, &fci, NULL, &ret) != VK_SUCCESS) + return; +#if defined(_WIN64) || defined(__x86_64__) + _mesa_hash_table_insert_pre_hashed(&fb->objects, hash, rp, ret); +#else + VkFramebuffer *ptr = ralloc(fb, VkFramebuffer); + if (!ptr) { + VKSCR(DestroyFramebuffer)(screen->dev, ret, NULL); + return; + } + *ptr = ret; + _mesa_hash_table_insert_pre_hashed(&fb->objects, hash, rp, ptr); +#endif +out: + fb->rp = rp; + fb->fb = ret; +} + +static void +populate_attachment_info(VkFramebufferAttachmentImageInfo *att, struct zink_surface_info *info) +{ + att->sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_ATTACHMENT_IMAGE_INFO; + att->pNext = NULL; + memcpy(&att->flags, &info->flags, offsetof(struct zink_surface_info, format)); + att->viewFormatCount = 1; + att->pViewFormats = &info->format; +} + +static struct zink_framebuffer * +create_framebuffer_imageless(struct zink_context *ctx, struct zink_framebuffer_state *state) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + struct zink_framebuffer *fb = rzalloc(ctx, struct zink_framebuffer); + if (!fb) + return NULL; + pipe_reference_init(&fb->reference, 1); + + if (!_mesa_hash_table_init(&fb->objects, fb, _mesa_hash_pointer, _mesa_key_pointer_equal)) + goto fail; + memcpy(&fb->state, state, sizeof(struct zink_framebuffer_state)); + for (int i = 0; i < state->num_attachments; i++) + populate_attachment_info(&fb->infos[i], &fb->state.infos[i]); + + return fb; +fail: + zink_destroy_framebuffer(screen, fb); + return NULL; +} + +struct zink_framebuffer * +zink_get_framebuffer_imageless(struct zink_context *ctx) +{ + assert(zink_screen(ctx->base.screen)->info.have_KHR_imageless_framebuffer); + + struct zink_framebuffer_state state; + const unsigned cresolve_offset = ctx->fb_state.nr_cbufs + !!ctx->fb_state.zsbuf; + unsigned num_resolves = 0; + for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { + struct pipe_surface *psurf = ctx->fb_state.cbufs[i]; + if (!psurf) + psurf = ctx->dummy_surface[util_logbase2_ceil(ctx->gfx_pipeline_state.rast_samples+1)]; + struct zink_surface *surface = zink_csurface(psurf); + struct zink_surface *transient = zink_transient_surface(psurf); + if (transient) { + memcpy(&state.infos[i], &transient->info, sizeof(transient->info)); + memcpy(&state.infos[cresolve_offset + i], &surface->info, sizeof(surface->info)); 
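+            /* MSAA resolve layout: slot [i] holds the transient multisampled view,
+             * slot [cresolve_offset + i] holds the base surface it resolves into */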
+ num_resolves++; + } else { + memcpy(&state.infos[i], &surface->info, sizeof(surface->info)); + } + } + + state.num_attachments = ctx->fb_state.nr_cbufs; + const unsigned zsresolve_offset = cresolve_offset + num_resolves; + if (ctx->fb_state.zsbuf) { + struct pipe_surface *psurf = ctx->fb_state.zsbuf; + struct zink_surface *surface = zink_csurface(psurf); + struct zink_surface *transient = zink_transient_surface(psurf); + if (transient) { + memcpy(&state.infos[state.num_attachments], &transient->info, sizeof(transient->info)); + memcpy(&state.infos[zsresolve_offset], &surface->info, sizeof(surface->info)); + num_resolves++; + } else { + memcpy(&state.infos[state.num_attachments], &surface->info, sizeof(surface->info)); + } + state.num_attachments++; + } + + /* avoid bitfield explosion */ + assert(state.num_attachments + num_resolves < 16); + state.num_attachments += num_resolves; + state.width = MAX2(ctx->fb_state.width, 1); + state.height = MAX2(ctx->fb_state.height, 1); + state.layers = MAX2(util_framebuffer_get_num_layers(&ctx->fb_state), 1) - 1; + state.samples = ctx->fb_state.samples - 1; + + struct zink_framebuffer *fb; + struct hash_entry *entry = _mesa_hash_table_search(&ctx->framebuffer_cache, &state); + if (entry) + return entry->data; + + fb = create_framebuffer_imageless(ctx, &state); + _mesa_hash_table_insert(&ctx->framebuffer_cache, &fb->state, fb); + + return fb; +} + void zink_init_framebuffer(struct zink_screen *screen, struct zink_framebuffer *fb, struct zink_render_pass *rp) { @@ -72,6 +216,8 @@ zink_init_framebuffer(struct zink_screen *screen, struct zink_framebuffer *fb, s goto out; } + assert(rp->state.num_cbufs + rp->state.have_zsbuf + rp->state.num_cresolves + rp->state.num_zsresolves == fb->state.num_attachments); + VkFramebufferCreateInfo fci = {0}; fci.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; fci.renderPass = rp->render_pass; @@ -79,16 +225,16 @@ zink_init_framebuffer(struct zink_screen *screen, struct zink_framebuffer *fb, s fci.pAttachments = fb->state.attachments; fci.width = fb->state.width; fci.height = fb->state.height; - fci.layers = fb->state.layers; + fci.layers = fb->state.layers + 1; - if (vkCreateFramebuffer(screen->dev, &fci, NULL, &ret) != VK_SUCCESS) + if (VKSCR(CreateFramebuffer)(screen->dev, &fci, NULL, &ret) != VK_SUCCESS) return; #if defined(_WIN64) || defined(__x86_64__) _mesa_hash_table_insert_pre_hashed(&fb->objects, hash, rp, ret); #else VkFramebuffer *ptr = ralloc(fb, VkFramebuffer); if (!ptr) { - vkDestroyFramebuffer(screen->dev, ret, NULL); + VKSCR(DestroyFramebuffer)(screen->dev, ret, NULL); return; } *ptr = ret; @@ -99,10 +245,10 @@ zink_init_framebuffer(struct zink_screen *screen, struct zink_framebuffer *fb, s fb->fb = ret; } -struct zink_framebuffer * -zink_create_framebuffer(struct zink_context *ctx, - struct zink_framebuffer_state *state, - struct pipe_surface **attachments) +static struct zink_framebuffer * +create_framebuffer(struct zink_context *ctx, + struct zink_framebuffer_state *state, + struct pipe_surface **attachments) { struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_framebuffer *fb = rzalloc(NULL, struct zink_framebuffer); @@ -113,17 +259,15 @@ zink_create_framebuffer(struct zink_context *ctx, for (int i = 0; i < state->num_attachments; i++) { struct zink_surface *surf; if (state->attachments[i]) { - surf = zink_surface(attachments[i]); + surf = zink_csurface(attachments[i]); /* no ref! 
*/ fb->surfaces[i] = attachments[i]; num_attachments++; + util_dynarray_append(&surf->framebuffer_refs, struct zink_framebuffer*, fb); } else { - if (!fb->null_surface) - fb->null_surface = zink_surface_create_null(ctx, PIPE_TEXTURE_2D, state->width, state->height, state->samples); - surf = zink_surface(fb->null_surface); - state->attachments[i] = zink_surface(fb->null_surface)->image_view; + surf = zink_csurface(ctx->dummy_surface[util_logbase2_ceil(state->samples+1)]); + state->attachments[i] = surf->image_view; } - util_dynarray_append(&surf->framebuffer_refs, struct zink_framebuffer*, fb); } pipe_reference_init(&fb->reference, 1 + num_attachments); @@ -142,3 +286,86 @@ debug_describe_zink_framebuffer(char* buf, const struct zink_framebuffer *ptr) { sprintf(buf, "zink_framebuffer"); } + +struct zink_framebuffer * +zink_get_framebuffer(struct zink_context *ctx) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + + assert(!screen->info.have_KHR_imageless_framebuffer); + + struct pipe_surface *attachments[2 * (PIPE_MAX_COLOR_BUFS + 1)] = {0}; + const unsigned cresolve_offset = ctx->fb_state.nr_cbufs + !!ctx->fb_state.zsbuf; + unsigned num_resolves = 0; + + struct zink_framebuffer_state state = {0}; + for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) { + struct pipe_surface *psurf = ctx->fb_state.cbufs[i]; + if (psurf) { + struct zink_surface *surf = zink_csurface(psurf); + struct zink_surface *transient = zink_transient_surface(psurf); + if (transient) { + state.attachments[i] = transient->image_view; + state.attachments[cresolve_offset + i] = surf->image_view; + attachments[cresolve_offset + i] = psurf; + psurf = &transient->base; + num_resolves++; + } else { + state.attachments[i] = surf->image_view; + } + } else { + state.attachments[i] = VK_NULL_HANDLE; + } + attachments[i] = psurf; + } + + state.num_attachments = ctx->fb_state.nr_cbufs; + const unsigned zsresolve_offset = cresolve_offset + num_resolves; + if (ctx->fb_state.zsbuf) { + struct pipe_surface *psurf = ctx->fb_state.zsbuf; + if (psurf) { + struct zink_surface *surf = zink_csurface(psurf); + struct zink_surface *transient = zink_transient_surface(psurf); + if (transient) { + state.attachments[state.num_attachments] = transient->image_view; + state.attachments[zsresolve_offset] = surf->image_view; + attachments[zsresolve_offset] = psurf; + psurf = &transient->base; + num_resolves++; + } else { + state.attachments[state.num_attachments] = surf->image_view; + } + } else { + state.attachments[state.num_attachments] = VK_NULL_HANDLE; + } + attachments[state.num_attachments++] = psurf; + } + + /* avoid bitfield explosion */ + assert(state.num_attachments + num_resolves < 16); + state.num_attachments += num_resolves; + state.width = MAX2(ctx->fb_state.width, 1); + state.height = MAX2(ctx->fb_state.height, 1); + state.layers = MAX2(util_framebuffer_get_num_layers(&ctx->fb_state), 1) - 1; + state.samples = ctx->fb_state.samples - 1; + + struct zink_framebuffer *fb; + simple_mtx_lock(&screen->framebuffer_mtx); + struct hash_entry *entry = _mesa_hash_table_search(&screen->framebuffer_cache, &state); + if (entry) { + fb = (void*)entry->data; + struct zink_framebuffer *fb_ref = NULL; + /* this gains 1 ref every time we reuse it */ + zink_framebuffer_reference(screen, &fb_ref, fb); + } else { + /* this adds 1 extra ref on creation because all newly-created framebuffers are + * going to be bound; necessary to handle framebuffers which have no "real" attachments + * and are only using null surfaces since the only ref they get 
is the extra one here + */ + fb = create_framebuffer(ctx, &state, attachments); + _mesa_hash_table_insert(&screen->framebuffer_cache, &fb->state, fb); + } + simple_mtx_unlock(&screen->framebuffer_mtx); + + return fb; +} diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.h b/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.h index 73212cc3af..4fb8bf67b9 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_framebuffer.h @@ -36,10 +36,14 @@ struct zink_render_pass; struct zink_framebuffer_state { uint32_t width; - uint16_t height, layers; - uint8_t samples; - uint8_t num_attachments; - VkImageView attachments[PIPE_MAX_COLOR_BUFS + 1]; + uint16_t height; + uint32_t layers:6; + uint32_t samples:6; + uint32_t num_attachments:4; + union { + VkImageView attachments[PIPE_MAX_COLOR_BUFS + 1]; + struct zink_surface_info infos[PIPE_MAX_COLOR_BUFS + 1]; + }; }; struct zink_framebuffer { @@ -49,19 +53,18 @@ struct zink_framebuffer { VkFramebuffer fb; struct zink_render_pass *rp; - struct pipe_surface *surfaces[PIPE_MAX_COLOR_BUFS + 1]; - struct pipe_surface *null_surface; /* for use with unbound attachments */ struct zink_framebuffer_state state; + union { + struct pipe_surface *surfaces[PIPE_MAX_COLOR_BUFS + 1]; + VkFramebufferAttachmentImageInfo infos[PIPE_MAX_COLOR_BUFS + 1]; + }; struct hash_table objects; }; -struct zink_framebuffer * -zink_create_framebuffer(struct zink_context *ctx, - struct zink_framebuffer_state *fb, - struct pipe_surface **attachments); - void zink_init_framebuffer(struct zink_screen *screen, struct zink_framebuffer *fb, struct zink_render_pass *rp); +void +zink_init_framebuffer_imageless(struct zink_screen *screen, struct zink_framebuffer *fb, struct zink_render_pass *rp); void zink_destroy_framebuffer(struct zink_screen *screen, @@ -87,4 +90,9 @@ zink_framebuffer_reference(struct zink_screen *screen, return ret; } +struct zink_framebuffer * +zink_get_framebuffer_imageless(struct zink_context *ctx); + +struct zink_framebuffer * +zink_get_framebuffer(struct zink_context *ctx); #endif diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.c b/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.c index d60bcc2041..b16b64701d 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.c @@ -51,6 +51,7 @@ zink_create_gfx_pipeline(struct zink_screen *screen, struct zink_gfx_pipeline_state *state, VkPrimitiveTopology primitive_topology) { + struct zink_rasterizer_hw_state *hw_rast_state = (void*)state; VkPipelineVertexInputStateCreateInfo vertex_input_state; if (!screen->info.have_EXT_vertex_input_dynamic_state) { memset(&vertex_input_state, 0, sizeof(vertex_input_state)); @@ -73,66 +74,77 @@ zink_create_gfx_pipeline(struct zink_screen *screen, VkPipelineInputAssemblyStateCreateInfo primitive_state = {0}; primitive_state.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; primitive_state.topology = primitive_topology; - switch (primitive_topology) { - case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: - case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: - case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: - case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: - case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: - case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: - if (state->primitive_restart) - debug_printf("restart_index set with unsupported primitive topology %u\n", primitive_topology); - 
primitive_state.primitiveRestartEnable = VK_FALSE; - break; - default: - primitive_state.primitiveRestartEnable = state->primitive_restart ? VK_TRUE : VK_FALSE; + if (!screen->info.have_EXT_extended_dynamic_state2) { + switch (primitive_topology) { + case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST: + case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY: + case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST: + if (state->primitive_restart) + debug_printf("restart_index set with unsupported primitive topology %u\n", primitive_topology); + primitive_state.primitiveRestartEnable = VK_FALSE; + break; + default: + primitive_state.primitiveRestartEnable = state->primitive_restart ? VK_TRUE : VK_FALSE; + } } VkPipelineColorBlendAttachmentState blend_att[PIPE_MAX_COLOR_BUFS]; VkPipelineColorBlendStateCreateInfo blend_state = {0}; blend_state.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - if (state->void_alpha_attachments) { - for (unsigned i = 0; i < state->num_attachments; i++) { - blend_att[i] = state->blend_state->attachments[i]; - if (state->void_alpha_attachments & BITFIELD_BIT(i)) { - blend_att[i].dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; - blend_att[i].srcColorBlendFactor = clamp_void_blend_factor(blend_att[i].srcColorBlendFactor); - blend_att[i].dstColorBlendFactor = clamp_void_blend_factor(blend_att[i].dstColorBlendFactor); + if (state->blend_state) { + unsigned num_attachments = state->render_pass->state.num_rts; + if (state->render_pass->state.have_zsbuf) + num_attachments--; + if (state->void_alpha_attachments) { + for (unsigned i = 0; i < num_attachments; i++) { + blend_att[i] = state->blend_state->attachments[i]; + if (state->void_alpha_attachments & BITFIELD_BIT(i)) { + blend_att[i].dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; + blend_att[i].srcColorBlendFactor = clamp_void_blend_factor(blend_att[i].srcColorBlendFactor); + blend_att[i].dstColorBlendFactor = clamp_void_blend_factor(blend_att[i].dstColorBlendFactor); + } } - } - blend_state.pAttachments = blend_att; - } else - blend_state.pAttachments = state->blend_state->attachments; - blend_state.attachmentCount = state->num_attachments; - blend_state.logicOpEnable = state->blend_state->logicop_enable; - blend_state.logicOp = state->blend_state->logicop_func; + blend_state.pAttachments = blend_att; + } else + blend_state.pAttachments = state->blend_state->attachments; + blend_state.attachmentCount = num_attachments; + blend_state.logicOpEnable = state->blend_state->logicop_enable; + blend_state.logicOp = state->blend_state->logicop_func; + } VkPipelineMultisampleStateCreateInfo ms_state = {0}; ms_state.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - ms_state.rasterizationSamples = state->rast_samples; - ms_state.alphaToCoverageEnable = state->blend_state->alpha_to_coverage; - ms_state.alphaToOneEnable = state->blend_state->alpha_to_one; + ms_state.rasterizationSamples = state->rast_samples + 1; + if (state->blend_state) { + ms_state.alphaToCoverageEnable = state->blend_state->alpha_to_coverage; + if (state->blend_state->alpha_to_one && !screen->info.feats.features.alphaToOne) + warn_missing_feature("alphaToOne"); + ms_state.alphaToOneEnable = state->blend_state->alpha_to_one; + } ms_state.pSampleMask = state->sample_mask ? 
&state->sample_mask : NULL; - if (state->rast_state->force_persample_interp) { + if (hw_rast_state->force_persample_interp) { ms_state.sampleShadingEnable = VK_TRUE; ms_state.minSampleShading = 1.0; } VkPipelineViewportStateCreateInfo viewport_state = {0}; viewport_state.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; - viewport_state.viewportCount = state->num_viewports; + viewport_state.viewportCount = screen->info.have_EXT_extended_dynamic_state ? 0 : state->dyn_state1.num_viewports; viewport_state.pViewports = NULL; - viewport_state.scissorCount = state->num_viewports; + viewport_state.scissorCount = screen->info.have_EXT_extended_dynamic_state ? 0 : state->dyn_state1.num_viewports; viewport_state.pScissors = NULL; VkPipelineRasterizationStateCreateInfo rast_state = {0}; rast_state.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - rast_state.depthClampEnable = state->rast_state->depth_clamp; - rast_state.rasterizerDiscardEnable = state->rast_state->rasterizer_discard; - rast_state.polygonMode = state->rast_state->polygon_mode; - rast_state.cullMode = state->rast_state->cull_mode; - rast_state.frontFace = state->front_face; + rast_state.depthClampEnable = hw_rast_state->depth_clamp; + rast_state.rasterizerDiscardEnable = hw_rast_state->rasterizer_discard; + rast_state.polygonMode = hw_rast_state->polygon_mode; + rast_state.cullMode = hw_rast_state->cull_mode; + rast_state.frontFace = state->dyn_state1.front_face; rast_state.depthBiasEnable = VK_TRUE; rast_state.depthBiasConstantFactor = 0.0; @@ -142,26 +154,27 @@ zink_create_gfx_pipeline(struct zink_screen *screen, VkPipelineRasterizationProvokingVertexStateCreateInfoEXT pv_state; pv_state.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT; - pv_state.provokingVertexMode = state->rast_state->pv_mode; - if (screen->info.have_EXT_provoking_vertex && - state->rast_state->pv_mode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) { + pv_state.provokingVertexMode = hw_rast_state->pv_last ? 
+ VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT : + VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT; + if (screen->info.have_EXT_provoking_vertex && hw_rast_state->pv_last) { pv_state.pNext = rast_state.pNext; rast_state.pNext = &pv_state; } VkPipelineDepthStencilStateCreateInfo depth_stencil_state = {0}; depth_stencil_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depth_stencil_state.depthTestEnable = state->depth_stencil_alpha_state->depth_test; - depth_stencil_state.depthCompareOp = state->depth_stencil_alpha_state->depth_compare_op; - depth_stencil_state.depthBoundsTestEnable = state->depth_stencil_alpha_state->depth_bounds_test; - depth_stencil_state.minDepthBounds = state->depth_stencil_alpha_state->min_depth_bounds; - depth_stencil_state.maxDepthBounds = state->depth_stencil_alpha_state->max_depth_bounds; - depth_stencil_state.stencilTestEnable = state->depth_stencil_alpha_state->stencil_test; - depth_stencil_state.front = state->depth_stencil_alpha_state->stencil_front; - depth_stencil_state.back = state->depth_stencil_alpha_state->stencil_back; - depth_stencil_state.depthWriteEnable = state->depth_stencil_alpha_state->depth_write; + depth_stencil_state.depthTestEnable = state->dyn_state1.depth_stencil_alpha_state->depth_test; + depth_stencil_state.depthCompareOp = state->dyn_state1.depth_stencil_alpha_state->depth_compare_op; + depth_stencil_state.depthBoundsTestEnable = state->dyn_state1.depth_stencil_alpha_state->depth_bounds_test; + depth_stencil_state.minDepthBounds = state->dyn_state1.depth_stencil_alpha_state->min_depth_bounds; + depth_stencil_state.maxDepthBounds = state->dyn_state1.depth_stencil_alpha_state->max_depth_bounds; + depth_stencil_state.stencilTestEnable = state->dyn_state1.depth_stencil_alpha_state->stencil_test; + depth_stencil_state.front = state->dyn_state1.depth_stencil_alpha_state->stencil_front; + depth_stencil_state.back = state->dyn_state1.depth_stencil_alpha_state->stencil_back; + depth_stencil_state.depthWriteEnable = state->dyn_state1.depth_stencil_alpha_state->depth_write; - VkDynamicState dynamicStateEnables[24] = { + VkDynamicState dynamicStateEnables[30] = { VK_DYNAMIC_STATE_LINE_WIDTH, VK_DYNAMIC_STATE_DEPTH_BIAS, VK_DYNAMIC_STATE_BLEND_CONSTANTS, @@ -180,26 +193,29 @@ zink_create_gfx_pipeline(struct zink_screen *screen, dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK; dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_STENCIL_OP_EXT; dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT; - dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT; dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_FRONT_FACE_EXT; + dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT; if (state->sample_locations_enabled) dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT; } else { dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_VIEWPORT; dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_SCISSOR; } - if (screen->info.have_EXT_vertex_input_dynamic_state) { + if (screen->info.have_EXT_vertex_input_dynamic_state) dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_VERTEX_INPUT_EXT; - } + else if (screen->info.have_EXT_extended_dynamic_state) + dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT; + if (screen->info.have_EXT_extended_dynamic_state2) + dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT; 
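+   /* EXT_vertex_input_dynamic_state covers strides too, so the stride-only dynamic
+    * state is just the fallback; with extended dynamic state2 the restart toggle is
+    * set at draw time, which is why primitiveRestartEnable is left unset above */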
VkPipelineRasterizationLineStateCreateInfoEXT rast_line_state;
    if (screen->info.have_EXT_line_rasterization) {
       rast_line_state.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT;
       rast_line_state.pNext = rast_state.pNext;
       rast_line_state.stippledLineEnable = VK_FALSE;
-      rast_line_state.lineRasterizationMode = state->rast_state->line_mode;
+      rast_line_state.lineRasterizationMode = hw_rast_state->line_mode;
-      if (state->rast_state->line_stipple_enable) {
+      if (hw_rast_state->line_stipple_enable) {
          dynamicStateEnables[state_count++] = VK_DYNAMIC_STATE_LINE_STIPPLE_EXT;
          rast_line_state.stippledLineEnable = VK_TRUE;
       }
@@ -229,7 +245,7 @@ zink_create_gfx_pipeline(struct zink_screen *screen,
    VkPipelineTessellationDomainOriginStateCreateInfo tdci = {0};
    if (prog->shaders[PIPE_SHADER_TESS_CTRL] && prog->shaders[PIPE_SHADER_TESS_EVAL]) {
       tci.sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO;
-      tci.patchControlPoints = state->vertices_per_patch;
+      tci.patchControlPoints = state->vertices_per_patch + 1;
       pci.pTessellationState = &tci;
       tci.pNext = &tdci;
       tdci.sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO;
@@ -260,7 +276,6 @@ zink_create_gfx_pipeline(struct zink_screen *screen,
       debug_printf("vkCreateGraphicsPipelines failed\n");
       return VK_NULL_HANDLE;
    }
-   zink_screen_update_pipeline_cache(screen, &prog->base);
    return pipeline;
 }
diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.h b/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.h
index 6946480802..4acc6c4428 100644
--- a/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.h
+++ b/mesa 3D driver/src/gallium/drivers/zink/zink_pipeline.h
@@ -27,6 +27,8 @@ #include <vulkan/vulkan.h>
 #include "pipe/p_state.h"
+#include "zink_shader_keys.h"
+#include "zink_state.h"
 struct zink_blend_state;
 struct zink_depth_stencil_alpha_state;
@@ -38,37 +40,33 @@ struct zink_screen;
 struct zink_vertex_elements_state;
 struct zink_gfx_pipeline_state {
-   struct zink_render_pass *render_pass;
-
-   uint8_t void_alpha_attachments:PIPE_MAX_COLOR_BUFS;
-   uint32_t num_attachments;
-   struct zink_blend_state *blend_state;
-
-   struct zink_rasterizer_hw_state *rast_state;
-
+   uint32_t rast_state : ZINK_RAST_HW_STATE_SIZE; //zink_rasterizer_hw_state
+   uint32_t vertices_per_patch:5;
+   uint32_t rast_samples:7;
+   uint32_t void_alpha_attachments:PIPE_MAX_COLOR_BUFS;
    VkSampleMask sample_mask;
-   uint8_t rast_samples;
-   uint8_t vertices_per_patch;
-   unsigned num_viewports;
-
-   bool primitive_restart;
+   unsigned rp_state;
+   uint32_t blend_id;
 
    /* Pre-hashed value for table lookup, invalid when zero.
* Members after this point are not included in pipeline state hash key */ uint32_t hash; bool dirty; - struct zink_depth_stencil_alpha_hw_state *depth_stencil_alpha_state; //non-dynamic state - VkFrontFace front_face; + struct { + struct zink_depth_stencil_alpha_hw_state *depth_stencil_alpha_state; //non-dynamic state + VkFrontFace front_face; + unsigned num_viewports; + } dyn_state1; + + bool primitive_restart; //dynamic state2 VkShaderModule modules[PIPE_SHADER_TYPES - 1]; - uint32_t module_hash; - - uint32_t combined_hash; - bool combined_dirty; + bool modules_changed; struct zink_vertex_elements_hw_state *element_state; + uint32_t vertex_hash; uint32_t final_hash; @@ -76,9 +74,18 @@ struct zink_gfx_pipeline_state { uint32_t vertex_strides[PIPE_MAX_ATTRIBS]; bool sample_locations_enabled; bool have_EXT_extended_dynamic_state; - + bool have_EXT_extended_dynamic_state2; + uint8_t has_points; //either gs outputs points or prim type is points + struct { + struct zink_shader_key key[5]; + struct zink_shader_key last_vertex; + } shader_keys; + struct zink_blend_state *blend_state; + struct zink_render_pass *render_pass; VkPipeline pipeline; - enum pipe_prim_type mode : 8; + uint8_t patch_vertices; + unsigned idx : 8; + enum pipe_prim_type gfx_prim_mode; //pending mode }; struct zink_compute_pipeline_state { diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_program.c b/mesa 3D driver/src/gallium/drivers/zink/zink_program.c index f0e12314b6..393f2c10a7 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_program.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_program.c @@ -37,6 +37,7 @@ #include "util/set.h" #include "util/u_debug.h" #include "util/u_memory.h" +#include "util/u_prim.h" #include "tgsi/tgsi_from_mesa.h" /* for pipeline cache */ @@ -65,241 +66,135 @@ debug_describe_zink_compute_program(char *buf, const struct zink_compute_program sprintf(buf, "zink_compute_program"); } -/* copied from iris */ -struct keybox { - uint16_t size; - gl_shader_stage stage; - uint8_t data[0]; -}; - -static struct keybox * -make_keybox(void *mem_ctx, gl_shader_stage stage, const void *key, uint32_t key_size, void *base, uint32_t base_size) +static bool +shader_key_matches(const struct zink_shader_module *zm, const struct zink_shader_key *key, unsigned num_uniforms) { - struct keybox *keybox = - ralloc_size(mem_ctx, sizeof(struct keybox) + key_size + base_size); - - keybox->stage = stage; - keybox->size = key_size + base_size; - memcpy(keybox->data, key, key_size); - if (base_size) - memcpy(&keybox->data[key_size], base, base_size); - return keybox; + if (zm->key_size != key->size || zm->num_uniforms != num_uniforms) + return false; + return !memcmp(zm->key, key, zm->key_size) && + (!num_uniforms || !memcmp(zm->key + zm->key_size, key->base.inlined_uniform_values, zm->num_uniforms * sizeof(uint32_t))); } static uint32_t -keybox_hash(const void *void_key) +shader_module_hash(const struct zink_shader_module *zm) { - const struct keybox *key = void_key; - return _mesa_hash_data(&key->stage, key->size + sizeof(key->stage)); -} - -static bool -keybox_equals(const void *void_a, const void *void_b) -{ - const struct keybox *a = void_a, *b = void_b; - if (a->size != b->size) - return false; - - return memcmp(a->data, b->data, a->size) == 0; -} - -static void -shader_key_vs_gen(struct zink_context *ctx, struct zink_shader *zs, - struct zink_shader *shaders[ZINK_SHADER_COUNT], struct zink_shader_key *key) -{ - struct zink_vs_key *vs_key = &key->key.vs; - key->size = sizeof(struct zink_vs_key); - - 
vs_key->clip_halfz = ctx->rast_state->base.clip_halfz; - switch (zs->nir->info.stage) { - case MESA_SHADER_VERTEX: - vs_key->last_vertex_stage = !shaders[PIPE_SHADER_TESS_EVAL] && !shaders[PIPE_SHADER_GEOMETRY]; - vs_key->push_drawid = ctx->drawid_broken; - break; - case MESA_SHADER_TESS_EVAL: - vs_key->last_vertex_stage = !shaders[PIPE_SHADER_GEOMETRY]; - break; - case MESA_SHADER_GEOMETRY: - vs_key->last_vertex_stage = true; - break; - default: - unreachable("impossible case"); - } -} - -static void -shader_key_fs_gen(struct zink_context *ctx, struct zink_shader *zs, - struct zink_shader *shaders[ZINK_SHADER_COUNT], struct zink_shader_key *key) -{ - struct zink_screen *screen = zink_screen(ctx->base.screen); - struct zink_fs_key *fs_key = &key->key.fs; - key->size = sizeof(struct zink_fs_key); - - /* if gl_SampleMask[] is written to, we have to ensure that we get a shader with the same sample count: - * in GL, rast_samples==1 means ignore gl_SampleMask[] - * in VK, gl_SampleMask[] is never ignored - */ - if (zs->nir->info.outputs_written & (1 << FRAG_RESULT_SAMPLE_MASK)) - fs_key->samples = !!ctx->fb_state.samples; - fs_key->force_dual_color_blend = screen->driconf.dual_color_blend_by_location && - ctx->gfx_pipeline_state.blend_state->dual_src_blend && - ctx->gfx_pipeline_state.blend_state->attachments[1].blendEnable; - if (((shaders[PIPE_SHADER_GEOMETRY] && shaders[PIPE_SHADER_GEOMETRY]->nir->info.gs.output_primitive == GL_POINTS) || - ctx->gfx_prim_mode == PIPE_PRIM_POINTS) && ctx->rast_state->base.point_quad_rasterization && ctx->rast_state->base.sprite_coord_enable) { - fs_key->coord_replace_bits = ctx->rast_state->base.sprite_coord_enable; - fs_key->coord_replace_yinvert = !!ctx->rast_state->base.sprite_coord_mode; - } -} - -static void -shader_key_tcs_gen(struct zink_context *ctx, struct zink_shader *zs, - struct zink_shader *shaders[ZINK_SHADER_COUNT], struct zink_shader_key *key) -{ - struct zink_tcs_key *tcs_key = &key->key.tcs; - key->size = sizeof(struct zink_tcs_key); - - tcs_key->vertices_per_patch = ctx->gfx_pipeline_state.vertices_per_patch; - tcs_key->vs_outputs_written = shaders[PIPE_SHADER_VERTEX]->nir->info.outputs_written; -} - -typedef void (*zink_shader_key_gen)(struct zink_context *ctx, struct zink_shader *zs, - struct zink_shader *shaders[ZINK_SHADER_COUNT], - struct zink_shader_key *key); -static zink_shader_key_gen shader_key_vtbl[] = -{ - [MESA_SHADER_VERTEX] = shader_key_vs_gen, - [MESA_SHADER_TESS_CTRL] = shader_key_tcs_gen, - /* reusing vs key for now since we're only using clip_halfz */ - [MESA_SHADER_TESS_EVAL] = shader_key_vs_gen, - [MESA_SHADER_GEOMETRY] = shader_key_vs_gen, - [MESA_SHADER_FRAGMENT] = shader_key_fs_gen, -}; - -/* return pointer to make function reusable */ -static inline struct zink_shader_module ** -get_default_shader_module_ptr(struct zink_gfx_program *prog, struct zink_shader *zs, struct zink_shader_key *key) -{ - if (zs->nir->info.stage == MESA_SHADER_VERTEX || - zs->nir->info.stage == MESA_SHADER_TESS_EVAL) { - /* no streamout or halfz */ - if (!zink_vs_key(key)->last_vertex_stage) - return &prog->default_variants[zs->nir->info.stage][1]; - } - return &prog->default_variants[zs->nir->info.stage][0]; + unsigned key_size = zm->key_size + zm->num_uniforms * sizeof(uint32_t); + return _mesa_hash_data(zm->key, key_size); } static struct zink_shader_module * -get_shader_module_for_stage(struct zink_context *ctx, struct zink_shader *zs, struct zink_gfx_program *prog) +get_shader_module_for_stage(struct zink_context *ctx, struct zink_screen 
*screen, + struct zink_shader *zs, struct zink_gfx_program *prog, + struct zink_gfx_pipeline_state *state) { gl_shader_stage stage = zs->nir->info.stage; enum pipe_shader_type pstage = pipe_shader_type_from_mesa(stage); - struct zink_shader_key key = {0}; VkShaderModule mod; - struct zink_shader_module *zm; - struct zink_shader_module **default_zm = NULL; - struct keybox *keybox; - uint32_t hash; + struct zink_shader_module *zm = NULL; unsigned base_size = 0; + struct zink_shader_key *key = &state->shader_keys.key[pstage]; - shader_key_vtbl[stage](ctx, zs, ctx->gfx_stages, &key); - /* this is default variant if there is no default or it matches the default */ - if (prog->default_variant_key[pstage]) { - const struct keybox *tmp = prog->default_variant_key[pstage]; - /* if comparing against the existing default, use the base variant key size since - * we're only checking the stage-specific data - */ - key.is_default_variant = !memcmp(tmp->data, &key, key.size); - } else - key.is_default_variant = true; - - if (zs->nir->info.num_inlinable_uniforms && + if (ctx && zs->nir->info.num_inlinable_uniforms && ctx->inlinable_uniforms_valid_mask & BITFIELD64_BIT(pstage)) { - key.inline_uniforms = true; - memcpy(key.base.inlined_uniform_values, - ctx->inlinable_uniforms[pstage], - zs->nir->info.num_inlinable_uniforms * 4); - base_size = zs->nir->info.num_inlinable_uniforms * sizeof(uint32_t); - key.is_default_variant = false; + if (prog->inlined_variant_count[pstage] < ZINK_MAX_INLINED_VARIANTS) + base_size = zs->nir->info.num_inlinable_uniforms; + else + key->inline_uniforms = false; } - if (key.is_default_variant) { - default_zm = get_default_shader_module_ptr(prog, zs, &key); - if (*default_zm) - return *default_zm; - } - keybox = make_keybox(prog, stage, &key, key.size, &key.base, base_size); - hash = keybox_hash(keybox); - struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(&prog->base.shader_cache[pstage], - hash, keybox); - if (entry) { - ralloc_free(keybox); - zm = entry->data; - } else { - zm = CALLOC_STRUCT(zink_shader_module); + struct zink_shader_module *iter, *next; + LIST_FOR_EACH_ENTRY_SAFE(iter, next, &prog->shader_cache[pstage][!!base_size], list) { + if (!shader_key_matches(iter, key, base_size)) + continue; + list_delinit(&iter->list); + zm = iter; + break; + } + + if (!zm) { + zm = malloc(sizeof(struct zink_shader_module) + key->size + base_size * sizeof(uint32_t)); if (!zm) { - ralloc_free(keybox); return NULL; } - mod = zink_shader_compile(zink_screen(ctx->base.screen), zs, prog->nir[stage], &key); + mod = zink_shader_compile(screen, zs, prog->nir[stage], key); if (!mod) { - ralloc_free(keybox); FREE(zm); return NULL; } zm->shader = mod; - - _mesa_hash_table_insert_pre_hashed(&prog->base.shader_cache[pstage], hash, keybox, zm); - if (key.is_default_variant) { - /* previously returned */ - *default_zm = zm; - zm->default_variant = true; - prog->default_variant_key[pstage] = keybox; - } + list_inithead(&zm->list); + zm->num_uniforms = base_size; + zm->key_size = key->size; + memcpy(zm->key, key, key->size); + if (base_size) + memcpy(zm->key + key->size, &key->base, base_size * sizeof(uint32_t)); + zm->hash = shader_module_hash(zm); + zm->default_variant = !base_size && list_is_empty(&prog->shader_cache[pstage][0]); + if (base_size) + prog->inlined_variant_count[pstage]++; } + list_add(&zm->list, &prog->shader_cache[pstage][!!base_size]); return zm; } static void zink_destroy_shader_module(struct zink_screen *screen, struct zink_shader_module *zm) { - 
vkDestroyShaderModule(screen->dev, zm->shader, NULL); + VKSCR(DestroyShaderModule)(screen->dev, zm->shader, NULL); free(zm); } static void -destroy_shader_cache(struct zink_screen *screen, struct hash_table *sc) +destroy_shader_cache(struct zink_screen *screen, struct list_head *sc) { - hash_table_foreach(sc, entry) { - struct zink_shader_module *zm = entry->data; + struct zink_shader_module *zm, *next; + LIST_FOR_EACH_ENTRY_SAFE(zm, next, sc, list) { + list_delinit(&zm->list); zink_destroy_shader_module(screen, zm); } } static void -update_shader_modules(struct zink_context *ctx, struct zink_gfx_program *prog) +update_shader_modules(struct zink_context *ctx, + struct zink_screen *screen, + struct zink_gfx_program *prog, uint32_t mask, + struct zink_gfx_pipeline_state *state) { bool hash_changed = false; bool default_variants = true; - bool first = !!prog->modules[PIPE_SHADER_VERTEX]; - u_foreach_bit(pstage, ctx->dirty_shader_stages & prog->stages_present) { + bool first = !prog->modules[PIPE_SHADER_VERTEX]; + uint32_t variant_hash = prog->last_variant_hash; + u_foreach_bit(pstage, mask) { assert(prog->shaders[pstage]); - struct zink_shader_module *zm = get_shader_module_for_stage(ctx, prog->shaders[pstage], prog); - if (prog->modules[pstage] != zm) - hash_changed = true; + struct zink_shader_module *zm = get_shader_module_for_stage(ctx, screen, prog->shaders[pstage], prog, state); + if (prog->modules[pstage] == zm) + continue; + if (prog->modules[pstage]) + variant_hash ^= prog->modules[pstage]->hash; + hash_changed = true; default_variants &= zm->default_variant; prog->modules[pstage] = zm; - ctx->gfx_pipeline_state.modules[pstage] = zm->shader; + variant_hash ^= prog->modules[pstage]->hash; + state->modules[pstage] = zm->shader; } - if (hash_changed) { + if (hash_changed && state) { + if (!first && likely(state->pipeline)) //avoid on first hash + state->final_hash ^= prog->last_variant_hash; + if (default_variants && !first) prog->last_variant_hash = prog->default_variant_hash; - else - prog->last_variant_hash = _mesa_hash_data(ctx->gfx_pipeline_state.modules, sizeof(ctx->gfx_pipeline_state.modules)); - ctx->gfx_pipeline_state.combined_dirty = true; + else { + prog->last_variant_hash = variant_hash; + if (first) { + p_atomic_dec(&prog->base.reference.count); + prog->default_variant_hash = prog->last_variant_hash; + } + } + + state->final_hash ^= prog->last_variant_hash; + state->modules_changed = true; } - ctx->gfx_pipeline_state.module_hash = prog->last_variant_hash; - ctx->dirty_shader_stages &= ~u_bit_consecutive(PIPE_SHADER_VERTEX, 5); } static uint32_t @@ -307,9 +202,11 @@ hash_gfx_pipeline_state(const void *key) { const struct zink_gfx_pipeline_state *state = key; uint32_t hash = _mesa_hash_data(key, offsetof(struct zink_gfx_pipeline_state, hash)); + if (!state->have_EXT_extended_dynamic_state2) + hash = XXH32(&state->primitive_restart, 1, hash); if (state->have_EXT_extended_dynamic_state) return hash; - return XXH32(&state->depth_stencil_alpha_state, sizeof(void*), hash); + return XXH32(&state->dyn_state1, sizeof(state->dyn_state1), hash); } static bool @@ -329,10 +226,16 @@ equals_gfx_pipeline_state(const void *a, const void *b) if (sa->vertex_strides[idx_a] != sb->vertex_strides[idx_b]) return false; } - if (sa->front_face != sb->front_face) + if (sa->dyn_state1.front_face != sb->dyn_state1.front_face) return false; - if (!!sa->depth_stencil_alpha_state != !!sb->depth_stencil_alpha_state || - memcmp(sa->depth_stencil_alpha_state, sb->depth_stencil_alpha_state, sizeof(struct 
zink_depth_stencil_alpha_hw_state))) + if (!!sa->dyn_state1.depth_stencil_alpha_state != !!sb->dyn_state1.depth_stencil_alpha_state || + (sa->dyn_state1.depth_stencil_alpha_state && + memcmp(sa->dyn_state1.depth_stencil_alpha_state, sb->dyn_state1.depth_stencil_alpha_state, + sizeof(struct zink_depth_stencil_alpha_hw_state)))) + return false; + } + if (!sa->have_EXT_extended_dynamic_state2) { + if (sa->primitive_restart != sb->primitive_restart) return false; } return !memcmp(sa->modules, sb->modules, sizeof(sa->modules)) && @@ -342,11 +245,11 @@ equals_gfx_pipeline_state(const void *a, const void *b) void zink_update_gfx_program(struct zink_context *ctx, struct zink_gfx_program *prog) { - update_shader_modules(ctx, prog); + update_shader_modules(ctx, zink_screen(ctx->base.screen), prog, ctx->dirty_shader_stages & prog->stages_present, &ctx->gfx_pipeline_state); } VkPipelineLayout -zink_pipeline_layout_create(struct zink_screen *screen, struct zink_program *pg) +zink_pipeline_layout_create(struct zink_screen *screen, struct zink_program *pg, uint32_t *compat) { VkPipelineLayoutCreateInfo plci = {0}; plci.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; @@ -374,11 +277,13 @@ zink_pipeline_layout_create(struct zink_screen *screen, struct zink_program *pg) plci.pPushConstantRanges = &pcr[0]; VkPipelineLayout layout; - if (vkCreatePipelineLayout(screen->dev, &plci, NULL, &layout) != VK_SUCCESS) { + if (VKSCR(CreatePipelineLayout)(screen->dev, &plci, NULL, &layout) != VK_SUCCESS) { debug_printf("vkCreatePipelineLayout failed!\n"); return VK_NULL_HANDLE; } + *compat = _mesa_hash_data(pg->dsl, pg->num_dsl * sizeof(pg->dsl[0])); + return layout; } @@ -410,7 +315,8 @@ assign_io(struct zink_gfx_program *prog, struct zink_shader *stages[ZINK_SHADER_ struct zink_gfx_program * zink_create_gfx_program(struct zink_context *ctx, - struct zink_shader *stages[ZINK_SHADER_COUNT]) + struct zink_shader *stages[ZINK_SHADER_COUNT], + unsigned vertices_per_patch) { struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_gfx_program *prog = rzalloc(NULL, struct zink_gfx_program); @@ -420,8 +326,9 @@ zink_create_gfx_program(struct zink_context *ctx, pipe_reference_init(&prog->base.reference, 1); for (int i = 0; i < ZINK_SHADER_COUNT; ++i) { + list_inithead(&prog->shader_cache[i][0]); + list_inithead(&prog->shader_cache[i][1]); if (stages[i]) { - _mesa_hash_table_init(&prog->base.shader_cache[i], prog, keybox_hash, keybox_equals); prog->shaders[i] = stages[i]; prog->stages_present |= BITFIELD_BIT(i); } @@ -429,26 +336,12 @@ zink_create_gfx_program(struct zink_context *ctx, if (stages[PIPE_SHADER_TESS_EVAL] && !stages[PIPE_SHADER_TESS_CTRL]) { prog->shaders[PIPE_SHADER_TESS_EVAL]->generated = prog->shaders[PIPE_SHADER_TESS_CTRL] = - zink_shader_tcs_create(ctx, stages[PIPE_SHADER_VERTEX]); - _mesa_hash_table_init(&prog->base.shader_cache[PIPE_SHADER_TESS_CTRL], prog, keybox_hash, keybox_equals); + zink_shader_tcs_create(screen, stages[PIPE_SHADER_VERTEX], vertices_per_patch); prog->stages_present |= BITFIELD_BIT(PIPE_SHADER_TESS_CTRL); } - /* always force shader creation during init */ - ctx->dirty_shader_stages |= prog->stages_present; assign_io(prog, prog->shaders); - update_shader_modules(ctx, prog); - prog->default_variant_hash = ctx->gfx_pipeline_state.module_hash; - - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - prog->pipelines[i] = _mesa_hash_table_create(NULL, - NULL, - equals_gfx_pipeline_state); - if (!prog->pipelines[i]) - goto fail; - } - if (stages[PIPE_SHADER_GEOMETRY]) 
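      /* the last geometry-processing stage wins: GS, then TES, then VS */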
prog->last_vertex_stage = stages[PIPE_SHADER_GEOMETRY]; else if (stages[PIPE_SHADER_TESS_EVAL]) @@ -456,19 +349,26 @@ zink_create_gfx_program(struct zink_context *ctx, else prog->last_vertex_stage = stages[PIPE_SHADER_VERTEX]; + for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { + _mesa_hash_table_init(&prog->pipelines[i], prog, NULL, equals_gfx_pipeline_state); + /* only need first 3/4 for point/line/tri/patch */ + if (screen->info.have_EXT_extended_dynamic_state && + i == (prog->last_vertex_stage->nir->info.stage == MESA_SHADER_TESS_EVAL ? 4 : 3)) + break; + } + struct mesa_sha1 sctx; _mesa_sha1_init(&sctx); for (int i = 0; i < ZINK_SHADER_COUNT; ++i) { - if (prog->modules[i]) { + if (prog->shaders[i]) { simple_mtx_lock(&prog->shaders[i]->lock); _mesa_set_add(prog->shaders[i]->programs, prog); simple_mtx_unlock(&prog->shaders[i]->lock); - zink_gfx_program_reference(screen, NULL, prog); + zink_gfx_program_reference(ctx, NULL, prog); _mesa_sha1_update(&sctx, prog->shaders[i]->base.sha1, sizeof(prog->shaders[i]->base.sha1)); } } _mesa_sha1_final(&sctx, prog->base.sha1); - p_atomic_dec(&prog->base.reference.count); if (!screen->descriptor_program_init(ctx, &prog->base)) goto fail; @@ -478,7 +378,7 @@ zink_create_gfx_program(struct zink_context *ctx, fail: if (prog) - zink_destroy_gfx_program(screen, prog); + zink_destroy_gfx_program(ctx, prog); return NULL; } @@ -531,16 +431,11 @@ zink_create_compute_program(struct zink_context *ctx, struct zink_shader *shader pipe_reference_init(&comp->base.reference, 1); comp->base.is_compute = true; - /* TODO: cs shader keys placeholder for now */ - _mesa_hash_table_init(&comp->base.shader_cache[0], comp, _mesa_hash_pointer, _mesa_key_pointer_equal); comp->module = CALLOC_STRUCT(zink_shader_module); assert(comp->module); comp->module->shader = zink_shader_compile(screen, shader, shader->nir, NULL); assert(comp->module->shader); - _mesa_hash_table_insert(&comp->base.shader_cache[0], shader, comp->module); - - ctx->dirty_shader_stages &= ~(1 << PIPE_SHADER_COMPUTE); comp->pipelines = _mesa_hash_table_create(NULL, hash_compute_pipeline_state, equals_compute_pipeline_state); @@ -557,7 +452,7 @@ zink_create_compute_program(struct zink_context *ctx, struct zink_shader *shader fail: if (comp) - zink_destroy_compute_program(screen, comp); + zink_destroy_compute_program(ctx, comp); return NULL; } @@ -663,42 +558,57 @@ zink_program_num_bindings(const struct zink_program *pg, bool is_compute) } void -zink_destroy_gfx_program(struct zink_screen *screen, +zink_destroy_gfx_program(struct zink_context *ctx, struct zink_gfx_program *prog) { + struct zink_screen *screen = zink_screen(ctx->base.screen); if (prog->base.layout) - vkDestroyPipelineLayout(screen->dev, prog->base.layout, NULL); + VKSCR(DestroyPipelineLayout)(screen->dev, prog->base.layout, NULL); for (int i = 0; i < ZINK_SHADER_COUNT; ++i) { if (prog->shaders[i]) { _mesa_set_remove_key(prog->shaders[i]->programs, prog); prog->shaders[i] = NULL; - destroy_shader_cache(screen, &prog->base.shader_cache[i]); } + destroy_shader_cache(screen, &prog->shader_cache[i][0]); + destroy_shader_cache(screen, &prog->shader_cache[i][1]); + ralloc_free(prog->nir[i]); } - for (int i = 0; i < ARRAY_SIZE(prog->pipelines); ++i) { - hash_table_foreach(prog->pipelines[i], entry) { + unsigned max_idx = ARRAY_SIZE(prog->pipelines); + if (screen->info.have_EXT_extended_dynamic_state) { + /* only need first 3/4 for point/line/tri/patch */ + if ((prog->stages_present & + (BITFIELD_BIT(PIPE_SHADER_TESS_EVAL) | 
BITFIELD_BIT(PIPE_SHADER_GEOMETRY))) == + BITFIELD_BIT(PIPE_SHADER_TESS_EVAL)) + max_idx = 4; + else + max_idx = 3; + max_idx++; + } + + for (int i = 0; i < max_idx; ++i) { + hash_table_foreach(&prog->pipelines[i], entry) { struct gfx_pipeline_cache_entry *pc_entry = entry->data; - vkDestroyPipeline(screen->dev, pc_entry->pipeline, NULL); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); free(pc_entry); } - _mesa_hash_table_destroy(prog->pipelines[i], NULL); } if (prog->base.pipeline_cache) - vkDestroyPipelineCache(screen->dev, prog->base.pipeline_cache, NULL); - screen->descriptor_program_deinit(screen, &prog->base); + VKSCR(DestroyPipelineCache)(screen->dev, prog->base.pipeline_cache, NULL); + screen->descriptor_program_deinit(ctx, &prog->base); ralloc_free(prog); } void -zink_destroy_compute_program(struct zink_screen *screen, - struct zink_compute_program *comp) +zink_destroy_compute_program(struct zink_context *ctx, + struct zink_compute_program *comp) { + struct zink_screen *screen = zink_screen(ctx->base.screen); if (comp->base.layout) - vkDestroyPipelineLayout(screen->dev, comp->base.layout, NULL); + VKSCR(DestroyPipelineLayout)(screen->dev, comp->base.layout, NULL); if (comp->shader) _mesa_set_remove_key(comp->shader->programs, comp); @@ -706,59 +616,42 @@ zink_destroy_compute_program(struct zink_screen *screen, hash_table_foreach(comp->pipelines, entry) { struct compute_pipeline_cache_entry *pc_entry = entry->data; - vkDestroyPipeline(screen->dev, pc_entry->pipeline, NULL); + VKSCR(DestroyPipeline)(screen->dev, pc_entry->pipeline, NULL); free(pc_entry); } _mesa_hash_table_destroy(comp->pipelines, NULL); - destroy_shader_cache(screen, &comp->base.shader_cache[0]); + VKSCR(DestroyShaderModule)(screen->dev, comp->module->shader, NULL); + free(comp->module); if (comp->base.pipeline_cache) - vkDestroyPipelineCache(screen->dev, comp->base.pipeline_cache, NULL); - screen->descriptor_program_deinit(screen, &comp->base); + VKSCR(DestroyPipelineCache)(screen->dev, comp->base.pipeline_cache, NULL); + screen->descriptor_program_deinit(ctx, &comp->base); ralloc_free(comp); } -static VkPrimitiveTopology -primitive_topology(enum pipe_prim_type mode) +static unsigned +get_pipeline_idx(bool have_EXT_extended_dynamic_state, enum pipe_prim_type mode, VkPrimitiveTopology vkmode) { - switch (mode) { - case PIPE_PRIM_POINTS: - return VK_PRIMITIVE_TOPOLOGY_POINT_LIST; - - case PIPE_PRIM_LINES: - return VK_PRIMITIVE_TOPOLOGY_LINE_LIST; - - case PIPE_PRIM_LINE_STRIP: - return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP; - - case PIPE_PRIM_TRIANGLES: - return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; - - case PIPE_PRIM_TRIANGLE_STRIP: - return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP; - - case PIPE_PRIM_TRIANGLE_FAN: - return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN; - - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY; - - case PIPE_PRIM_LINES_ADJACENCY: - return VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY; - - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY; - - case PIPE_PRIM_TRIANGLES_ADJACENCY: - return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY; - - case PIPE_PRIM_PATCHES: - return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; - - default: - unreachable("unexpected enum pipe_prim_type"); + /* VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT specifies that the topology state in + * VkPipelineInputAssemblyStateCreateInfo only specifies the topology class, + * and the specific topology order and adjacency must be set 
dynamically + * with vkCmdSetPrimitiveTopologyEXT before any drawing commands. + */ + if (have_EXT_extended_dynamic_state) { + if (mode == PIPE_PRIM_PATCHES) + return 3; + switch (u_reduced_prim(mode)) { + case PIPE_PRIM_POINTS: + return 0; + case PIPE_PRIM_LINES: + return 1; + default: + return 2; + } } + return vkmode; } + VkPipeline zink_get_gfx_pipeline(struct zink_context *ctx, @@ -768,48 +661,48 @@ zink_get_gfx_pipeline(struct zink_context *ctx, { struct zink_screen *screen = zink_screen(ctx->base.screen); const bool have_EXT_vertex_input_dynamic_state = screen->info.have_EXT_vertex_input_dynamic_state; - if (!state->dirty && !state->combined_dirty && mode == state->mode && - (have_EXT_vertex_input_dynamic_state || !ctx->vertex_state_changed)) - return state->pipeline; + const bool have_EXT_extended_dynamic_state = screen->info.have_EXT_extended_dynamic_state; - VkPrimitiveTopology vkmode = primitive_topology(mode); - assert(vkmode <= ARRAY_SIZE(prog->pipelines)); + VkPrimitiveTopology vkmode = zink_primitive_topology(mode); + const unsigned idx = get_pipeline_idx(screen->info.have_EXT_extended_dynamic_state, mode, vkmode); + assert(idx <= ARRAY_SIZE(prog->pipelines)); + if (!state->dirty && !state->modules_changed && + (have_EXT_vertex_input_dynamic_state || !ctx->vertex_state_changed) && + idx == state->idx) + return state->pipeline; struct hash_entry *entry = NULL; if (state->dirty) { - if (!have_EXT_vertex_input_dynamic_state) - ctx->vertex_state_changed = true; - state->combined_dirty = true; + if (state->pipeline) //avoid on first hash + state->final_hash ^= state->hash; state->hash = hash_gfx_pipeline_state(state); + state->final_hash ^= state->hash; state->dirty = false; } - if (state->combined_dirty) { - if (!have_EXT_vertex_input_dynamic_state) - ctx->vertex_state_changed = true; - state->combined_hash = XXH32(&state->module_hash, sizeof(uint32_t), state->hash); - state->combined_dirty = false; - } - if (have_EXT_vertex_input_dynamic_state) - state->final_hash = state->combined_hash; - else - if (ctx->vertex_state_changed) { - uint32_t hash = state->combined_hash; - if (!state->have_EXT_extended_dynamic_state) { - /* if we don't have dynamic states, we have to hash the enabled vertex buffer bindings */ - uint32_t vertex_buffers_enabled_mask = state->vertex_buffers_enabled_mask; - hash = XXH32(&vertex_buffers_enabled_mask, sizeof(uint32_t), hash); + if (!have_EXT_vertex_input_dynamic_state && ctx->vertex_state_changed) { + if (state->pipeline) + state->final_hash ^= state->vertex_hash; + if (!have_EXT_extended_dynamic_state) { + uint32_t hash = 0; + /* if we don't have dynamic states, we have to hash the enabled vertex buffer bindings */ + uint32_t vertex_buffers_enabled_mask = state->vertex_buffers_enabled_mask; + hash = XXH32(&vertex_buffers_enabled_mask, sizeof(uint32_t), hash); - for (unsigned i = 0; i < state->element_state->num_bindings; i++) { - struct pipe_vertex_buffer *vb = ctx->vertex_buffers + ctx->element_state->binding_map[i]; - state->vertex_strides[i] = vb->buffer.resource ? vb->stride : 0; - hash = XXH32(&state->vertex_strides[i], sizeof(uint32_t), hash); - } + for (unsigned i = 0; i < state->element_state->num_bindings; i++) { + struct pipe_vertex_buffer *vb = ctx->vertex_buffers + ctx->element_state->binding_map[i]; + state->vertex_strides[i] = vb->buffer.resource ? 
vb->stride : 0; + hash = XXH32(&state->vertex_strides[i], sizeof(uint32_t), hash); } - state->final_hash = XXH32(&state->element_state, sizeof(void*), hash); - ctx->vertex_state_changed = false; - } - entry = _mesa_hash_table_search_pre_hashed(prog->pipelines[vkmode], state->final_hash, state); + state->vertex_hash = hash ^ state->element_state->hash; + } else + state->vertex_hash = state->element_state->hash; + state->final_hash ^= state->vertex_hash; + } + state->modules_changed = false; + ctx->vertex_state_changed = false; + + entry = _mesa_hash_table_search_pre_hashed(&prog->pipelines[idx], state->final_hash, state); if (!entry) { util_queue_fence_wait(&prog->base.cache_fence); @@ -818,6 +711,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx, if (pipeline == VK_NULL_HANDLE) return VK_NULL_HANDLE; + zink_screen_update_pipeline_cache(screen, &prog->base); struct gfx_pipeline_cache_entry *pc_entry = CALLOC_STRUCT(gfx_pipeline_cache_entry); if (!pc_entry) return VK_NULL_HANDLE; @@ -825,13 +719,13 @@ zink_get_gfx_pipeline(struct zink_context *ctx, memcpy(&pc_entry->state, state, sizeof(*state)); pc_entry->pipeline = pipeline; - entry = _mesa_hash_table_insert_pre_hashed(prog->pipelines[vkmode], state->final_hash, state, pc_entry); + entry = _mesa_hash_table_insert_pre_hashed(&prog->pipelines[idx], state->final_hash, pc_entry, pc_entry); assert(entry); } struct gfx_pipeline_cache_entry *cache_entry = entry->data; state->pipeline = cache_entry->pipeline; - state->mode = mode; + state->idx = idx; return state->pipeline; } @@ -864,7 +758,7 @@ zink_get_compute_pipeline(struct zink_screen *screen, memcpy(&pc_entry->state, state, sizeof(*state)); pc_entry->pipeline = pipeline; - entry = _mesa_hash_table_insert_pre_hashed(comp->pipelines, state->hash, state, pc_entry); + entry = _mesa_hash_table_insert_pre_hashed(comp->pipelines, state->hash, pc_entry, pc_entry); assert(entry); } @@ -877,31 +771,65 @@ static inline void bind_stage(struct zink_context *ctx, enum pipe_shader_type stage, struct zink_shader *shader) { + if (shader && shader->nir->info.num_inlinable_uniforms) + ctx->shader_has_inlinable_uniforms_mask |= 1 << stage; + else + ctx->shader_has_inlinable_uniforms_mask &= ~(1 << stage); + if (stage == PIPE_SHADER_COMPUTE) { if (shader && shader != ctx->compute_stage) { - struct hash_entry *entry = _mesa_hash_table_search(ctx->compute_program_cache, shader); + struct hash_entry *entry = _mesa_hash_table_search(&ctx->compute_program_cache, shader); if (entry) { ctx->compute_pipeline_state.dirty = true; ctx->curr_compute = entry->data; - } else - ctx->dirty_shader_stages |= 1 << stage; + } else { + struct zink_compute_program *comp = zink_create_compute_program(ctx, shader); + _mesa_hash_table_insert(&ctx->compute_program_cache, comp->shader, comp); + ctx->compute_pipeline_state.dirty = true; + ctx->curr_compute = comp; + zink_batch_reference_program(&ctx->batch, &ctx->curr_compute->base); + } } else if (!shader) ctx->curr_compute = NULL; ctx->compute_stage = shader; zink_select_launch_grid(ctx); } else { + if (ctx->gfx_stages[stage]) + ctx->gfx_hash ^= ctx->gfx_stages[stage]->hash; ctx->gfx_stages[stage] = shader; - ctx->gfx_pipeline_state.combined_dirty = true; - if (!shader) { + ctx->gfx_dirty = ctx->gfx_stages[PIPE_SHADER_FRAGMENT] && ctx->gfx_stages[PIPE_SHADER_VERTEX]; + ctx->gfx_pipeline_state.modules_changed = true; + if (shader) { + ctx->shader_stages |= BITFIELD_BIT(stage); + ctx->gfx_hash ^= ctx->gfx_stages[stage]->hash; + } else { ctx->gfx_pipeline_state.modules[stage] = 
VK_NULL_HANDLE; + if (ctx->curr_program) + ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; ctx->curr_program = NULL; + ctx->shader_stages &= ~BITFIELD_BIT(stage); } - ctx->dirty_shader_stages |= 1 << stage; } - if (shader && shader->nir->info.num_inlinable_uniforms) - ctx->shader_has_inlinable_uniforms_mask |= 1 << stage; +} + +static void +bind_last_vertex_stage(struct zink_context *ctx) +{ + enum pipe_shader_type old = ctx->last_vertex_stage ? pipe_shader_type_from_mesa(ctx->last_vertex_stage->nir->info.stage) : PIPE_SHADER_TYPES; + if (ctx->gfx_stages[PIPE_SHADER_GEOMETRY]) + ctx->last_vertex_stage = ctx->gfx_stages[PIPE_SHADER_GEOMETRY]; + else if (ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]) + ctx->last_vertex_stage = ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]; else - ctx->shader_has_inlinable_uniforms_mask &= ~(1 << stage); + ctx->last_vertex_stage = ctx->gfx_stages[PIPE_SHADER_VERTEX]; + enum pipe_shader_type current = ctx->last_vertex_stage ? pipe_shader_type_from_mesa(ctx->last_vertex_stage->nir->info.stage) : PIPE_SHADER_VERTEX; + if (old != current) { + if (old != PIPE_SHADER_TYPES) { + memset(&ctx->gfx_pipeline_state.shader_keys.key[old].key.vs_base, 0, sizeof(struct zink_vs_key_base)); + ctx->dirty_shader_stages |= BITFIELD_BIT(old); + } + ctx->last_vertex_stage_dirty = true; + } } static void @@ -909,11 +837,10 @@ zink_bind_vs_state(struct pipe_context *pctx, void *cso) { struct zink_context *ctx = zink_context(pctx); + if (!cso && !ctx->gfx_stages[PIPE_SHADER_VERTEX]) + return; + void *prev = ctx->gfx_stages[PIPE_SHADER_VERTEX]; bind_stage(ctx, PIPE_SHADER_VERTEX, cso); - if (!ctx->gfx_stages[PIPE_SHADER_GEOMETRY] && - !ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]) { - ctx->last_vertex_stage = cso; - } if (cso) { struct zink_shader *zs = cso; ctx->shader_reads_drawid = BITSET_TEST(zs->nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID); @@ -922,13 +849,30 @@ zink_bind_vs_state(struct pipe_context *pctx, ctx->shader_reads_drawid = false; ctx->shader_reads_basevertex = false; } + if (ctx->last_vertex_stage == prev) + ctx->last_vertex_stage = cso; + } static void zink_bind_fs_state(struct pipe_context *pctx, void *cso) { - bind_stage(zink_context(pctx), PIPE_SHADER_FRAGMENT, cso); + struct zink_context *ctx = zink_context(pctx); + if (!cso && !ctx->gfx_stages[PIPE_SHADER_FRAGMENT]) + return; + bind_stage(ctx, PIPE_SHADER_FRAGMENT, cso); + ctx->fbfetch_outputs = 0; + if (cso) { + nir_shader *nir = ctx->gfx_stages[PIPE_SHADER_FRAGMENT]->nir; + if (nir->info.fs.uses_fbfetch_output) { + nir_foreach_shader_out_variable(var, ctx->gfx_stages[PIPE_SHADER_FRAGMENT]->nir) { + if (var->data.fb_fetch_output) + ctx->fbfetch_outputs |= BITFIELD_BIT(var->data.location - FRAG_RESULT_DATA0); + } + } + } + zink_update_fbfetch(ctx); } static void @@ -936,17 +880,17 @@ zink_bind_gs_state(struct pipe_context *pctx, void *cso) { struct zink_context *ctx = zink_context(pctx); - if (!!ctx->gfx_stages[PIPE_SHADER_GEOMETRY] != !!cso) - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_VERTEX) | - BITFIELD_BIT(PIPE_SHADER_TESS_EVAL); + if (!cso && !ctx->gfx_stages[PIPE_SHADER_GEOMETRY]) + return; + bool had_points = ctx->gfx_stages[PIPE_SHADER_GEOMETRY] ? 
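/* zink_bind_fs_state() above caches which fragment outputs use framebuffer fetch
 * as a bitmask, so later renderpass code can mark those color buffers as input
 * attachments without rescanning the NIR. The scan on its own, as a minimal
 * sketch (nir_foreach_shader_out_variable and BITFIELD_BIT are existing Mesa
 * helpers):
 */
static uint32_t
collect_fbfetch_outputs_sketch(nir_shader *nir)
{
   uint32_t mask = 0;
   nir_foreach_shader_out_variable(var, nir) {
      if (var->data.fb_fetch_output)
         mask |= BITFIELD_BIT(var->data.location - FRAG_RESULT_DATA0);
   }
   return mask;
}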
ctx->gfx_stages[PIPE_SHADER_GEOMETRY]->nir->info.gs.output_primitive == GL_POINTS : false; bind_stage(ctx, PIPE_SHADER_GEOMETRY, cso); - if (cso) - ctx->last_vertex_stage = cso; - else { - if (ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]) - ctx->last_vertex_stage = ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]; - else - ctx->last_vertex_stage = ctx->gfx_stages[PIPE_SHADER_VERTEX]; + bind_last_vertex_stage(ctx); + if (cso) { + if (!had_points && ctx->last_vertex_stage->nir->info.gs.output_primitive == GL_POINTS) + ctx->gfx_pipeline_state.has_points++; + } else { + if (had_points) + ctx->gfx_pipeline_state.has_points--; } } @@ -962,21 +906,17 @@ zink_bind_tes_state(struct pipe_context *pctx, void *cso) { struct zink_context *ctx = zink_context(pctx); + if (!cso && !ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]) + return; if (!!ctx->gfx_stages[PIPE_SHADER_TESS_EVAL] != !!cso) { if (!cso) { /* if unsetting a TESS that uses a generated TCS, ensure the TCS is unset */ if (ctx->gfx_stages[PIPE_SHADER_TESS_EVAL]->generated) ctx->gfx_stages[PIPE_SHADER_TESS_CTRL] = NULL; } - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_VERTEX); } bind_stage(ctx, PIPE_SHADER_TESS_EVAL, cso); - if (!ctx->gfx_stages[PIPE_SHADER_GEOMETRY]) { - if (cso) - ctx->last_vertex_stage = cso; - else - ctx->last_vertex_stage = ctx->gfx_stages[PIPE_SHADER_VERTEX]; - } + bind_last_vertex_stage(ctx); } static void * diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_program.h b/mesa 3D driver/src/gallium/drivers/zink/zink_program.h index eb77c828ec..7de44385c1 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_program.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_program.h @@ -64,8 +64,13 @@ struct zink_cs_push_constant { * allowing us to skip going through shader keys */ struct zink_shader_module { + struct list_head list; VkShaderModule shader; + uint32_t hash; bool default_variant; + uint8_t num_uniforms; + uint8_t key_size; + uint8_t key[0]; /* | key | uniforms | */ }; struct zink_program { @@ -79,14 +84,16 @@ struct zink_program { struct zink_program_descriptor_data *dd; + uint32_t compat_id; VkPipelineLayout layout; - VkDescriptorSetLayout dsl[ZINK_DESCRIPTOR_TYPES + 1]; // one for each type + push + VkDescriptorSetLayout dsl[ZINK_DESCRIPTOR_TYPES + 2]; // one for each type + push + bindless unsigned num_dsl; - /* the shader cache stores a mapping of zink_shader_key::VkShaderModule */ - struct hash_table shader_cache[ZINK_SHADER_COUNT]; + bool removed; }; +#define ZINK_MAX_INLINED_VARIANTS 5 + struct zink_gfx_program { struct zink_program base; @@ -95,12 +102,13 @@ struct zink_gfx_program { struct zink_shader_module *modules[ZINK_SHADER_COUNT]; // compute stage doesn't belong here - struct zink_shader_module *default_variants[ZINK_SHADER_COUNT][2]; //[default, no streamout] - const void *default_variant_key[ZINK_SHADER_COUNT]; struct zink_shader *last_vertex_stage; + struct list_head shader_cache[ZINK_SHADER_COUNT][2]; //normal, inline uniforms + unsigned inlined_variant_count[ZINK_SHADER_COUNT]; + struct zink_shader *shaders[ZINK_SHADER_COUNT]; - struct hash_table *pipelines[11]; // number of draw modes we support + struct hash_table pipelines[11]; // number of draw modes we support uint32_t default_variant_hash; uint32_t last_variant_hash; }; @@ -133,6 +141,48 @@ zink_desc_type_from_vktype(VkDescriptorType type) } } +static inline VkPrimitiveTopology +zink_primitive_topology(enum pipe_prim_type mode) +{ + switch (mode) { + case PIPE_PRIM_POINTS: + return VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + + case PIPE_PRIM_LINES: + 
return VK_PRIMITIVE_TOPOLOGY_LINE_LIST; + + case PIPE_PRIM_LINE_STRIP: + return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP; + + case PIPE_PRIM_TRIANGLES: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + + case PIPE_PRIM_TRIANGLE_STRIP: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP; + + case PIPE_PRIM_TRIANGLE_FAN: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN; + + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY; + + case PIPE_PRIM_LINES_ADJACENCY: + return VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY; + + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY; + + case PIPE_PRIM_TRIANGLES_ADJACENCY: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY; + + case PIPE_PRIM_PATCHES: + return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; + + default: + unreachable("unexpected enum pipe_prim_type"); + } +} + void zink_delete_shader_state(struct pipe_context *pctx, void *cso); void * @@ -152,10 +202,11 @@ zink_update_gfx_program(struct zink_context *ctx, struct zink_gfx_program *prog) struct zink_gfx_program * zink_create_gfx_program(struct zink_context *ctx, - struct zink_shader *stages[ZINK_SHADER_COUNT]); + struct zink_shader *stages[ZINK_SHADER_COUNT], + unsigned vertices_per_patch); void -zink_destroy_gfx_program(struct zink_screen *screen, +zink_destroy_gfx_program(struct zink_context *ctx, struct zink_gfx_program *prog); VkPipeline @@ -174,7 +225,7 @@ void debug_describe_zink_gfx_program(char* buf, const struct zink_gfx_program *ptr); static inline bool -zink_gfx_program_reference(struct zink_screen *screen, +zink_gfx_program_reference(struct zink_context *ctx, struct zink_gfx_program **dst, struct zink_gfx_program *src) { @@ -183,7 +234,7 @@ zink_gfx_program_reference(struct zink_screen *screen, if (pipe_reference_described(old_dst ? &old_dst->base.reference : NULL, &src->base.reference, (debug_reference_descriptor)debug_describe_zink_gfx_program)) { - zink_destroy_gfx_program(screen, old_dst); + zink_destroy_gfx_program(ctx, old_dst); ret = true; } if (dst) *dst = src; @@ -193,14 +244,14 @@ zink_gfx_program_reference(struct zink_screen *screen, struct zink_compute_program * zink_create_compute_program(struct zink_context *ctx, struct zink_shader *shader); void -zink_destroy_compute_program(struct zink_screen *screen, - struct zink_compute_program *comp); +zink_destroy_compute_program(struct zink_context *ctx, + struct zink_compute_program *comp); void debug_describe_zink_compute_program(char* buf, const struct zink_compute_program *ptr); static inline bool -zink_compute_program_reference(struct zink_screen *screen, +zink_compute_program_reference(struct zink_context *ctx, struct zink_compute_program **dst, struct zink_compute_program *src) { @@ -209,7 +260,7 @@ zink_compute_program_reference(struct zink_screen *screen, if (pipe_reference_described(old_dst ? 
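/* zink_shader_module above now ends in a flexible array member laid out as
 * | key | uniforms |, so one allocation holds the VkShaderModule handle plus the
 * variable-size key data used to match shader variants. A hypothetical
 * constructor showing that layout (zm_create_sketch is not driver code, and
 * 32-bit inlined-uniform words are an assumption; the real hash/list/shader
 * fields would be filled by the caller):
 */
#include <stdlib.h>
#include <string.h>

static struct zink_shader_module *
zm_create_sketch(const void *key, uint8_t key_size,
                 const uint32_t *uniforms, uint8_t num_uniforms)
{
   struct zink_shader_module *zm =
      malloc(sizeof(struct zink_shader_module) + key_size +
             num_uniforms * sizeof(uint32_t));
   if (!zm)
      return NULL;
   zm->key_size = key_size;
   zm->num_uniforms = num_uniforms;
   memcpy(zm->key, key, key_size);                 /* | key | ...              */
   memcpy(zm->key + key_size, uniforms,
          num_uniforms * sizeof(uint32_t));        /* ... | uniforms |         */
   return zm;
}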
&old_dst->base.reference : NULL, &src->base.reference, (debug_reference_descriptor)debug_describe_zink_compute_program)) { - zink_destroy_compute_program(screen, old_dst); + zink_destroy_compute_program(ctx, old_dst); ret = true; } if (dst) *dst = src; @@ -217,7 +268,7 @@ zink_compute_program_reference(struct zink_screen *screen, } VkPipelineLayout -zink_pipeline_layout_create(struct zink_screen *screen, struct zink_program *pg); +zink_pipeline_layout_create(struct zink_screen *screen, struct zink_program *pg, uint32_t *compat); void zink_program_update_compute_pipeline_state(struct zink_context *ctx, struct zink_compute_program *comp, const uint block[3]); @@ -232,6 +283,59 @@ zink_program_has_descriptors(const struct zink_program *pg) { return pg->num_dsl > 0; } + +static inline struct zink_fs_key * +zink_set_fs_key(struct zink_context *ctx) +{ + ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_FRAGMENT); + return (struct zink_fs_key *)&ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_FRAGMENT]; +} + +static inline const struct zink_fs_key * +zink_get_fs_key(struct zink_context *ctx) +{ + return (const struct zink_fs_key *)&ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_FRAGMENT]; +} + +static inline struct zink_vs_key * +zink_set_vs_key(struct zink_context *ctx) +{ + ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_VERTEX); + return (struct zink_vs_key *)&ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_VERTEX]; +} + +static inline const struct zink_vs_key * +zink_get_vs_key(struct zink_context *ctx) +{ + return (const struct zink_vs_key *)&ctx->gfx_pipeline_state.shader_keys.key[PIPE_SHADER_VERTEX]; +} + +static inline struct zink_vs_key_base * +zink_set_last_vertex_key(struct zink_context *ctx) +{ + ctx->last_vertex_stage_dirty = true; + return (struct zink_vs_key_base *)&ctx->gfx_pipeline_state.shader_keys.last_vertex; +} + +static inline const struct zink_vs_key_base * +zink_get_last_vertex_key(struct zink_context *ctx) +{ + return (const struct zink_vs_key_base *)&ctx->gfx_pipeline_state.shader_keys.last_vertex; +} + +static inline void +zink_set_fs_point_coord_key(struct zink_context *ctx) +{ + const struct zink_fs_key *fs = zink_get_fs_key(ctx); + bool disable = !ctx->gfx_pipeline_state.has_points || !ctx->rast_state->base.sprite_coord_enable; + uint8_t coord_replace_bits = disable ? 0 : ctx->rast_state->base.sprite_coord_enable; + bool coord_replace_yinvert = disable ? 
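/* The getter/setter pairs above encode a compare-before-write discipline: read
 * through the const zink_get_*_key() to detect a real change, and only then write
 * through zink_set_*_key(), which flags the stage in dirty_shader_stages and
 * forces a shader-variant rebuild. zink_set_fs_point_coord_key() follows exactly
 * this pattern; condensed:
 */
static void
set_coord_replace_sketch(struct zink_context *ctx, uint8_t bits)
{
   if (zink_get_fs_key(ctx)->coord_replace_bits != bits)
      zink_set_fs_key(ctx)->coord_replace_bits = bits;   /* dirty FS only on change */
}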
false : !!ctx->rast_state->base.sprite_coord_mode; + if (fs->coord_replace_bits != coord_replace_bits || fs->coord_replace_yinvert != coord_replace_yinvert) { + zink_set_fs_key(ctx)->coord_replace_bits = coord_replace_bits; + zink_set_fs_key(ctx)->coord_replace_yinvert = coord_replace_yinvert; + } +} + #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_query.c b/mesa 3D driver/src/gallium/drivers/zink/zink_query.c index cdfb15f3c5..636fbc2689 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_query.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_query.c @@ -11,7 +11,11 @@ #include "util/u_inlines.h" #include "util/u_memory.h" +#if defined(PIPE_ARCH_X86_64) || defined(PIPE_ARCH_PPC_64) || defined(PIPE_ARCH_AARCH64) || defined(PIPE_ARCH_MIPS64) #define NUM_QUERIES 5000 +#else +#define NUM_QUERIES 500 +#endif struct zink_query_buffer { struct list_head list; @@ -39,7 +43,6 @@ struct zink_query { bool dead; /* query should be destroyed when its fence finishes */ bool needs_update; /* query needs to update its qbos */ - unsigned fences; struct list_head active_list; struct list_head stats_list; /* when active, statistics queries are added to ctx->primitives_generated_queries */ @@ -49,7 +52,10 @@ struct zink_query { struct zink_batch_usage *batch_id; //batch that the query was started in struct list_head buffers; - struct zink_query_buffer *curr_qbo; + union { + struct zink_query_buffer *curr_qbo; + struct pipe_fence_handle *fence; //PIPE_QUERY_GPU_FINISHED + }; struct zink_resource *predicate; bool predicate_dirty; @@ -188,7 +194,7 @@ qbo_sync_from_prev(struct zink_context *ctx, struct zink_query *query, unsigned unsigned qbo_offset = last_start * get_num_results(query->type) * sizeof(uint64_t); query->curr_query = id_offset; query->curr_qbo->num_results = id_offset; - zink_copy_buffer(ctx, NULL, zink_resource(query->curr_qbo->buffer), zink_resource(prev->buffer), 0, + zink_copy_buffer(ctx, zink_resource(query->curr_qbo->buffer), zink_resource(prev->buffer), 0, qbo_offset, id_offset * result_size); } @@ -202,7 +208,7 @@ qbo_append(struct pipe_screen *screen, struct zink_query *query) if (!qbo) return false; qbo->buffer = pipe_buffer_create(screen, PIPE_BIND_QUERY_BUFFER, - PIPE_USAGE_STREAM, + PIPE_USAGE_STAGING, /* this is the maximum possible size of the results in a given buffer */ NUM_QUERIES * get_num_results(query->type) * sizeof(uint64_t)); if (!qbo->buffer) @@ -210,7 +216,7 @@ qbo_append(struct pipe_screen *screen, struct zink_query *query) if (query->type == PIPE_QUERY_PRIMITIVES_GENERATED) { /* need separate xfb buffer */ qbo->xfb_buffers[0] = pipe_buffer_create(screen, PIPE_BIND_QUERY_BUFFER, - PIPE_USAGE_STREAM, + PIPE_USAGE_STAGING, /* this is the maximum possible size of the results in a given buffer */ NUM_QUERIES * get_num_results(query->type) * sizeof(uint64_t)); if (!qbo->xfb_buffers[0]) @@ -220,7 +226,7 @@ qbo_append(struct pipe_screen *screen, struct zink_query *query) for (unsigned i = 0; i < ARRAY_SIZE(qbo->xfb_buffers); i++) { /* need separate xfb buffer */ qbo->xfb_buffers[i] = pipe_buffer_create(screen, PIPE_BIND_QUERY_BUFFER, - PIPE_USAGE_STREAM, + PIPE_USAGE_STAGING, /* this is the maximum possible size of the results in a given buffer */ NUM_QUERIES * get_num_results(query->type) * sizeof(uint64_t)); if (!qbo->xfb_buffers[i]) @@ -241,9 +247,9 @@ qbo_append(struct pipe_screen *screen, struct zink_query *query) static void destroy_query(struct zink_screen *screen, struct zink_query *query) { - 
assert(!p_atomic_read(&query->fences)); + assert(zink_screen_usage_check_completion(screen, query->batch_id)); if (query->query_pool) - vkDestroyQueryPool(screen->dev, query->query_pool, NULL); + VKSCR(DestroyQueryPool)(screen->dev, query->query_pool, NULL); struct zink_query_buffer *qbo, *next; LIST_FOR_EACH_ENTRY_SAFE(qbo, next, &query->buffers, list) { pipe_resource_reference(&qbo->buffer, NULL); @@ -253,7 +259,7 @@ destroy_query(struct zink_screen *screen, struct zink_query *query) } for (unsigned i = 0; i < ARRAY_SIZE(query->xfb_query_pool); i++) { if (query->xfb_query_pool[i]) - vkDestroyQueryPool(screen->dev, query->xfb_query_pool[i], NULL); + VKSCR(DestroyQueryPool)(screen->dev, query->xfb_query_pool[i], NULL); } pipe_resource_reference((struct pipe_resource**)&query->predicate, NULL); FREE(query); @@ -280,10 +286,14 @@ zink_create_query(struct pipe_context *pctx, query->index = index; query->type = query_type; + if (query->type == PIPE_QUERY_GPU_FINISHED) + return (struct pipe_query *)query; query->vkqtype = convert_query_type(query_type, &query->precise); if (query->vkqtype == -1) return NULL; + assert(!query->precise || query->vkqtype == VK_QUERY_TYPE_OCCLUSION); + query->curr_query = 0; pool_create.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO; @@ -295,7 +305,7 @@ zink_create_query(struct pipe_context *pctx, else if (query_type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE) pool_create.pipelineStatistics = pipeline_statistic_convert(index); - VkResult status = vkCreateQueryPool(screen->dev, &pool_create, NULL, &query->query_pool); + VkResult status = VKSCR(CreateQueryPool)(screen->dev, &pool_create, NULL, &query->query_pool); if (status != VK_SUCCESS) goto fail; if (query_type == PIPE_QUERY_PRIMITIVES_GENERATED) { @@ -304,13 +314,13 @@ zink_create_query(struct pipe_context *pctx, pool_create.queryType = VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT; pool_create.queryCount = NUM_QUERIES; - status = vkCreateQueryPool(screen->dev, &pool_create, NULL, &query->xfb_query_pool[0]); + status = VKSCR(CreateQueryPool)(screen->dev, &pool_create, NULL, &query->xfb_query_pool[0]); if (status != VK_SUCCESS) goto fail; } else if (query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { /* need to monitor all xfb streams */ for (unsigned i = 0; i < ARRAY_SIZE(query->xfb_query_pool); i++) { - status = vkCreateQueryPool(screen->dev, &pool_create, NULL, &query->xfb_query_pool[i]); + status = VKSCR(CreateQueryPool)(screen->dev, &pool_create, NULL, &query->xfb_query_pool[i]); if (status != VK_SUCCESS) goto fail; } @@ -338,10 +348,11 @@ zink_destroy_query(struct pipe_context *pctx, struct zink_screen *screen = zink_screen(pctx->screen); struct zink_query *query = (struct zink_query *)q; - p_atomic_set(&query->dead, true); - if (p_atomic_read(&query->fences)) { - if (query->xfb_running) - zink_fence_wait(pctx); + /* only destroy if this query isn't active on any batches, + * otherwise just mark dead and wait + */ + if (query->batch_id) { + p_atomic_set(&query->dead, true); return; } @@ -349,12 +360,13 @@ zink_destroy_query(struct pipe_context *pctx, } void -zink_prune_query(struct zink_screen *screen, struct zink_query *query) +zink_prune_query(struct zink_screen *screen, struct zink_batch_state *bs, struct zink_query *query) { - if (!p_atomic_dec_return(&query->fences)) { - if (p_atomic_read(&query->dead)) - destroy_query(screen, query); - } + if (!zink_batch_usage_matches(query->batch_id, bs)) + return; + query->batch_id = NULL; + if (p_atomic_read(&query->dead)) + destroy_query(screen, query); } static 
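/* The vkFoo() -> VKSCR(Foo)/VKCTX(Foo) conversions throughout this series route
 * every Vulkan call through the function pointers the screen loaded at init time
 * (the pre-existing screen->vk.CmdBeginQueryIndexedEXT calls in this file show
 * that the table already lives at screen->vk). The macros presumably amount to
 * something like the following; the exact definitions are an assumption here:
 */
#define VKSCR(fn) screen->vk.fn                          /* needs a local 'screen' */
#define VKCTX(fn) zink_screen(ctx->base.screen)->vk.fn   /* needs a local 'ctx'    */
/* e.g. VKSCR(DestroyQueryPool)(screen->dev, pool, NULL)
 *      expands to screen->vk.DestroyQueryPool(screen->dev, pool, NULL). */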
void @@ -434,7 +446,7 @@ get_query_result(struct pipe_context *pctx, flags |= PIPE_MAP_DONTBLOCK; if (query->base.flushed) /* this is not a context-safe operation; ensure map doesn't use slab alloc */ - flags |= PIPE_MAP_THREAD_SAFE | PIPE_MAP_UNSYNCHRONIZED; + flags |= PIPE_MAP_THREAD_SAFE; util_query_clear_result(result, query->type); @@ -461,6 +473,8 @@ get_query_result(struct pipe_context *pctx, if (!xfb_results) { if (wait) debug_printf("zink: xfb qbo read failed!"); + pipe_buffer_unmap(pctx, xfer); + return false; } } check_query_results(query, result, is_timestamp ? 1 : qbo->num_results, results, xfb_results); @@ -545,11 +559,11 @@ copy_pool_results_to_buffer(struct zink_context *ctx, struct zink_query *query, zink_batch_no_rp(ctx); /* if it's a single query that doesn't need special handling, we can copy it and be done */ zink_batch_reference_resource_rw(batch, res, true); - zink_resource_buffer_barrier(ctx, batch, res, VK_ACCESS_TRANSFER_WRITE_BIT, 0); + zink_resource_buffer_barrier(ctx, res, VK_ACCESS_TRANSFER_WRITE_BIT, 0); util_range_add(&res->base.b, &res->valid_buffer_range, offset, offset + result_size); assert(query_id < NUM_QUERIES); - vkCmdCopyQueryPoolResults(batch->state->cmdbuf, pool, query_id, num_results, res->obj->buffer, - offset, 0, flags); + VKCTX(CmdCopyQueryPoolResults)(batch->state->cmdbuf, pool, query_id, num_results, res->obj->buffer, + offset, type_size, flags); } static void @@ -571,24 +585,26 @@ reset_pool(struct zink_context *ctx, struct zink_batch *batch, struct zink_query if (q->needs_update) update_qbo(ctx, q); - vkCmdResetQueryPool(batch->state->cmdbuf, q->query_pool, 0, NUM_QUERIES); + VKCTX(CmdResetQueryPool)(batch->state->cmdbuf, q->query_pool, 0, NUM_QUERIES); if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED) - vkCmdResetQueryPool(batch->state->cmdbuf, q->xfb_query_pool[0], 0, NUM_QUERIES); + VKCTX(CmdResetQueryPool)(batch->state->cmdbuf, q->xfb_query_pool[0], 0, NUM_QUERIES); else if (q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { for (unsigned i = 0; i < ARRAY_SIZE(q->xfb_query_pool); i++) - vkCmdResetQueryPool(batch->state->cmdbuf, q->xfb_query_pool[i], 0, NUM_QUERIES); + VKCTX(CmdResetQueryPool)(batch->state->cmdbuf, q->xfb_query_pool[i], 0, NUM_QUERIES); } memset(q->have_gs, 0, sizeof(q->have_gs)); memset(q->have_xfb, 0, sizeof(q->have_xfb)); q->last_start = q->curr_query = 0; q->needs_reset = false; - /* create new qbo for non-timestamp queries */ - if (q->type != PIPE_QUERY_TIMESTAMP) { - if (qbo_append(ctx->base.screen, q)) - reset_qbo(q); - else - debug_printf("zink: qbo alloc failed on reset!"); - } + /* create new qbo for non-timestamp queries: + * timestamp queries should never need more than 2 entries in the qbo + */ + if (q->type == PIPE_QUERY_TIMESTAMP) + return; + if (qbo_append(ctx->base.screen, q)) + reset_qbo(q); + else + debug_printf("zink: qbo alloc failed on reset!"); if (id_offset) qbo_sync_from_prev(ctx, q, id_offset, last_start); } @@ -649,9 +665,11 @@ begin_query(struct zink_context *ctx, struct zink_batch *batch, struct zink_quer q->active = true; batch->has_work = true; if (q->type == PIPE_QUERY_TIME_ELAPSED) { - vkCmdWriteTimestamp(batch->state->cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, q->query_pool, q->curr_query); + VKCTX(CmdWriteTimestamp)(batch->state->cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, q->query_pool, q->curr_query); q->curr_query++; update_qbo(ctx, q); + zink_batch_usage_set(&q->batch_id, batch->state); + _mesa_set_add(batch->state->active_queries, q); } /* ignore the rest of begin_query for 
timestamps */ if (is_time_query(q)) @@ -661,31 +679,30 @@ begin_query(struct zink_context *ctx, struct zink_batch *batch, struct zink_quer if (q->type == PIPE_QUERY_PRIMITIVES_EMITTED || q->type == PIPE_QUERY_PRIMITIVES_GENERATED || q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) { - zink_screen(ctx->base.screen)->vk.CmdBeginQueryIndexedEXT(batch->state->cmdbuf, - q->xfb_query_pool[0] ? q->xfb_query_pool[0] : q->query_pool, - q->curr_query, - flags, - q->index); + VKCTX(CmdBeginQueryIndexedEXT)(batch->state->cmdbuf, + q->xfb_query_pool[0] ? q->xfb_query_pool[0] : q->query_pool, + q->curr_query, + flags, + q->index); q->xfb_running = true; } else if (q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { - zink_screen(ctx->base.screen)->vk.CmdBeginQueryIndexedEXT(batch->state->cmdbuf, - q->query_pool, - q->curr_query, - flags, - 0); + VKCTX(CmdBeginQueryIndexedEXT)(batch->state->cmdbuf, + q->query_pool, + q->curr_query, + flags, + 0); for (unsigned i = 0; i < ARRAY_SIZE(q->xfb_query_pool); i++) - zink_screen(ctx->base.screen)->vk.CmdBeginQueryIndexedEXT(batch->state->cmdbuf, - q->xfb_query_pool[i], - q->curr_query, - flags, - i + 1); + VKCTX(CmdBeginQueryIndexedEXT)(batch->state->cmdbuf, + q->xfb_query_pool[i], + q->curr_query, + flags, + i + 1); q->xfb_running = true; } if (q->vkqtype != VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT) - vkCmdBeginQuery(batch->state->cmdbuf, q->query_pool, q->curr_query, flags); + VKCTX(CmdBeginQuery)(batch->state->cmdbuf, q->query_pool, q->curr_query, flags); if (needs_stats_list(q)) list_addtail(&q->stats_list, &ctx->primitives_generated_queries); - p_atomic_inc(&q->fences); zink_batch_usage_set(&q->batch_id, batch->state); _mesa_set_add(batch->state->active_queries, q); } @@ -708,49 +725,50 @@ zink_begin_query(struct pipe_context *pctx, } static void -end_query(struct zink_context *ctx, struct zink_batch *batch, struct zink_query *q) +update_query_id(struct zink_context *ctx, struct zink_query *q) { - struct zink_screen *screen = zink_screen(ctx->base.screen); - ASSERTED struct zink_query_buffer *qbo = q->curr_qbo; - assert(qbo); - batch->has_work = true; - q->active = q->type == PIPE_QUERY_TIMESTAMP; - if (is_time_query(q)) { - if (q->needs_reset) - reset_pool(ctx, batch, q); - vkCmdWriteTimestamp(batch->state->cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, - q->query_pool, q->curr_query); - zink_batch_usage_set(&q->batch_id, batch->state); - } else if (q->type == PIPE_QUERY_PRIMITIVES_EMITTED || - q->type == PIPE_QUERY_PRIMITIVES_GENERATED || - q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) { - screen->vk.CmdEndQueryIndexedEXT(batch->state->cmdbuf, q->xfb_query_pool[0] ? 
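/* Query teardown is keyed off batch usage instead of the old per-query fence
 * counter: batch_id records the batch state the query last touched, destruction
 * is deferred while that batch is still in flight, and zink_prune_query() (above)
 * finishes the job once the batch completes. The shape of the destroy side:
 */
static void
destroy_or_defer_sketch(struct zink_screen *screen, struct zink_query *q)
{
   if (q->batch_id) {                 /* still referenced by an unfinished batch */
      p_atomic_set(&q->dead, true);   /* zink_prune_query() destroys it later    */
      return;
   }
   destroy_query(screen, q);          /* nothing in flight: destroy immediately  */
}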
q->xfb_query_pool[0] : - q->query_pool, - q->curr_query, q->index); - } - - else if (q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { - screen->vk.CmdEndQueryIndexedEXT(batch->state->cmdbuf, q->query_pool, q->curr_query, 0); - for (unsigned i = 0; i < ARRAY_SIZE(q->xfb_query_pool); i++) { - screen->vk.CmdEndQueryIndexedEXT(batch->state->cmdbuf, q->xfb_query_pool[i], q->curr_query, i + 1); - } - } - if (q->vkqtype != VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT && !is_time_query(q)) - vkCmdEndQuery(batch->state->cmdbuf, q->query_pool, q->curr_query); - - if (needs_stats_list(q)) - list_delinit(&q->stats_list); if (++q->curr_query == NUM_QUERIES) { /* always reset on start; this ensures we can actually submit the batch that the current query is on */ q->needs_reset = true; } + ctx->batch.has_work = true; - if (batch->in_rp) + if (ctx->batch.in_rp) q->needs_update = true; else update_qbo(ctx, q); } +static void +end_query(struct zink_context *ctx, struct zink_batch *batch, struct zink_query *q) +{ + ASSERTED struct zink_query_buffer *qbo = q->curr_qbo; + assert(qbo); + assert(!is_time_query(q)); + q->active = false; + if (q->type == PIPE_QUERY_PRIMITIVES_EMITTED || + q->type == PIPE_QUERY_PRIMITIVES_GENERATED || + q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) { + VKCTX(CmdEndQueryIndexedEXT)(batch->state->cmdbuf, + q->xfb_query_pool[0] ? q->xfb_query_pool[0] : q->query_pool, + q->curr_query, q->index); + } + + else if (q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) { + VKCTX(CmdEndQueryIndexedEXT)(batch->state->cmdbuf, q->query_pool, q->curr_query, 0); + for (unsigned i = 0; i < ARRAY_SIZE(q->xfb_query_pool); i++) { + VKCTX(CmdEndQueryIndexedEXT)(batch->state->cmdbuf, q->xfb_query_pool[i], q->curr_query, i + 1); + } + } + if (q->vkqtype != VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT && !is_time_query(q)) + VKCTX(CmdEndQuery)(batch->state->cmdbuf, q->query_pool, q->curr_query); + + if (needs_stats_list(q)) + list_delinit(&q->stats_list); + + update_query_id(ctx, q); +} + static bool zink_end_query(struct pipe_context *pctx, struct pipe_query *q) @@ -759,12 +777,25 @@ zink_end_query(struct pipe_context *pctx, struct zink_query *query = (struct zink_query *)q; struct zink_batch *batch = &ctx->batch; + if (query->type == PIPE_QUERY_GPU_FINISHED) { + pctx->flush(pctx, &query->fence, PIPE_FLUSH_DEFERRED); + return true; + } + /* FIXME: this can be called from a thread, but it needs to write to the cmdbuf */ threaded_context_unwrap_sync(pctx); if (needs_stats_list(query)) list_delinit(&query->stats_list); - if (query->active) + if (is_time_query(query)) { + if (query->needs_reset) + reset_pool(ctx, batch, query); + VKCTX(CmdWriteTimestamp)(batch->state->cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + query->query_pool, query->curr_query); + zink_batch_usage_set(&query->batch_id, batch->state); + _mesa_set_add(batch->state->active_queries, query); + update_query_id(ctx, query); + } else if (query->active) end_query(ctx, batch, query); return true; @@ -779,6 +810,14 @@ zink_get_query_result(struct pipe_context *pctx, struct zink_query *query = (void*)q; struct zink_context *ctx = zink_context(pctx); + if (query->type == PIPE_QUERY_GPU_FINISHED) { + struct pipe_screen *screen = pctx->screen; + + result->b = screen->fence_finish(screen, query->base.flushed ? NULL : pctx, + query->fence, wait ? 
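/* PIPE_QUERY_GPU_FINISHED needs no VkQueryPool at all: zink_end_query() captures
 * a fence via a deferred flush, and zink_get_query_result() simply waits on it.
 * A hypothetical frontend-style consumer, sketched under the assumption that this
 * query type takes no begin_query() call:
 */
static bool
gpu_finished_sketch(struct pipe_context *pctx, bool wait)
{
   union pipe_query_result result;
   struct pipe_query *q = pctx->create_query(pctx, PIPE_QUERY_GPU_FINISHED, 0);
   pctx->end_query(pctx, q);                         /* grabs the deferred-flush fence */
   bool done = pctx->get_query_result(pctx, q, wait, &result) && result.b;
   pctx->destroy_query(pctx, q);
   return done;
}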
PIPE_TIMEOUT_INFINITE : 0); + return result->b; + } + if (query->needs_update) update_qbo(ctx, query); @@ -801,7 +840,7 @@ zink_suspend_queries(struct zink_context *ctx, struct zink_batch *batch) set_foreach(batch->state->active_queries, entry) { struct zink_query *query = (void*)entry->key; /* if a query isn't active here then we don't need to reactivate it on the next batch */ - if (query->active) { + if (query->active && !is_time_query(query)) { end_query(ctx, batch, query); /* the fence is going to steal the set off the batch, so we have to copy * the active queries onto a list @@ -853,8 +892,9 @@ zink_set_active_query_state(struct pipe_context *pctx, bool enable) void zink_start_conditional_render(struct zink_context *ctx) { + if (unlikely(!zink_screen(ctx->base.screen)->info.have_EXT_conditional_rendering)) + return; struct zink_batch *batch = &ctx->batch; - struct zink_screen *screen = zink_screen(ctx->base.screen); VkConditionalRenderingFlagsEXT begin_flags = 0; if (ctx->render_condition.inverted) begin_flags = VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT; @@ -862,7 +902,7 @@ zink_start_conditional_render(struct zink_context *ctx) begin_info.sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT; begin_info.buffer = ctx->render_condition.query->predicate->obj->buffer; begin_info.flags = begin_flags; - screen->vk.CmdBeginConditionalRenderingEXT(batch->state->cmdbuf, &begin_info); + VKCTX(CmdBeginConditionalRenderingEXT)(batch->state->cmdbuf, &begin_info); zink_batch_reference_resource_rw(batch, ctx->render_condition.query->predicate, false); } @@ -870,9 +910,24 @@ void zink_stop_conditional_render(struct zink_context *ctx) { struct zink_batch *batch = &ctx->batch; - struct zink_screen *screen = zink_screen(ctx->base.screen); zink_clear_apply_conditionals(ctx); - screen->vk.CmdEndConditionalRenderingEXT(batch->state->cmdbuf); + if (unlikely(!zink_screen(ctx->base.screen)->info.have_EXT_conditional_rendering)) + return; + VKCTX(CmdEndConditionalRenderingEXT)(batch->state->cmdbuf); +} + +bool +zink_check_conditional_render(struct zink_context *ctx) +{ + if (!ctx->render_condition_active) + return true; + assert(ctx->render_condition.query); + + union pipe_query_result result; + zink_get_query_result(&ctx->base, (struct pipe_query*)ctx->render_condition.query, true, &result); + return is_bool_query(ctx->render_condition.query) ? + ctx->render_condition.inverted != result.b : + ctx->render_condition.inverted != !!result.u64; } static void @@ -948,7 +1003,6 @@ zink_get_query_result_resource(struct pipe_context *pctx, VkQueryResultFlagBits size_flags = result_type <= PIPE_QUERY_TYPE_U32 ? 0 : VK_QUERY_RESULT_64_BIT; unsigned num_queries = query->curr_query - query->last_start; unsigned query_id = query->last_start; - unsigned fences = p_atomic_read(&query->fences); if (index == -1) { /* VK_QUERY_RESULT_WITH_AVAILABILITY_BIT will ALWAYS write some kind of result data @@ -960,9 +1014,9 @@ zink_get_query_result_resource(struct pipe_context *pctx, */ VkQueryResultFlags flag = is_time_query(query) ? 
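/* Without VK_EXT_conditional_rendering, zink_start/stop_conditional_render()
 * above become no-ops and zink_check_conditional_render() supplies a CPU
 * fallback: it blocks on the predicate query's result and tells the draw path
 * whether to emit the draw at all. Its return expression reduces to
 * "render iff result XOR inverted", since != on bools acts as XOR:
 */
static bool
predicate_passes_sketch(bool is_bool_query, union pipe_query_result result,
                        bool inverted)
{
   return is_bool_query ? inverted != result.b
                        : inverted != !!result.u64;  /* nonzero counter == visible */
}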
0 : VK_QUERY_RESULT_PARTIAL_BIT; - if (!fences) { + if (zink_batch_usage_check_completion(ctx, query->batch_id)) { uint64_t u64[2] = {0}; - if (vkGetQueryPoolResults(screen->dev, query->query_pool, query_id, 1, 2 * result_size, u64, + if (VKCTX(GetQueryPoolResults)(screen->dev, query->query_pool, query_id, 1, 2 * result_size, u64, 0, size_flags | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | flag) == VK_SUCCESS) { pipe_buffer_write(pctx, pres, offset, result_size, (unsigned char*)u64 + result_size); return; @@ -970,7 +1024,7 @@ zink_get_query_result_resource(struct pipe_context *pctx, } struct pipe_resource *staging = pipe_buffer_create(pctx->screen, 0, PIPE_USAGE_STAGING, result_size * 2); copy_results_to_buffer(ctx, query, zink_resource(staging), 0, 1, size_flags | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | flag); - zink_copy_buffer(ctx, &ctx->batch, res, zink_resource(staging), offset, result_size, result_size); + zink_copy_buffer(ctx, res, zink_resource(staging), offset, result_size, result_size); pipe_resource_reference(&staging, NULL); return; } @@ -983,7 +1037,7 @@ zink_get_query_result_resource(struct pipe_context *pctx, if (query->needs_update) update_qbo(ctx, query); /* internal qbo always writes 64bit value so we can just direct copy */ - zink_copy_buffer(ctx, NULL, res, zink_resource(query->curr_qbo->buffer), offset, + zink_copy_buffer(ctx, res, zink_resource(query->curr_qbo->buffer), offset, get_buffer_offset(query, query->curr_qbo->buffer, query->last_start), result_size); } else @@ -1010,7 +1064,7 @@ zink_get_timestamp(struct pipe_context *pctx) VkCalibratedTimestampInfoEXT cti = {0}; cti.sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT; cti.timeDomain = VK_TIME_DOMAIN_DEVICE_EXT; - screen->vk.GetCalibratedTimestampsEXT(screen->dev, 1, &cti, &timestamp, &deviation); + VKSCR(GetCalibratedTimestampsEXT)(screen->dev, 1, &cti, &timestamp, &deviation); timestamp_to_nanoseconds(screen, &timestamp); return timestamp; } diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_query.h b/mesa 3D driver/src/gallium/drivers/zink/zink_query.h index 1f90ecc226..73fd31eeda 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_query.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_query.h @@ -24,7 +24,10 @@ #ifndef ZINK_QUERY_H #define ZINK_QUERY_H +#include <stdbool.h> + struct zink_batch; +struct zink_batch_state; struct zink_context; struct zink_fence; struct zink_query; @@ -40,7 +43,7 @@ void zink_resume_queries(struct zink_context *ctx, struct zink_batch *batch); void -zink_prune_query(struct zink_screen *screen, struct zink_query *query); +zink_prune_query(struct zink_screen *screen, struct zink_batch_state *bs, struct zink_query *query); void zink_query_update_gs_states(struct zink_context *ctx); @@ -50,6 +53,9 @@ zink_start_conditional_render(struct zink_context *ctx); void zink_stop_conditional_render(struct zink_context *ctx); + +bool +zink_check_conditional_render(struct zink_context *ctx); #ifdef __cplusplus } #endif diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.c b/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.c index 84228d3334..265bf941e2 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.c @@ -29,19 +29,22 @@ #include "util/u_string.h" static VkRenderPass -create_render_pass(VkDevice dev, struct zink_render_pass_state *state) +create_render_pass(struct zink_screen *screen, struct zink_render_pass_state *state, struct zink_render_pass_pipeline_state *pstate) {
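/* zink_get_timestamp() above samples the GPU clock directly through
 * VK_EXT_calibrated_timestamps rather than submitting a timestamp query. The core
 * call, with the tick-to-nanosecond conversion that timestamp_to_nanoseconds() is
 * assumed to perform via VkPhysicalDeviceLimits::timestampPeriod:
 */
static uint64_t
sample_gpu_clock_sketch(VkDevice dev, double timestamp_period_ns)
{
   const VkCalibratedTimestampInfoEXT cti = {
      .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT,
      .timeDomain = VK_TIME_DOMAIN_DEVICE_EXT,
   };
   uint64_t timestamp, deviation;
   /* loaded via the device dispatch table in the driver; direct name for brevity */
   vkGetCalibratedTimestampsEXT(dev, 1, &cti, &timestamp, &deviation);
   return (uint64_t)(timestamp * timestamp_period_ns);   /* ticks -> nanoseconds */
}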
VkAttachmentReference color_refs[PIPE_MAX_COLOR_BUFS], zs_ref; + VkAttachmentReference input_attachments[PIPE_MAX_COLOR_BUFS]; VkAttachmentDescription attachments[PIPE_MAX_COLOR_BUFS + 1]; VkPipelineStageFlags dep_pipeline = 0; VkAccessFlags dep_access = 0; + unsigned input_count = 0; + pstate->num_attachments = state->num_cbufs; for (int i = 0; i < state->num_cbufs; i++) { struct zink_rt_attrib *rt = state->rts + i; attachments[i].flags = 0; - attachments[i].format = rt->format; - attachments[i].samples = rt->samples; + pstate->attachments[i].format = attachments[i].format = rt->format; + pstate->attachments[i].samples = attachments[i].samples = rt->samples; attachments[i].loadOp = rt->clear_color ? VK_ATTACHMENT_LOAD_OP_CLEAR : state->swapchain_init && rt->swapchain ? VK_ATTACHMENT_LOAD_OP_DONT_CARE : @@ -50,11 +53,14 @@ create_render_pass(VkDevice dev, struct zink_render_pass_state *state) attachments[i].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; attachments[i].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; /* if layout changes are ever handled here, need VkAttachmentSampleLocationsEXT */ - attachments[i].initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - attachments[i].finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + VkImageLayout layout = rt->fbfetch ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + attachments[i].initialLayout = layout; + attachments[i].finalLayout = layout; color_refs[i].attachment = i; - color_refs[i].layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + color_refs[i].layout = layout; dep_pipeline |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + if (rt->fbfetch) + memcpy(&input_attachments[input_count++], &color_refs[i], sizeof(VkAttachmentReference)); dep_access |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; if (attachments[i].loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) dep_access |= VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; @@ -66,8 +72,8 @@ create_render_pass(VkDevice dev, struct zink_render_pass_state *state) bool has_clear = rt->clear_color || rt->clear_stencil; VkImageLayout layout = rt->needs_write || has_clear ? VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL : VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL; attachments[num_attachments].flags = 0; - attachments[num_attachments].format = rt->format; - attachments[num_attachments].samples = rt->samples; + pstate->attachments[num_attachments].format = attachments[num_attachments].format = rt->format; + pstate->attachments[num_attachments].samples = attachments[num_attachments].samples = rt->samples; attachments[num_attachments].loadOp = rt->clear_color ? VK_ATTACHMENT_LOAD_OP_CLEAR : VK_ATTACHMENT_LOAD_OP_LOAD; attachments[num_attachments].storeOp = VK_ATTACHMENT_STORE_OP_STORE; attachments[num_attachments].stencilLoadOp = rt->clear_stencil ? VK_ATTACHMENT_LOAD_OP_CLEAR : VK_ATTACHMENT_LOAD_OP_LOAD; @@ -85,6 +91,7 @@ create_render_pass(VkDevice dev, struct zink_render_pass_state *state) zs_ref.attachment = num_attachments++; zs_ref.layout = layout; + pstate->num_attachments++; } VkSubpassDependency deps[] = { @@ -97,6 +104,8 @@ create_render_pass(VkDevice dev, struct zink_render_pass_state *state) subpass.colorAttachmentCount = state->num_cbufs; subpass.pColorAttachments = color_refs; subpass.pDepthStencilAttachment = state->have_zsbuf ? 
&zs_ref : NULL; + subpass.inputAttachmentCount = input_count; + subpass.pInputAttachments = input_attachments; VkRenderPassCreateInfo rpci = {0}; rpci.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; @@ -108,7 +117,7 @@ create_render_pass(VkDevice dev, struct zink_render_pass_state *state) rpci.pDependencies = deps; VkRenderPass render_pass; - if (vkCreateRenderPass(dev, &rpci, NULL, &render_pass) != VK_SUCCESS) { + if (VKSCR(CreateRenderPass)(screen->dev, &rpci, NULL, &render_pass) != VK_SUCCESS) { debug_printf("vkCreateRenderPass failed\n"); return VK_NULL_HANDLE; } @@ -116,15 +125,171 @@ create_render_pass(VkDevice dev, struct zink_render_pass_state *state) return render_pass; } +static VkRenderPass +create_render_pass2(struct zink_screen *screen, struct zink_render_pass_state *state, struct zink_render_pass_pipeline_state *pstate) +{ + + VkAttachmentReference2 color_refs[PIPE_MAX_COLOR_BUFS], color_resolves[PIPE_MAX_COLOR_BUFS], zs_ref, zs_resolve; + VkAttachmentReference2 input_attachments[PIPE_MAX_COLOR_BUFS]; + VkAttachmentDescription2 attachments[2 * (PIPE_MAX_COLOR_BUFS + 1)]; + VkPipelineStageFlags dep_pipeline = 0; + VkAccessFlags dep_access = 0; + unsigned input_count = 0; + const unsigned cresolve_offset = state->num_cbufs + state->have_zsbuf; + const unsigned zsresolve_offset = cresolve_offset + state->num_cresolves; + + pstate->num_attachments = state->num_cbufs; + pstate->num_cresolves = state->num_cresolves; + pstate->num_zsresolves = state->num_zsresolves; + for (int i = 0; i < state->num_cbufs; i++) { + struct zink_rt_attrib *rt = state->rts + i; + attachments[i].sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; + attachments[i].pNext = NULL; + attachments[i].flags = 0; + pstate->attachments[i].format = attachments[i].format = rt->format; + pstate->attachments[i].samples = attachments[i].samples = rt->samples; + attachments[i].loadOp = rt->clear_color ? VK_ATTACHMENT_LOAD_OP_CLEAR : + /* TODO: need replicate EXT */ + //rt->resolve || (state->swapchain_init && rt->swapchain) ? + state->swapchain_init && rt->swapchain ? + VK_ATTACHMENT_LOAD_OP_DONT_CARE : + VK_ATTACHMENT_LOAD_OP_LOAD; + + /* TODO: need replicate EXT */ + //attachments[i].storeOp = rt->resolve ? VK_ATTACHMENT_STORE_OP_DONT_CARE : VK_ATTACHMENT_STORE_OP_STORE; + attachments[i].storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[i].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + attachments[i].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + /* if layout changes are ever handled here, need VkAttachmentSampleLocationsEXT */ + VkImageLayout layout = rt->fbfetch ? 
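/* Framebuffer fetch maps onto Vulkan as a subpass that feeds a color attachment
 * back in as an input attachment: the attachment is referenced twice, and both
 * references use VK_IMAGE_LAYOUT_GENERAL because the image is read and written
 * within the same subpass. The wiring from the loop above, reduced to one
 * attachment:
 */
static void
wire_fbfetch_sketch(VkSubpassDescription *sp,
                    VkAttachmentReference *color_ref,
                    VkAttachmentReference *input_ref)
{
   color_ref->attachment = 0;
   color_ref->layout = VK_IMAGE_LAYOUT_GENERAL;   /* read + written in one subpass */
   *input_ref = *color_ref;                       /* same image, listed as an input */
   sp->pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
   sp->colorAttachmentCount = 1;
   sp->pColorAttachments = color_ref;
   sp->inputAttachmentCount = 1;
   sp->pInputAttachments = input_ref;
}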
VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + attachments[i].initialLayout = layout; + attachments[i].finalLayout = layout; + color_refs[i].sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2; + color_refs[i].pNext = NULL; + color_refs[i].attachment = i; + color_refs[i].layout = layout; + dep_pipeline |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + if (rt->fbfetch) + memcpy(&input_attachments[input_count++], &color_refs[i], sizeof(VkAttachmentReference2)); + dep_access |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + if (attachments[i].loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) + dep_access |= VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; + + if (rt->resolve) { + memcpy(&attachments[cresolve_offset + i], &attachments[i], sizeof(VkAttachmentDescription2)); + attachments[cresolve_offset + i].loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + attachments[cresolve_offset + i].storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[cresolve_offset + i].samples = 1; + memcpy(&color_resolves[i], &color_refs[i], sizeof(VkAttachmentReference2)); + color_resolves[i].attachment = cresolve_offset + i; + if (attachments[cresolve_offset + i].loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) + dep_access |= VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; + } + } + + int num_attachments = state->num_cbufs; + if (state->have_zsbuf) { + struct zink_rt_attrib *rt = state->rts + state->num_cbufs; + bool has_clear = rt->clear_color || rt->clear_stencil; + VkImageLayout layout = rt->needs_write || has_clear ? VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL : VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL; + attachments[num_attachments].sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2; + attachments[num_attachments].pNext = NULL; + attachments[num_attachments].flags = 0; + pstate->attachments[num_attachments].format = attachments[num_attachments].format = rt->format; + pstate->attachments[num_attachments].samples = attachments[num_attachments].samples = rt->samples; + attachments[num_attachments].loadOp = rt->clear_color ? VK_ATTACHMENT_LOAD_OP_CLEAR : VK_ATTACHMENT_LOAD_OP_LOAD; + attachments[num_attachments].stencilLoadOp = rt->clear_stencil ? VK_ATTACHMENT_LOAD_OP_CLEAR : VK_ATTACHMENT_LOAD_OP_LOAD; + /* TODO: need replicate EXT */ + //attachments[num_attachments].storeOp = rt->resolve ? VK_ATTACHMENT_LOAD_OP_DONT_CARE : VK_ATTACHMENT_STORE_OP_STORE; + //attachments[num_attachments].stencilStoreOp = rt->resolve ? 
VK_ATTACHMENT_LOAD_OP_DONT_CARE : VK_ATTACHMENT_STORE_OP_STORE; + attachments[num_attachments].storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[num_attachments].stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; + /* if layout changes are ever handled here, need VkAttachmentSampleLocationsEXT */ + attachments[num_attachments].initialLayout = layout; + attachments[num_attachments].finalLayout = layout; + + dep_pipeline |= VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; + if (layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) + dep_access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + if (attachments[num_attachments].loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || + attachments[num_attachments].stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) + dep_access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; + + zs_ref.sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2; + zs_ref.pNext = NULL; + zs_ref.attachment = num_attachments++; + zs_ref.layout = layout; + if (rt->resolve) { + memcpy(&attachments[zsresolve_offset], &attachments[num_attachments], sizeof(VkAttachmentDescription2)); + attachments[zsresolve_offset].loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + attachments[zsresolve_offset].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + attachments[zsresolve_offset].storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[zsresolve_offset].stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; + attachments[zsresolve_offset].samples = 1; + memcpy(&zs_resolve, &zs_ref, sizeof(VkAttachmentReference2)); + zs_ref.attachment = zsresolve_offset; + if (attachments[zsresolve_offset].loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || + attachments[zsresolve_offset].stencilLoadOp == VK_ATTACHMENT_LOAD_OP_LOAD) + dep_access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; + } + pstate->num_attachments++; + } + + VkSubpassDependency2 deps[] = { + [0] = {VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2, NULL, VK_SUBPASS_EXTERNAL, 0, dep_pipeline, dep_pipeline, 0, dep_access, VK_DEPENDENCY_BY_REGION_BIT, 0}, + [1] = {VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2, NULL, 0, VK_SUBPASS_EXTERNAL, dep_pipeline, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, dep_access, 0, VK_DEPENDENCY_BY_REGION_BIT, 0} + }; + + VkSubpassDescription2 subpass = {0}; + VkSubpassDescriptionDepthStencilResolve zsresolve; + subpass.sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = state->num_cbufs; + subpass.pColorAttachments = color_refs; + subpass.pDepthStencilAttachment = state->have_zsbuf ? 
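/* Color MSAA resolves in renderpass2 are declared per attachment: when
 * pResolveAttachments is set it must hold exactly colorAttachmentCount entries,
 * and entry i resolves color attachment i into a single-sample attachment (which
 * is why the resolve descriptions above use samples = 1, loadOp DONT_CARE,
 * storeOp STORE). Reduced to a single pair:
 */
static void
wire_color_resolve_sketch(VkSubpassDescription2 *sp,
                          const VkAttachmentReference2 *color_ref,
                          const VkAttachmentReference2 *resolve_ref)
{
   sp->colorAttachmentCount = 1;
   sp->pColorAttachments = color_ref;     /* the multisampled attachment            */
   sp->pResolveAttachments = resolve_ref; /* parallel array: entry i resolves color i */
}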
&zs_ref : NULL; + subpass.inputAttachmentCount = input_count; + subpass.pInputAttachments = input_attachments; + if (state->num_cresolves) + subpass.pResolveAttachments = color_resolves; + if (state->num_zsresolves) { + subpass.pNext = &zsresolve; + zsresolve.sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE; + zsresolve.pNext = NULL; + zsresolve.depthResolveMode = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT; + zsresolve.stencilResolveMode = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT; + zsresolve.pDepthStencilResolveAttachment = &zs_resolve; + } else + subpass.pNext = NULL; + + VkRenderPassCreateInfo2 rpci = {0}; + rpci.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2; + rpci.attachmentCount = num_attachments + state->num_cresolves + state->num_zsresolves; + rpci.pAttachments = attachments; + rpci.subpassCount = 1; + rpci.pSubpasses = &subpass; + rpci.dependencyCount = 2; + rpci.pDependencies = deps; + + VkRenderPass render_pass; + if (VKSCR(CreateRenderPass2)(screen->dev, &rpci, NULL, &render_pass) != VK_SUCCESS) { + debug_printf("vkCreateRenderPass2 failed\n"); + return VK_NULL_HANDLE; + } + + return render_pass; +} + struct zink_render_pass * zink_create_render_pass(struct zink_screen *screen, - struct zink_render_pass_state *state) + struct zink_render_pass_state *state, + struct zink_render_pass_pipeline_state *pstate) { struct zink_render_pass *rp = CALLOC_STRUCT(zink_render_pass); if (!rp) goto fail; - rp->render_pass = create_render_pass(screen->dev, state); + rp->render_pass = screen->vk_version >= VK_MAKE_VERSION(1,2,0) ? + create_render_pass2(screen, state, pstate) : create_render_pass(screen, state, pstate); if (!rp->render_pass) goto fail; memcpy(&rp->state, state, sizeof(struct zink_render_pass_state)); @@ -140,7 +305,7 @@ void zink_destroy_render_pass(struct zink_screen *screen, struct zink_render_pass *rp) { - vkDestroyRenderPass(screen->dev, rp->render_pass, NULL); + VKSCR(DestroyRenderPass)(screen->dev, rp->render_pass, NULL); FREE(rp); } @@ -156,7 +321,7 @@ zink_render_pass_attachment_get_barrier_info(const struct zink_render_pass *rp, *access |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; if (!rt->clear_color && (!rp->state.swapchain_init || !rt->swapchain)) *access |= VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; - return VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + return rt->fbfetch ? 
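/* Depth/stencil resolves have no slot in VkSubpassDescription2 itself; they ride
 * in a VkSubpassDescriptionDepthStencilResolve chained through pNext, exactly as
 * above. VK_RESOLVE_MODE_SAMPLE_ZERO_BIT is used because it is the one mode the
 * spec requires every implementation to support. The essential chain:
 */
static const VkAttachmentReference2 zs_resolve_ref_sketch = {
   .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
   .attachment = 2,   /* wherever the single-sample depth target was appended */
   .layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
};
static const VkSubpassDescriptionDepthStencilResolve zs_resolve_sketch = {
   .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE,
   .depthResolveMode = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT,
   .stencilResolveMode = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT,
   .pDepthStencilResolveAttachment = &zs_resolve_ref_sketch,
};
static const VkSubpassDescription2 subpass_sketch = {
   .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
   .pNext = &zs_resolve_sketch,  /* renderpass2-only; hence the vk_version >= 1.2 gate */
   .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
};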
VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; } assert(rp->state.have_zsbuf); diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.h b/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.h index 8b4441e5fe..2e02f1566d 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_render_pass.h @@ -35,30 +35,54 @@ struct zink_rt_attrib { VkFormat format; VkSampleCountFlagBits samples; bool clear_color; - bool clear_stencil; + union { + bool clear_stencil; + bool fbfetch; + }; union { bool swapchain; bool needs_write; }; + bool resolve; }; struct zink_render_pass_state { - uint8_t num_cbufs : 4; /* PIPE_MAX_COLOR_BUFS = 8 */ + uint8_t num_cbufs : 5; /* PIPE_MAX_COLOR_BUFS = 8 */ uint8_t have_zsbuf : 1; - bool swapchain_init; + uint8_t samples:1; //for fs samplemask + uint8_t swapchain_init:1; + uint32_t num_zsresolves : 1; + uint32_t num_cresolves : 23; /* PIPE_MAX_COLOR_BUFS, but this is a struct hole */ struct zink_rt_attrib rts[PIPE_MAX_COLOR_BUFS + 1]; unsigned num_rts; uint32_t clears; //for extra verification and update flagging + uint32_t msaa_expand_mask; +}; + +struct zink_pipeline_rt { + VkFormat format; + VkSampleCountFlagBits samples; +}; + +struct zink_render_pass_pipeline_state { + uint32_t num_attachments:26; + uint32_t num_cresolves:4; + uint32_t num_zsresolves:1; + bool samples:1; //for fs samplemask + struct zink_pipeline_rt attachments[PIPE_MAX_COLOR_BUFS + 1]; + unsigned id; }; struct zink_render_pass { VkRenderPass render_pass; struct zink_render_pass_state state; + unsigned pipeline_state; }; struct zink_render_pass * zink_create_render_pass(struct zink_screen *screen, - struct zink_render_pass_state *state); + struct zink_render_pass_state *state, + struct zink_render_pass_pipeline_state *pstate); void zink_destroy_render_pass(struct zink_screen *screen, diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_resource.c b/mesa 3D driver/src/gallium/drivers/zink/zink_resource.c index 13dc331069..64516a2900 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_resource.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_resource.c @@ -42,7 +42,7 @@ #include "util/u_inlines.h" #include "util/u_memory.h" #include "util/u_upload_mgr.h" - +#include "util/os_file.h" #include "frontend/sw_winsys.h" #ifndef _WIN32 @@ -50,6 +50,7 @@ #endif #ifdef ZINK_USE_DMABUF +#include #include "drm-uapi/drm_fourcc.h" #else /* these won't actually be used */ @@ -57,6 +58,19 @@ #define DRM_FORMAT_MOD_LINEAR 0 #endif + +static bool +equals_ivci(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(VkImageViewCreateInfo)) == 0; +} + +static bool +equals_bvci(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(VkBufferViewCreateInfo)) == 0; +} + static void zink_transfer_flush_region(struct pipe_context *pctx, struct pipe_transfer *ptrans, @@ -73,10 +87,10 @@ zink_destroy_resource_object(struct zink_screen *screen, struct zink_resource_ob { if (obj->is_buffer) { util_dynarray_foreach(&obj->tmp, VkBuffer, buffer) - vkDestroyBuffer(screen->dev, *buffer, NULL); - vkDestroyBuffer(screen->dev, obj->buffer, NULL); + VKSCR(DestroyBuffer)(screen->dev, *buffer, NULL); + VKSCR(DestroyBuffer)(screen->dev, obj->buffer, NULL); } else { - vkDestroyImage(screen->dev, obj->image, NULL); + VKSCR(DestroyImage)(screen->dev, obj->image, NULL); } util_dynarray_fini(&obj->tmp); @@ -94,12 +108,20 @@ zink_resource_destroy(struct pipe_screen *pscreen, if (pres->target == PIPE_BUFFER) { 
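/* The new per-resource view caches key on entire Vulkan create-info structs,
 * compared with raw memcmp (equals_ivci()/equals_bvci() above). memcmp keying
 * only works when every key is built in fully zeroed storage, because struct
 * padding participates in the comparison, so any creator must start from memset:
 */
static void
init_view_key_sketch(VkImageViewCreateInfo *ivci)
{
   memset(ivci, 0, sizeof(*ivci));   /* padding bytes are part of the memcmp key */
   ivci->sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
   /* ...fill the remaining fields, then hash the struct (e.g. with XXH32) and use
    * the pre-hashed hash-table entry points to search/insert the cache... */
}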
util_range_destroy(&res->valid_buffer_range); util_idalloc_mt_free(&screen->buffer_ids, res->base.buffer_id_unique); + assert(!_mesa_hash_table_num_entries(&res->bufferview_cache)); + simple_mtx_destroy(&res->bufferview_mtx); + ralloc_free(res->bufferview_cache.table); + } else { + assert(!_mesa_hash_table_num_entries(&res->surface_cache)); + simple_mtx_destroy(&res->surface_mtx); + ralloc_free(res->surface_cache.table); } + /* no need to do anything for the caches, these objects own the resource lifetimes */ zink_resource_object_reference(screen, &res->obj, NULL); zink_resource_object_reference(screen, &res->scanout_obj, NULL); threaded_resource_deinit(pres); - FREE(res); + FREE_CL(res); } static VkImageAspectFlags @@ -125,6 +147,7 @@ create_bci(struct zink_screen *screen, const struct pipe_resource *templ, unsign bci.pNext = NULL; bci.sharingMode = VK_SHARING_MODE_EXCLUSIVE; bci.queueFamilyIndexCount = 0; + bci.pQueueFamilyIndices = NULL; bci.size = templ->width0; bci.flags = 0; assert(bci.size > 0); @@ -155,8 +178,8 @@ check_ici(struct zink_screen *screen, VkImageCreateInfo *ici, uint64_t modifier) VkImageFormatProperties image_props; VkResult ret; assert(modifier == DRM_FORMAT_MOD_INVALID || - (screen->vk.GetPhysicalDeviceImageFormatProperties2 && screen->info.have_EXT_image_drm_format_modifier)); - if (screen->vk.GetPhysicalDeviceImageFormatProperties2) { + (VKSCR(GetPhysicalDeviceImageFormatProperties2) && screen->info.have_EXT_image_drm_format_modifier)); + if (VKSCR(GetPhysicalDeviceImageFormatProperties2)) { VkImageFormatProperties2 props2; props2.sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2; props2.pNext = NULL; @@ -179,10 +202,10 @@ check_ici(struct zink_screen *screen, VkImageCreateInfo *ici, uint64_t modifier) } else info.pNext = NULL; - ret = screen->vk.GetPhysicalDeviceImageFormatProperties2(screen->pdev, &info, &props2); + ret = VKSCR(GetPhysicalDeviceImageFormatProperties2)(screen->pdev, &info, &props2); image_props = props2.imageFormatProperties; } else - ret = vkGetPhysicalDeviceImageFormatProperties(screen->pdev, ici->format, ici->imageType, + ret = VKSCR(GetPhysicalDeviceImageFormatProperties)(screen->pdev, ici->format, ici->imageType, ici->tiling, ici->usage, ici->flags, &image_props); return ret == VK_SUCCESS; } @@ -191,24 +214,30 @@ static VkImageUsageFlags get_image_usage_for_feats(struct zink_screen *screen, VkFormatFeatureFlags feats, const struct pipe_resource *templ, unsigned bind) { VkImageUsageFlags usage = 0; - /* sadly, gallium doesn't let us know if it'll ever need this, so we have to assume */ - if (feats & VK_FORMAT_FEATURE_TRANSFER_SRC_BIT) - usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; - if (feats & VK_FORMAT_FEATURE_TRANSFER_DST_BIT) - usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; - if (feats & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT && (bind & (PIPE_BIND_LINEAR | PIPE_BIND_SHARED)) != (PIPE_BIND_LINEAR | PIPE_BIND_SHARED)) - usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (bind & ZINK_BIND_TRANSIENT) + usage |= VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT; + else { + /* sadly, gallium doesn't let us know if it'll ever need this, so we have to assume */ + if (feats & VK_FORMAT_FEATURE_TRANSFER_SRC_BIT) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (feats & VK_FORMAT_FEATURE_TRANSFER_DST_BIT) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + if (feats & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT && (bind & (PIPE_BIND_LINEAR | PIPE_BIND_SHARED)) != (PIPE_BIND_LINEAR | PIPE_BIND_SHARED)) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; - if ((templ->nr_samples <= 1 || 
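/* ZINK_BIND_TRANSIENT maps gallium transient attachments onto the Vulkan pairing
 * of transient-attachment usage with lazily allocated memory (the matching
 * VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT selection appears further below in
 * resource_object_create()); on tiling GPUs such images can stay entirely in
 * on-chip memory. The two halves belong together:
 */
static void
transient_image_sketch(VkImageCreateInfo *ici, VkMemoryPropertyFlags *mem_flags)
{
   /* transient usage excludes transfer/sampled bits, hence the early-out above */
   ici->usage = VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT |
                VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
   *mem_flags = VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT;
}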
screen->info.feats.features.shaderStorageImageMultisample) && - (bind & PIPE_BIND_SHADER_IMAGE)) { - if (feats & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT) - usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if ((templ->nr_samples <= 1 || screen->info.feats.features.shaderStorageImageMultisample) && + (bind & PIPE_BIND_SHADER_IMAGE)) { + if (feats & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + } } if (bind & PIPE_BIND_RENDER_TARGET) { - if (feats & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT) + if (feats & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT) { usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - else + if ((bind & (PIPE_BIND_LINEAR | PIPE_BIND_SHARED)) != (PIPE_BIND_LINEAR | PIPE_BIND_SHARED)) + usage |= VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT; + } else return 0; } @@ -230,6 +259,7 @@ get_image_usage_for_feats(struct zink_screen *screen, VkFormatFeatureFlags feats if (bind & PIPE_BIND_STREAM_OUTPUT) usage |= VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT; + return usage; } @@ -297,12 +327,13 @@ get_image_usage(struct zink_screen *screen, VkImageCreateInfo *ici, const struct } static uint64_t -create_ici(struct zink_screen *screen, VkImageCreateInfo *ici, const struct pipe_resource *templ, unsigned bind, unsigned modifiers_count, const uint64_t *modifiers, bool *success) +create_ici(struct zink_screen *screen, VkImageCreateInfo *ici, const struct pipe_resource *templ, bool dmabuf, unsigned bind, unsigned modifiers_count, const uint64_t *modifiers, bool *success) { ici->sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; ici->pNext = NULL; - ici->flags = bind & (PIPE_BIND_SCANOUT | PIPE_BIND_DEPTH_STENCIL) ? 0 : VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + ici->flags = modifiers_count || dmabuf || bind & (PIPE_BIND_SCANOUT | PIPE_BIND_DEPTH_STENCIL) ? 0 : VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; ici->usage = 0; + ici->queueFamilyIndexCount = 0; switch (templ->target) { case PIPE_TEXTURE_1D: @@ -347,7 +378,7 @@ create_ici(struct zink_screen *screen, VkImageCreateInfo *ici, const struct pipe ici->samples = templ->nr_samples ? templ->nr_samples : VK_SAMPLE_COUNT_1_BIT; ici->tiling = modifiers_count ? VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT : bind & PIPE_BIND_LINEAR ? VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL; ici->sharingMode = VK_SHARING_MODE_EXCLUSIVE; - ici->initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + ici->initialLayout = dmabuf ? 
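/* Imported dmabufs become external-memory images: VkExternalMemoryImageCreateInfo
 * declares the handle type at creation, and initialLayout switches to
 * VK_IMAGE_LAYOUT_PREINITIALIZED because, unlike UNDEFINED, it preserves the
 * existing contents of the imported buffer. The chaining step in isolation:
 */
static void
chain_external_image_sketch(VkImageCreateInfo *ici,
                            VkExternalMemoryImageCreateInfo *emici)
{
   emici->sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
   emici->pNext = ici->pNext;
   emici->handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT;
   ici->pNext = emici;
   ici->initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;   /* keep imported contents */
}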
VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED; if (templ->target == PIPE_TEXTURE_CUBE || templ->target == PIPE_TEXTURE_CUBE_ARRAY || @@ -422,9 +453,21 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t VkMemoryRequirements reqs; VkMemoryPropertyFlags flags; bool need_dedicated = false; + VkExternalMemoryHandleTypeFlags export_types = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + + VkExternalMemoryHandleTypeFlags external = 0; + if (whandle) { + if (whandle->type == WINSYS_HANDLE_TYPE_FD) + external = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; + else + unreachable("unknown handle type"); + } + /* TODO: remove linear for wsi */ - bool scanout = (templ->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_LINEAR)) == (PIPE_BIND_SCANOUT | PIPE_BIND_LINEAR); - bool shared = (templ->bind & (PIPE_BIND_SHARED | PIPE_BIND_LINEAR)) == (PIPE_BIND_SHARED | PIPE_BIND_LINEAR); + bool scanout = templ->bind & PIPE_BIND_SCANOUT; + bool shared = templ->bind & PIPE_BIND_SHARED; + if (shared && screen->info.have_EXT_external_memory_dma_buf) + export_types |= VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; pipe_reference_init(&obj->reference, 1); util_dynarray_init(&obj->tmp, NULL); @@ -432,12 +475,12 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t if (templ->target == PIPE_BUFFER) { VkBufferCreateInfo bci = create_bci(screen, templ, templ->bind); - if (vkCreateBuffer(screen->dev, &bci, NULL, &obj->buffer) != VK_SUCCESS) { + if (VKSCR(CreateBuffer)(screen->dev, &bci, NULL, &obj->buffer) != VK_SUCCESS) { debug_printf("vkCreateBuffer failed\n"); goto fail1; } - vkGetBufferMemoryRequirements(screen->dev, obj->buffer, &reqs); + VKSCR(GetBufferMemoryRequirements)(screen->dev, obj->buffer, &reqs); if (templ->usage == PIPE_USAGE_STAGING) flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; else if (templ->usage == PIPE_USAGE_STREAM) @@ -454,17 +497,17 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t unsigned ici_modifier_count = winsys_modifier ? 
1 : modifiers_count; bool success = false; VkImageCreateInfo ici; - uint64_t mod = create_ici(screen, &ici, templ, templ->bind, ici_modifier_count, ici_modifiers, &success); + uint64_t mod = create_ici(screen, &ici, templ, !!external, templ->bind, ici_modifier_count, ici_modifiers, &success); VkExternalMemoryImageCreateInfo emici; VkImageDrmFormatModifierExplicitCreateInfoEXT idfmeci; VkImageDrmFormatModifierListCreateInfoEXT idfmlci; if (!success) goto fail1; - if (shared) { + if (shared || external) { emici.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO; emici.pNext = NULL; - emici.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + emici.handleTypes = export_types; ici.pNext = &emici; assert(ici.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT || mod != DRM_FORMAT_MOD_INVALID); @@ -493,12 +536,13 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t } else if (ici.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { idfmlci.sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT; idfmlci.pNext = ici.pNext; - idfmlci.drmFormatModifierCount = 1; - idfmlci.pDrmFormatModifiers = &mod; + idfmlci.drmFormatModifierCount = modifiers_count; + idfmlci.pDrmFormatModifiers = modifiers; ici.pNext = &idfmlci; } else if (ici.tiling == VK_IMAGE_TILING_OPTIMAL) { // TODO: remove for wsi - ici.pNext = NULL; + if (!external) + ici.pNext = NULL; scanout = false; shared = false; } @@ -519,18 +563,19 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t .scanout = true, }; - if ((screen->needs_mesa_wsi || screen->needs_mesa_flush_wsi) && scanout) { + if ((screen->needs_mesa_wsi || screen->needs_mesa_flush_wsi) && scanout && + ici.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) { image_wsi_info.pNext = ici.pNext; ici.pNext = &image_wsi_info; } - VkResult result = vkCreateImage(screen->dev, &ici, NULL, &obj->image); + VkResult result = VKSCR(CreateImage)(screen->dev, &ici, NULL, &obj->image); if (result != VK_SUCCESS) { debug_printf("vkCreateImage failed\n"); goto fail1; } - if (screen->vk.GetImageMemoryRequirements2) { + if (VKSCR(GetImageMemoryRequirements2)) { VkMemoryRequirements2 req2; req2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2; VkImageMemoryRequirementsInfo2 info2; @@ -541,16 +586,19 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t ded.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS; ded.pNext = NULL; req2.pNext = &ded; - screen->vk.GetImageMemoryRequirements2(screen->dev, &info2, &req2); + VKSCR(GetImageMemoryRequirements2)(screen->dev, &info2, &req2); memcpy(&reqs, &req2.memoryRequirements, sizeof(VkMemoryRequirements)); need_dedicated = ded.prefersDedicatedAllocation || ded.requiresDedicatedAllocation; } else { - vkGetImageMemoryRequirements(screen->dev, obj->image, &reqs); + VKSCR(GetImageMemoryRequirements)(screen->dev, obj->image, &reqs); } if (templ->usage == PIPE_USAGE_STAGING && ici.tiling == VK_IMAGE_TILING_LINEAR) flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; else flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + + obj->vkflags = ici.flags; + obj->vkusage = ici.usage; } obj->alignment = reqs.alignment; @@ -560,6 +608,9 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t templ->usage == PIPE_USAGE_STAGING) flags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + if (templ->bind & ZINK_BIND_TRANSIENT) + flags |= VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT; + VkMemoryAllocateInfo mai; enum zink_alloc_flag aflags = templ->flags & 
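/* note on the ZINK_BIND_TRANSIENT handling just above: VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT is Vulkan's backing for transient attachments; on tiling GPUs such an allocation may never be committed at all, since the data can live entirely in on-chip tile memory for the duration of the render pass. */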
PIPE_RESOURCE_FLAG_SPARSE ? ZINK_ALLOC_SPARSE : 0; mai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; @@ -574,7 +625,7 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t heap = ZINK_HEAP_DEVICE_LOCAL; break; case ZINK_HEAP_HOST_VISIBLE_CACHED: - heap = ZINK_HEAP_HOST_VISIBLE_ANY; + heap = ZINK_HEAP_HOST_VISIBLE_COHERENT; break; default: break; @@ -603,7 +654,7 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t VkExportMemoryAllocateInfo emai; if (templ->bind & PIPE_BIND_SHARED && shared) { emai.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO; - emai.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + emai.handleTypes = export_types; emai.pNext = mai.pNext; mai.pNext = &emai; @@ -614,13 +665,17 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t NULL, }; - if (whandle && whandle->type == WINSYS_HANDLE_TYPE_FD) { + if (whandle) { imfi.pNext = NULL; - imfi.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; - imfi.fd = whandle->handle; + imfi.handleType = external; + imfi.fd = os_dupfd_cloexec(whandle->handle); + if (imfi.fd < 0) { + mesa_loge("ZINK: failed to dup dmabuf fd: %s\n", strerror(errno)); + goto fail1; + } imfi.pNext = mai.pNext; - emai.pNext = &imfi; + mai.pNext = &imfi; } struct wsi_memory_allocate_info memory_wsi_info = { @@ -651,10 +706,10 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t if (templ->target == PIPE_BUFFER) { if (!(templ->flags & PIPE_RESOURCE_FLAG_SPARSE)) - if (vkBindBufferMemory(screen->dev, obj->buffer, zink_bo_get_mem(obj->bo), obj->offset) != VK_SUCCESS) + if (VKSCR(BindBufferMemory)(screen->dev, obj->buffer, zink_bo_get_mem(obj->bo), obj->offset) != VK_SUCCESS) goto fail3; } else { - if (vkBindImageMemory(screen->dev, obj->image, zink_bo_get_mem(obj->bo), obj->offset) != VK_SUCCESS) + if (VKSCR(BindImageMemory)(screen->dev, obj->image, zink_bo_get_mem(obj->bo), obj->offset) != VK_SUCCESS) goto fail3; } return obj; @@ -664,9 +719,9 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t fail2: if (templ->target == PIPE_BUFFER) - vkDestroyBuffer(screen->dev, obj->buffer, NULL); + VKSCR(DestroyBuffer)(screen->dev, obj->buffer, NULL); else - vkDestroyImage(screen->dev, obj->image, NULL); + VKSCR(DestroyImage)(screen->dev, obj->image, NULL); fail1: FREE(obj); return NULL; @@ -680,45 +735,71 @@ resource_create(struct pipe_screen *pscreen, const uint64_t *modifiers, int modifiers_count) { struct zink_screen *screen = zink_screen(pscreen); - struct zink_resource *res = CALLOC_STRUCT(zink_resource); + struct zink_resource *res = CALLOC_STRUCT_CL(zink_resource); if (modifiers_count > 0) { /* for rebinds */ res->modifiers_count = modifiers_count; res->modifiers = mem_dup(modifiers, modifiers_count * sizeof(uint64_t)); if (!res->modifiers) { - FREE(res); + FREE_CL(res); return NULL; } + /* TODO: remove this when multi-plane modifiers are supported */ + const struct zink_modifier_prop *prop = &screen->modifier_props[templ->format]; + for (unsigned i = 0; i < modifiers_count; i++) { + for (unsigned j = 0; j < prop->drmFormatModifierCount; j++) { + if (prop->pDrmFormatModifierProperties[j].drmFormatModifier == modifiers[i]) { + if (prop->pDrmFormatModifierProperties[j].drmFormatModifierPlaneCount != 1) + res->modifiers[i] = DRM_FORMAT_MOD_INVALID; + break; + } + } + } } res->base.b = *templ; - threaded_resource_init(&res->base.b); + threaded_resource_init(&res->base.b, false, 0); 
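/* note on the modifier loop above: the caller's array length is preserved so the index mapping stays intact, but any modifier whose plane count is not exactly 1 (e.g. a two-plane CCS-style modifier) is neutralized to DRM_FORMAT_MOD_INVALID, matching the TODO about missing multi-plane support. */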
pipe_reference_init(&res->base.b.reference, 1); res->base.b.screen = pscreen; bool optimal_tiling = false; - res->obj = resource_object_create(screen, templ, whandle, &optimal_tiling, modifiers, 0); + struct pipe_resource templ2 = *templ; + unsigned scanout_flags = templ->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_SHARED); + if (!(templ->bind & PIPE_BIND_LINEAR)) + templ2.bind &= ~scanout_flags; + res->obj = resource_object_create(screen, &templ2, whandle, &optimal_tiling, NULL, 0); if (!res->obj) { free(res->modifiers); - FREE(res); + FREE_CL(res); return NULL; } res->internal_format = templ->format; if (templ->target == PIPE_BUFFER) { util_range_init(&res->valid_buffer_range); + if (!screen->resizable_bar && templ->width0 >= 8196) { + /* We don't want to evict buffers from VRAM by mapping them for CPU access, + * because they might never be moved back again. If a buffer is large enough, + * upload data by copying from a temporary GTT buffer. 8K might not seem much, + * but there can be 100000 buffers. + * + * This tweak improves performance for viewperf. + */ + res->base.b.flags |= PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY; + } } else { res->format = zink_get_format(screen, templ->format); - res->layout = VK_IMAGE_LAYOUT_UNDEFINED; + res->dmabuf_acquire = whandle && whandle->type == WINSYS_HANDLE_TYPE_FD; + res->layout = res->dmabuf_acquire ? VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED; res->optimal_tiling = optimal_tiling; res->aspect = aspect_from_format(templ->format); - if (res->base.b.bind & (PIPE_BIND_SCANOUT | PIPE_BIND_SHARED) && optimal_tiling) { + if (scanout_flags && optimal_tiling) { // TODO: remove for wsi - struct pipe_resource templ2 = res->base.b; - templ2.bind = (res->base.b.bind & (PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) | PIPE_BIND_LINEAR; - res->scanout_obj = resource_object_create(screen, &templ2, whandle, &optimal_tiling, modifiers, modifiers_count); + templ2 = res->base.b; + templ2.bind = scanout_flags | PIPE_BIND_LINEAR; + res->scanout_obj = resource_object_create(screen, &templ2, whandle, &optimal_tiling, res->modifiers, res->modifiers_count); assert(!optimal_tiling); } } @@ -733,9 +814,14 @@ resource_create(struct pipe_screen *pscreen, 64, NULL, &res->dt_stride); } - if (res->obj->is_buffer) + if (res->obj->is_buffer) { res->base.buffer_id_unique = util_idalloc_mt_alloc(&screen->buffer_ids); - + _mesa_hash_table_init(&res->bufferview_cache, NULL, NULL, equals_bvci); + simple_mtx_init(&res->bufferview_mtx, mtx_plain); + } else { + _mesa_hash_table_init(&res->surface_cache, NULL, NULL, equals_ivci); + simple_mtx_init(&res->surface_mtx, mtx_plain); + } return &res->base.b; } @@ -781,7 +867,7 @@ zink_resource_get_param(struct pipe_screen *pscreen, struct pipe_context *pctx, sub_res.aspectMask = aspect; - vkGetImageSubresourceLayout(screen->dev, obj->image, &sub_res, &sub_res_layout); + VKSCR(GetImageSubresourceLayout)(screen->dev, obj->image, &sub_res, &sub_res_layout); *value = sub_res_layout.rowPitch; break; @@ -794,7 +880,7 @@ zink_resource_get_param(struct pipe_screen *pscreen, struct pipe_context *pctx, layer }; VkSubresourceLayout srl; - vkGetImageSubresourceLayout(screen->dev, obj->image, &isr, &srl); + VKSCR(GetImageSubresourceLayout)(screen->dev, obj->image, &isr, &srl); *value = srl.offset; break; } @@ -803,10 +889,12 @@ zink_resource_get_param(struct pipe_screen *pscreen, struct pipe_context *pctx, *value = DRM_FORMAT_MOD_INVALID; if (!screen->info.have_EXT_image_drm_format_modifier) return false; + if (!res->modifiers) + return false; 
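/* vkGetImageDrmFormatModifierPropertiesEXT reports the one modifier the driver actually selected from the list passed at image creation, so the DRM_FORMAT_MOD_INVALID fallback above is only overwritten when the query succeeds. */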
VkImageDrmFormatModifierPropertiesEXT prop; prop.sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_PROPERTIES_EXT; prop.pNext = NULL; - if (screen->vk.GetImageDrmFormatModifierPropertiesEXT(screen->dev, obj->image, &prop) == VK_SUCCESS) + if (VKSCR(GetImageDrmFormatModifierPropertiesEXT)(screen->dev, obj->image, &prop) == VK_SUCCESS) *value = prop.drmFormatModifier; break; } @@ -818,7 +906,7 @@ zink_resource_get_param(struct pipe_screen *pscreen, struct pipe_context *pctx, layer }; VkSubresourceLayout srl; - vkGetImageSubresourceLayout(screen->dev, obj->image, &isr, &srl); + VKSCR(GetImageSubresourceLayout)(screen->dev, obj->image, &isr, &srl); if (res->base.b.target == PIPE_TEXTURE_3D) *value = srl.depthPitch; else @@ -854,7 +942,7 @@ zink_resource_get_handle(struct pipe_screen *pscreen, struct winsys_handle *whandle, unsigned usage) { - if (whandle->type == WINSYS_HANDLE_TYPE_FD) { + if (whandle->type == WINSYS_HANDLE_TYPE_FD || whandle->type == WINSYS_HANDLE_TYPE_KMS) { #ifdef ZINK_USE_DMABUF struct zink_resource *res = zink_resource(tex); struct zink_screen *screen = zink_screen(pscreen); @@ -866,10 +954,21 @@ zink_resource_get_handle(struct pipe_screen *pscreen, fd_info.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; //TODO: remove for wsi fd_info.memory = zink_bo_get_mem(obj->bo); - fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; - VkResult result = (*screen->vk.GetMemoryFdKHR)(screen->dev, &fd_info, &fd); + if (whandle->type == WINSYS_HANDLE_TYPE_FD) + fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT; + else + fd_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; + VkResult result = VKSCR(GetMemoryFdKHR)(screen->dev, &fd_info, &fd); if (result != VK_SUCCESS) return false; + if (whandle->type == WINSYS_HANDLE_TYPE_KMS) { + uint32_t h; + bool success = drmPrimeFDToHandle(screen->drm_fd, fd, &h) == 0; + close(fd); + if (!success) + return false; + fd = h; + } whandle->handle = fd; uint64_t value; zink_resource_get_param(pscreen, context, tex, 0, 0, 0, PIPE_RESOURCE_PARAM_MODIFIER, 0, &value); @@ -926,10 +1025,10 @@ invalidate_buffer(struct zink_context *ctx, struct zink_resource *res) if (res->valid_buffer_range.start > res->valid_buffer_range.end) return false; - if (res->bind_history & ZINK_RESOURCE_USAGE_STREAMOUT) + if (res->so_valid) ctx->dirty_so_targets = true; /* force counter buffer reset */ - res->bind_history &= ~ZINK_RESOURCE_USAGE_STREAMOUT; + res->so_valid = false; util_range_set_empty(&res->valid_buffer_range); if (!zink_resource_has_usage(res)) @@ -944,9 +1043,6 @@ invalidate_buffer(struct zink_context *ctx, struct zink_resource *res) /* this ref must be transferred before rebind or else BOOM */ zink_batch_reference_resource_move(&ctx->batch, res); res->obj = new_obj; - res->access_stage = 0; - res->access = 0; - res->unordered_barrier = false; zink_resource_rebind(ctx, res); zink_descriptor_set_refs_clear(&old_obj->desc_set_refs, old_obj); return true; @@ -977,7 +1073,7 @@ zink_transfer_copy_bufimage(struct zink_context *ctx, box.x = trans->offset; if (dst->obj->transfer_dst) - zink_copy_image_buffer(ctx, NULL, dst, src, trans->base.b.level, buf2img ? x : 0, + zink_copy_image_buffer(ctx, dst, src, trans->base.b.level, buf2img ? 
x : 0, box.y, box.z, trans->base.b.level, &box, trans->base.b.usage); else util_blitter_copy_texture(ctx->blitter, &dst->base.b, trans->base.b.level, @@ -1097,6 +1193,18 @@ zink_buffer_map(struct pipe_context *pctx, usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; } + /* If a buffer in VRAM is too large and the range is discarded, don't + * map it directly. This makes sure that the buffer stays in VRAM. + */ + bool force_discard_range = false; + if (usage & (PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_DISCARD_RANGE) && + !(usage & PIPE_MAP_PERSISTENT) && + res->base.b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY) { + usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_UNSYNCHRONIZED); + usage |= PIPE_MAP_DISCARD_RANGE; + force_discard_range = true; + } + if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE && !(usage & (PIPE_MAP_UNSYNCHRONIZED | TC_TRANSFER_MAP_NO_INVALIDATE))) { assert(usage & PIPE_MAP_WRITE); @@ -1117,7 +1225,7 @@ zink_buffer_map(struct pipe_context *pctx, /* Check if mapping this buffer would cause waiting for the GPU. */ - if (!res->obj->host_visible || + if (!res->obj->host_visible || force_discard_range || !zink_resource_usage_check_completion(screen, res, ZINK_RESOURCE_ACCESS_RW)) { /* Do a wait-free write-only transfer using a temporary buffer. */ unsigned offset; @@ -1158,7 +1266,7 @@ zink_buffer_map(struct pipe_context *pctx, if (!trans->staging_res) goto fail; struct zink_resource *staging_res = zink_resource(trans->staging_res); - zink_copy_buffer(ctx, NULL, staging_res, res, trans->offset, box->x, box->width); + zink_copy_buffer(ctx, staging_res, res, trans->offset, box->x, box->width); res = staging_res; usage &= ~PIPE_MAP_UNSYNCHRONIZED; ptr = map_resource(screen, res); @@ -1171,16 +1279,16 @@ zink_buffer_map(struct pipe_context *pctx, zink_resource_usage_wait(ctx, res, ZINK_RESOURCE_ACCESS_RW); else zink_resource_usage_wait(ctx, res, ZINK_RESOURCE_ACCESS_WRITE); - res->access = 0; - res->access_stage = 0; + res->obj->access = 0; + res->obj->access_stage = 0; } if (!ptr) { /* if writing to a streamout buffer, ensure synchronization next time it's used */ - if (usage & PIPE_MAP_WRITE && res->bind_history & ZINK_RESOURCE_USAGE_STREAMOUT) { + if (usage & PIPE_MAP_WRITE && res->so_valid) { ctx->dirty_so_targets = true; /* force counter buffer reset */ - res->bind_history &= ~ZINK_RESOURCE_USAGE_STREAMOUT; + res->so_valid = false; } ptr = map_resource(screen, res); if (!ptr) @@ -1201,7 +1309,7 @@ zink_buffer_map(struct pipe_context *pctx, VkDeviceSize size = box->width; VkDeviceSize offset = res->obj->offset + trans->offset; VkMappedMemoryRange range = zink_resource_init_mem_range(screen, res->obj, offset, size); - if (vkInvalidateMappedMemoryRanges(screen->dev, 1, &range) != VK_SUCCESS) { + if (VKSCR(InvalidateMappedMemoryRanges)(screen->dev, 1, &range) != VK_SUCCESS) { zink_bo_unmap(screen, res->obj->bo); goto fail; } @@ -1300,7 +1408,7 @@ zink_image_map(struct pipe_context *pctx, 0 }; VkSubresourceLayout srl; - vkGetImageSubresourceLayout(screen->dev, res->obj->image, &isr, &srl); + VKSCR(GetImageSubresourceLayout)(screen->dev, res->obj->image, &isr, &srl); trans->base.b.stride = srl.rowPitch; if (res->base.b.target == PIPE_TEXTURE_3D) trans->base.b.layer_stride = srl.depthPitch; @@ -1314,9 +1422,9 @@ zink_image_map(struct pipe_context *pctx, (box->y / desc->block.height) * srl.rowPitch + (box->x / desc->block.width) * (desc->block.bits / 8); if (!res->obj->coherent) { - VkDeviceSize size = box->width * box->height * desc->block.bits / 8; + VkDeviceSize size = 
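/* the cast below is the point of this change: width * height * bits/8 would otherwise be evaluated in 32-bit int arithmetic and can overflow for large linear mappings; promoting the first operand forces the whole product to 64 bits. */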
(VkDeviceSize)box->width * box->height * desc->block.bits / 8; VkMappedMemoryRange range = zink_resource_init_mem_range(screen, res->obj, res->obj->offset + offset, size); - vkFlushMappedMemoryRanges(screen->dev, 1, &range); + VKSCR(FlushMappedMemoryRanges)(screen->dev, 1, &range); } ptr = ((uint8_t *)ptr) + offset; } @@ -1354,7 +1462,7 @@ zink_transfer_flush_region(struct pipe_context *pctx, size = box->width; offset = trans->offset; } else { - size = box->width * box->height * util_format_get_blocksize(m->base.b.format); + size = (VkDeviceSize)box->width * box->height * util_format_get_blocksize(m->base.b.format); offset = trans->offset + box->z * trans->depthPitch + util_format_get_2d_size(m->base.b.format, trans->base.b.stride, box->y) + @@ -1363,13 +1471,13 @@ zink_transfer_flush_region(struct pipe_context *pctx, } if (!m->obj->coherent) { VkMappedMemoryRange range = zink_resource_init_mem_range(screen, m->obj, m->obj->offset, m->obj->size); - vkFlushMappedMemoryRanges(screen->dev, 1, &range); + VKSCR(FlushMappedMemoryRanges)(screen->dev, 1, &range); } if (trans->staging_res) { struct zink_resource *staging_res = zink_resource(trans->staging_res); if (ptrans->resource->target == PIPE_BUFFER) - zink_copy_buffer(ctx, NULL, res, staging_res, box->x, offset, box->width); + zink_copy_buffer(ctx, res, staging_res, box->x, offset, box->width); else zink_transfer_copy_bufimage(ctx, res, staging_res, trans); } @@ -1475,9 +1583,9 @@ zink_resource_tmp_buffer(struct zink_screen *screen, struct zink_resource *res, bci.size = size; VkBuffer buffer; - if (vkCreateBuffer(screen->dev, &bci, NULL, &buffer) != VK_SUCCESS) + if (VKSCR(CreateBuffer)(screen->dev, &bci, NULL, &buffer) != VK_SUCCESS) return VK_NULL_HANDLE; - vkBindBufferMemory(screen->dev, buffer, zink_bo_get_mem(res->obj->bo), res->obj->offset + offset); + VKSCR(BindBufferMemory)(screen->dev, buffer, zink_bo_get_mem(res->obj->bo), res->obj->offset + offset); if (offset_out) *offset_out = offset_add - offset; return buffer; @@ -1502,7 +1610,7 @@ zink_resource_object_init_storage(struct zink_context *ctx, struct zink_resource res->base.b.bind |= PIPE_BIND_SHADER_IMAGE; } else { zink_fb_clears_apply_region(ctx, &res->base.b, (struct u_rect){0, res->base.b.width0, 0, res->base.b.height0}); - zink_resource_image_barrier(ctx, NULL, res, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 0, 0); + zink_resource_image_barrier(ctx, res, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 0, 0); res->base.b.bind |= PIPE_BIND_SHADER_IMAGE; struct zink_resource_object *old_obj = res->obj; struct zink_resource_object *new_obj = resource_object_create(screen, &res->base.b, NULL, &res->optimal_tiling, res->modifiers, res->modifiers_count); @@ -1556,17 +1664,17 @@ zink_resource_setup_transfer_layouts(struct zink_context *ctx, struct zink_resou * VK_IMAGE_LAYOUT_GENERAL. And since this isn't a present-related * operation, VK_IMAGE_LAYOUT_GENERAL seems most appropriate. 
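* (An image can only be in one layout at a time, so when src == dst the copy * cannot use TRANSFER_SRC_OPTIMAL and TRANSFER_DST_OPTIMAL simultaneously; * GENERAL is the one layout that is valid for both transfer reads and writes.)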
*/ - zink_resource_image_barrier(ctx, NULL, src, + zink_resource_image_barrier(ctx, src, VK_IMAGE_LAYOUT_GENERAL, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); } else { - zink_resource_image_barrier(ctx, NULL, src, + zink_resource_image_barrier(ctx, src, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_ACCESS_TRANSFER_READ_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); - zink_resource_image_barrier(ctx, NULL, dst, + zink_resource_image_barrier(ctx, dst, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_resource.h b/mesa 3D driver/src/gallium/drivers/zink/zink_resource.h index d83799e10e..8602c1b69c 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_resource.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_resource.h @@ -29,8 +29,8 @@ struct sw_displaytarget; struct zink_batch; struct zink_context; struct zink_bo; -#define ZINK_RESOURCE_USAGE_STREAMOUT (1 << 10) //much greater than ZINK_DESCRIPTOR_TYPES +#include "util/hash_table.h" #include "util/simple_mtx.h" #include "util/u_transfer.h" #include "util/u_range.h" @@ -43,6 +43,7 @@ struct zink_bo; #include <vulkan/vulkan.h> #define ZINK_MAP_TEMPORARY (PIPE_MAP_DRV_PRV << 0) +#define ZINK_BIND_TRANSIENT (1 << 30) //transient fb attachment struct mem_key { unsigned seen_count; @@ -55,12 +56,13 @@ struct mem_key { struct zink_resource_object { struct pipe_reference reference; + VkPipelineStageFlagBits access_stage; + VkAccessFlags access; + bool unordered_barrier; + unsigned persistent_maps; //if nonzero, requires vkFlushMappedMemoryRanges during batch use struct zink_descriptor_refs desc_set_refs; - struct zink_batch_usage *reads; - struct zink_batch_usage *writes; - struct util_dynarray tmp; union { @@ -78,6 +80,8 @@ struct zink_resource_object { struct zink_bo *bo; VkDeviceSize offset, size, alignment; + VkImageCreateFlags vkflags; + VkImageUsageFlags vkusage; bool host_visible; bool coherent; @@ -88,10 +92,6 @@ struct zink_resource { enum pipe_format internal_format:16; - VkPipelineStageFlagBits access_stage; - VkAccessFlags access; - bool unordered_barrier; - struct zink_resource_object *obj; struct zink_resource_object *scanout_obj; //TODO: remove for wsi bool scanout_obj_init; @@ -100,6 +100,8 @@ struct zink_resource { struct util_range valid_buffer_range; uint32_t vbo_bind_mask : PIPE_MAX_ATTRIBS; uint8_t ubo_bind_count[2]; + uint8_t so_bind_count; //not counted in all_binds + bool so_valid; uint32_t ubo_bind_mask[PIPE_SHADER_TYPES]; uint32_t ssbo_bind_mask[PIPE_SHADER_TYPES]; }; @@ -108,20 +110,33 @@ struct zink_resource { VkImageLayout layout; VkImageAspectFlags aspect; bool optimal_tiling; - uint8_t fb_binds; + uint8_t fb_binds; //not counted in all_binds }; }; uint32_t sampler_binds[PIPE_SHADER_TYPES]; uint16_t image_bind_count[2]; //gfx, compute uint16_t write_bind_count[2]; //gfx, compute - uint16_t bind_count[2]; //gfx, compute + uint16_t bindless[2]; //tex, img + union { + uint16_t bind_count[2]; //gfx, compute + uint32_t all_binds; + }; + union { + struct { + struct hash_table bufferview_cache; + simple_mtx_t bufferview_mtx; + }; + struct { + struct hash_table surface_cache; + simple_mtx_t surface_mtx; + }; + }; + + bool dmabuf_acquire; struct sw_displaytarget *dt; unsigned dt_stride; - uint32_t bind_history; // enum zink_descriptor_type bitmask - uint32_t bind_stages; - uint8_t modifiers_count; uint64_t *modifiers; }; @@ -179,6 +194,12 @@ zink_resource_tmp_buffer(struct zink_screen *screen,
struct zink_resource *res, bool zink_resource_object_init_storage(struct zink_context *ctx, struct zink_resource *res); +static inline bool +zink_resource_has_binds(const struct zink_resource *res) +{ + return res->all_binds > 0; +} + #ifndef __cplusplus #include "zink_bo.h" @@ -231,10 +252,10 @@ zink_resource_usage_set(struct zink_resource *res, struct zink_batch_state *bs, zink_bo_usage_set(res->obj->bo, bs, write); } -static inline void +static inline bool zink_resource_object_usage_unset(struct zink_resource_object *obj, struct zink_batch_state *bs) { - zink_bo_usage_unset(obj->bo, bs); + return zink_bo_usage_unset(obj->bo, bs); } #endif diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_screen.c b/mesa 3D driver/src/gallium/drivers/zink/zink_screen.c index 855e28271f..10207633c0 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_screen.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_screen.c @@ -40,6 +40,7 @@ #include "util/u_debug.h" #include "util/format/u_format.h" #include "util/hash_table.h" +#include "util/os_file.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_screen.h" @@ -51,6 +52,12 @@ #include "frontend/sw_winsys.h" +#if DETECT_OS_WINDOWS +#include <io.h> +#else +#include <unistd.h> +#endif + #if defined(__APPLE__) // Source of MVK_VERSION #include "MoltenVK/vk_mvk_moltenvk.h" @@ -75,6 +82,7 @@ static const struct debug_named_value zink_descriptor_options[] = { { "auto", ZINK_DESCRIPTOR_MODE_AUTO, "Automatically detect best mode" }, { "lazy", ZINK_DESCRIPTOR_MODE_LAZY, "Don't cache, do least amount of updates" }, + { "nofallback", ZINK_DESCRIPTOR_MODE_NOFALLBACK, "Cache, never use lazy fallback" }, { "notemplates", ZINK_DESCRIPTOR_MODE_NOTEMPLATES, "Cache, but disable templated updates" }, DEBUG_NAMED_VALUE_END }; @@ -105,18 +113,6 @@ zink_get_name(struct pipe_screen *pscreen) return buf; } -static bool -equals_ivci(const void *a, const void *b) -{ - return memcmp(a, b, sizeof(VkImageViewCreateInfo)) == 0; -} - -static bool -equals_bvci(const void *a, const void *b) -{ - return memcmp(a, b, sizeof(VkBufferViewCreateInfo)) == 0; -} - static uint32_t hash_framebuffer_state(const void *key) { @@ -165,14 +161,14 @@ cache_put_job(void *data, void *gdata, int thread_index) struct zink_program *pg = data; struct zink_screen *screen = gdata; size_t size = 0; - if (vkGetPipelineCacheData(screen->dev, pg->pipeline_cache, &size, NULL) != VK_SUCCESS) + if (VKSCR(GetPipelineCacheData)(screen->dev, pg->pipeline_cache, &size, NULL) != VK_SUCCESS) return; if (pg->pipeline_cache_size == size) return; void *pipeline_data = malloc(size); if (!pipeline_data) return; - if (vkGetPipelineCacheData(screen->dev, pg->pipeline_cache, &size, pipeline_data) == VK_SUCCESS) { + if (VKSCR(GetPipelineCacheData)(screen->dev, pg->pipeline_cache, &size, pipeline_data) == VK_SUCCESS) { pg->pipeline_cache_size = size; cache_key key; @@ -208,7 +204,7 @@ cache_get_job(void *data, void *gdata, int thread_index) disk_cache_compute_key(screen->disk_cache, pg->sha1, sizeof(pg->sha1), key); pcci.pInitialData = disk_cache_get(screen->disk_cache, key, &pg->pipeline_cache_size); pcci.initialDataSize = pg->pipeline_cache_size; - vkCreatePipelineCache(screen->dev, &pcci, NULL, &pg->pipeline_cache); + VKSCR(CreatePipelineCache)(screen->dev, &pcci, NULL, &pg->pipeline_cache); free((void*)pcci.pInitialData); } @@ -297,10 +293,17 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) uint32_t modes = BITFIELD_BIT(PIPE_PRIM_LINE_STRIP) | BITFIELD_BIT(PIPE_PRIM_TRIANGLE_STRIP) |
BITFIELD_BIT(PIPE_PRIM_LINE_STRIP_ADJACENCY) | - BITFIELD_BIT(PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) | - BITFIELD_BIT(PIPE_PRIM_PATCHES); + BITFIELD_BIT(PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY); if (screen->have_triangle_fans) modes |= BITFIELD_BIT(PIPE_PRIM_TRIANGLE_FAN); + if (screen->info.have_EXT_primitive_topology_list_restart) { + modes |= BITFIELD_BIT(PIPE_PRIM_POINTS) | + BITFIELD_BIT(PIPE_PRIM_LINES) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLES) | + BITFIELD_BIT(PIPE_PRIM_TRIANGLES_ADJACENCY); + if (screen->info.list_restart_feats.primitiveTopologyPatchListRestart) + modes |= BITFIELD_BIT(PIPE_PRIM_PATCHES); + } return modes; } case PIPE_CAP_SUPPORTED_PRIM_MODES: { @@ -314,6 +317,9 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return modes; } + case PIPE_CAP_FBFETCH: + return 1; + case PIPE_CAP_QUERY_MEMORY_INFO: case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_TGSI_TEXCOORD: @@ -331,7 +337,6 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CLIP_HALFZ: case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_TEXTURE_BARRIER: - case PIPE_CAP_DRAW_PARAMETERS: case PIPE_CAP_QUERY_SO_OVERFLOW: case PIPE_CAP_GL_SPIRV: case PIPE_CAP_CLEAR_SCISSORED: @@ -341,6 +346,12 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_PACK_HALF_FLOAT: return 1; + case PIPE_CAP_SURFACE_SAMPLE_COUNT: + return screen->vk_version >= VK_MAKE_VERSION(1,2,0); + + case PIPE_CAP_DRAW_PARAMETERS: + return screen->info.feats11.shaderDrawParameters || screen->info.have_KHR_shader_draw_parameters; + case PIPE_CAP_TGSI_VOTE: return screen->spirv_version >= SPIRV_VERSION(1, 3); @@ -389,7 +400,7 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return screen->info.props.limits.maxColorAttachments; case PIPE_CAP_OCCLUSION_QUERY: - return 1; + return screen->info.feats.features.occlusionQueryPrecise; case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS: return screen->info.have_EXT_sample_locations && screen->info.have_EXT_extended_dynamic_state; @@ -476,7 +487,7 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 1; case PIPE_CAP_CONDITIONAL_RENDER: - return screen->info.have_EXT_conditional_rendering; + return 1; case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: case PIPE_CAP_GLSL_FEATURE_LEVEL: @@ -502,6 +513,11 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PRIMITIVE_RESTART: return 1; + case PIPE_CAP_BINDLESS_TEXTURE: + return screen->info.have_EXT_descriptor_indexing && + /* push, 4 types, bindless */ + screen->info.props.limits.maxBoundDescriptorSets >= 6; + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: return screen->info.props.limits.minTexelBufferOffsetAlignment; @@ -637,7 +653,7 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return MIN2(screen->info.props.limits.maxVertexOutputComponents / 4 / 2, 16); case PIPE_CAP_DMABUF: - return screen->info.have_KHR_external_memory_fd; + return screen->info.have_KHR_external_memory_fd && screen->info.have_EXT_external_memory_dma_buf && screen->info.have_EXT_queue_family_foreign; case PIPE_CAP_DEPTH_BOUNDS_TEST: return screen->info.feats.features.depthBounds; @@ -724,10 +740,7 @@ zink_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: - if (shader == PIPE_SHADER_VERTEX || - shader == PIPE_SHADER_FRAGMENT) - return INT_MAX; - return 0; + return INT_MAX; case PIPE_SHADER_CAP_MAX_INPUTS: { uint32_t max = 0; @@ -807,12 
+820,16 @@ zink_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_SUBROUTINES: case PIPE_SHADER_CAP_INT64_ATOMICS: - case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS: return 0; /* not implemented */ - case PIPE_SHADER_CAP_FP16: + case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: + return screen->info.feats11.uniformAndStorageBuffer16BitAccess || + (screen->info.have_KHR_16bit_storage && screen->info.storage_16bit_feats.uniformAndStorageBuffer16BitAccess); case PIPE_SHADER_CAP_FP16_DERIVATIVES: + return screen->info.feats11.storageInputOutput16 || + (screen->info.have_KHR_16bit_storage && screen->info.storage_16bit_feats.storageInputOutput16); + case PIPE_SHADER_CAP_FP16: return screen->info.feats12.shaderFloat16 || (screen->info.have_KHR_shader_float16_int8 && screen->info.shader_float16_int8_feats.shaderFloat16); @@ -980,9 +997,15 @@ zink_is_format_supported(struct pipe_screen *pscreen, VkFormatProperties props = screen->format_props[format]; if (target == PIPE_BUFFER) { - if (bind & PIPE_BIND_VERTEX_BUFFER && - !(props.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) - return false; + if (bind & PIPE_BIND_VERTEX_BUFFER) { + if (!(props.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) { + enum pipe_format new_format = zink_decompose_vertex_format(format); + if (!new_format) + return false; + if (!(screen->format_props[new_format].bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) + return false; + } + } if (bind & PIPE_BIND_SAMPLER_VIEW && !(props.bufferFeatures & VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT)) @@ -1042,29 +1065,17 @@ zink_destroy_screen(struct pipe_screen *pscreen) struct zink_screen *screen = zink_screen(pscreen); if (VK_NULL_HANDLE != screen->debugUtilsCallbackHandle) { - screen->vk.DestroyDebugUtilsMessengerEXT(screen->instance, screen->debugUtilsCallbackHandle, NULL); + VKSCR(DestroyDebugUtilsMessengerEXT)(screen->instance, screen->debugUtilsCallbackHandle, NULL); } - hash_table_foreach(&screen->surface_cache, entry) { - struct pipe_surface *psurf = (struct pipe_surface*)entry->data; - /* context is already destroyed, so this has to be destroyed directly */ - zink_destroy_surface(screen, psurf); + if (!screen->info.have_KHR_imageless_framebuffer) { + hash_table_foreach(&screen->framebuffer_cache, entry) { + struct zink_framebuffer* fb = (struct zink_framebuffer*)entry->data; + zink_destroy_framebuffer(screen, fb); + } + simple_mtx_destroy(&screen->framebuffer_mtx); } - hash_table_foreach(&screen->bufferview_cache, entry) { - struct zink_buffer_view *bv = (struct zink_buffer_view*)entry->data; - zink_buffer_view_reference(screen, &bv, NULL); - } - - hash_table_foreach(&screen->framebuffer_cache, entry) { - struct zink_framebuffer* fb = (struct zink_framebuffer*)entry->data; - zink_destroy_framebuffer(screen, fb); - } - - simple_mtx_destroy(&screen->surface_mtx); - simple_mtx_destroy(&screen->bufferview_mtx); - simple_mtx_destroy(&screen->framebuffer_mtx); - u_transfer_helper_destroy(pscreen->transfer_helper); #ifdef ENABLE_SHADER_CACHE if (screen->disk_cache) { @@ -1080,17 +1091,20 @@ zink_destroy_screen(struct pipe_screen *pscreen) util_live_shader_cache_deinit(&screen->shaders); if (screen->sem) - vkDestroySemaphore(screen->dev, screen->sem, NULL); + VKSCR(DestroySemaphore)(screen->dev, screen->sem, NULL); if (screen->prev_sem) - vkDestroySemaphore(screen->dev, screen->prev_sem, NULL); + VKSCR(DestroySemaphore)(screen->dev, screen->prev_sem, NULL); if (screen->threaded) 
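/* the flush queue is created only for threaded screens, hence the guard */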
util_queue_destroy(&screen->flush_queue); - vkDestroyDevice(screen->dev, NULL); + VKSCR(DestroyDevice)(screen->dev, NULL); vkDestroyInstance(screen->instance, NULL); util_idalloc_mt_fini(&screen->buffer_ids); + if (screen->drm_fd != -1) + close(screen->drm_fd); + slab_destroy_parent(&screen->transfer_pool); ralloc_free(screen); } @@ -1215,7 +1229,7 @@ bool zink_is_depth_format_supported(struct zink_screen *screen, VkFormat format) { VkFormatProperties props; - vkGetPhysicalDeviceFormatProperties(screen->pdev, format, &props); + VKSCR(GetPhysicalDeviceFormatProperties)(screen->pdev, format, &props); return (props.linearTilingFeatures | props.optimalTilingFeatures) & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT; } @@ -1238,6 +1252,15 @@ emulate_x8(enum pipe_format format) case PIPE_FORMAT_R8G8B8X8_UNORM: return PIPE_FORMAT_R8G8B8A8_UNORM; + case PIPE_FORMAT_R16G16B16X16_FLOAT: + return PIPE_FORMAT_R16G16B16A16_FLOAT; + case PIPE_FORMAT_R16G16B16X16_SINT: + return PIPE_FORMAT_R16G16B16A16_SINT; + case PIPE_FORMAT_R16G16B16X16_SNORM: + return PIPE_FORMAT_R16G16B16A16_SNORM; + case PIPE_FORMAT_R16G16B16X16_UNORM: + return PIPE_FORMAT_R16G16B16A16_UNORM; + default: return format; } @@ -1314,11 +1337,12 @@ static bool check_have_device_time(struct zink_screen *screen) { uint32_t num_domains = 0; - screen->vk.GetPhysicalDeviceCalibrateableTimeDomainsEXT(screen->pdev, &num_domains, NULL); + VkTimeDomainEXT domains[8]; //current max is 4 + VKSCR(GetPhysicalDeviceCalibrateableTimeDomainsEXT)(screen->pdev, &num_domains, NULL); assert(num_domains > 0); + assert(num_domains < ARRAY_SIZE(domains)); - VkTimeDomainEXT *domains = malloc(sizeof(VkTimeDomainEXT) * num_domains); - screen->vk.GetPhysicalDeviceCalibrateableTimeDomainsEXT(screen->pdev, &num_domains, domains); + VKSCR(GetPhysicalDeviceCalibrateableTimeDomainsEXT)(screen->pdev, &num_domains, domains); /* VK_TIME_DOMAIN_DEVICE_EXT is used for the ctx->get_timestamp hook and is the only one we really need */ for (unsigned i = 0; i < num_domains; i++) { @@ -1327,10 +1351,33 @@ check_have_device_time(struct zink_screen *screen) } } - free(domains); return false; } +static void +zink_error(const char *msg) +{ + fprintf(stderr, "zink ERR: '%s'\n", msg); +} + +static void +zink_warn(const char *msg) +{ + fprintf(stderr, "zink WRN: '%s'\n", msg); +} + +static void +zink_info(const char *msg) +{ + fprintf(stderr, "zink NFO: '%s'\n", msg); +} + +static void +zink_msg(const char *msg) +{ + fprintf(stderr, "zink MSG: '%s'\n", msg); +} + static VKAPI_ATTR VkBool32 VKAPI_CALL zink_debug_util_callback( VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, @@ -1338,19 +1385,17 @@ zink_debug_util_callback( const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData, void *pUserData) { - const char *severity = "MSG"; - // Pick message prefix and color to use. 
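// The helpers above replace the old severity-prefix string: each severity now // has its own tiny function, so a debugger breakpoint or a log filter can // target a single level instead of string-matching the message prefix.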
// Only MacOS and Linux have been tested for color support if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { - severity = "ERR"; + zink_error(pCallbackData->pMessage); } else if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { - severity = "WRN"; + zink_warn(pCallbackData->pMessage); } else if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) { - severity = "NFO"; - } + zink_info(pCallbackData->pMessage); + } else + zink_msg(pCallbackData->pMessage); - fprintf(stderr, "zink DEBUG: %s: '%s'\n", severity, pCallbackData->pMessage); return VK_FALSE; } @@ -1374,7 +1419,7 @@ create_debug(struct zink_screen *screen) VkDebugUtilsMessengerEXT vkDebugUtilsCallbackEXT = VK_NULL_HANDLE; - screen->vk.CreateDebugUtilsMessengerEXT( + VKSCR(CreateDebugUtilsMessengerEXT)( screen->instance, &vkDebugUtilsMessengerCreateInfoEXT, NULL, @@ -1446,7 +1491,7 @@ populate_format_props(struct zink_screen *screen) VkFormat format = zink_get_format(screen, i); if (!format) continue; - if (screen->vk.GetPhysicalDeviceFormatProperties2) { + if (VKSCR(GetPhysicalDeviceFormatProperties2)) { VkFormatProperties2 props = {0}; props.sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2; @@ -1459,7 +1504,7 @@ populate_format_props(struct zink_screen *screen) mod_props.pDrmFormatModifierProperties = mods; props.pNext = &mod_props; } - screen->vk.GetPhysicalDeviceFormatProperties2(screen->pdev, format, &props); + VKSCR(GetPhysicalDeviceFormatProperties2)(screen->pdev, format, &props); screen->format_props[i] = props.formatProperties; if (screen->info.have_EXT_image_drm_format_modifier && mod_props.drmFormatModifierCount) { screen->modifier_props[i].drmFormatModifierCount = mod_props.drmFormatModifierCount; @@ -1470,7 +1515,7 @@ populate_format_props(struct zink_screen *screen) } } } else - vkGetPhysicalDeviceFormatProperties(screen->pdev, format, &screen->format_props[i]); + VKSCR(GetPhysicalDeviceFormatProperties)(screen->pdev, format, &screen->format_props[i]); } } @@ -1485,12 +1530,12 @@ zink_screen_init_semaphore(struct zink_screen *screen) tci.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO; tci.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE; - if (vkCreateSemaphore(screen->dev, &sci, NULL, &sem) == VK_SUCCESS) { + if (VKSCR(CreateSemaphore)(screen->dev, &sci, NULL, &sem) == VK_SUCCESS) { /* semaphore signal values can never decrease, * so we need a new semaphore anytime we overflow */ if (screen->prev_sem) - vkDestroySemaphore(screen->dev, screen->prev_sem, NULL); + VKSCR(DestroySemaphore)(screen->dev, screen->prev_sem, NULL); screen->prev_sem = screen->sem; screen->sem = sem; return true; @@ -1516,7 +1561,7 @@ zink_screen_timeline_wait(struct zink_screen *screen, uint32_t batch_id, uint64_ bool success = false; if (screen->device_lost) return true; - VkResult ret = screen->vk.WaitSemaphores(screen->dev, &wi, timeout); + VkResult ret = VKSCR(WaitSemaphores)(screen->dev, &wi, timeout); success = zink_screen_handle_vkresult(screen, ret); if (success) @@ -1536,7 +1581,7 @@ noop_submit(void *data, void *gdata, int thread_index) struct noop_submit_info *n = data; VkSubmitInfo si = {0}; si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - if (vkQueueSubmit(n->screen->threaded ? n->screen->thread_queue : n->screen->queue, + if (n->VKSCR(QueueSubmit)(n->screen->threaded ? 
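/* readability note: VKSCR(fn) expands to screen->vk.fn, so n->VKSCR(QueueSubmit) resolves to n->screen->vk.QueueSubmit rather than a stray member access */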
n->screen->thread_queue : n->screen->queue, 1, &si, n->fence) != VK_SUCCESS) { debug_printf("ZINK: vkQueueSubmit() failed\n"); n->screen->device_lost = true; @@ -1567,7 +1612,7 @@ zink_screen_batch_id_wait(struct zink_screen *screen, uint32_t batch_id, uint64_ util_queue_fence_init(&fence); fci.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - if (vkCreateFence(screen->dev, &fci, NULL, &n.fence) != VK_SUCCESS) + if (VKSCR(CreateFence)(screen->dev, &fci, NULL, &n.fence) != VK_SUCCESS) return false; n.screen = screen; @@ -1584,10 +1629,10 @@ zink_screen_batch_id_wait(struct zink_screen *screen, uint32_t batch_id, uint64_ } if (remaining) - ret = vkWaitForFences(screen->dev, 1, &n.fence, VK_TRUE, remaining); + ret = VKSCR(WaitForFences)(screen->dev, 1, &n.fence, VK_TRUE, remaining); else - ret = vkGetFenceStatus(screen->dev, n.fence); - vkDestroyFence(screen->dev, n.fence, NULL); + ret = VKSCR(GetFenceStatus)(screen->dev, n.fence); + VKSCR(DestroyFence)(screen->dev, n.fence, NULL); bool success = zink_screen_handle_vkresult(screen, ret); if (success) @@ -1619,14 +1664,14 @@ zink_query_memory_info(struct pipe_screen *pscreen, struct pipe_memory_info *inf { struct zink_screen *screen = zink_screen(pscreen); memset(info, 0, sizeof(struct pipe_memory_info)); - if (screen->info.have_EXT_memory_budget && screen->vk.GetPhysicalDeviceMemoryProperties2) { + if (screen->info.have_EXT_memory_budget && VKSCR(GetPhysicalDeviceMemoryProperties2)) { VkPhysicalDeviceMemoryProperties2 mem = {0}; mem.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2; VkPhysicalDeviceMemoryBudgetPropertiesEXT budget = {0}; budget.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT; mem.pNext = &budget; - screen->vk.GetPhysicalDeviceMemoryProperties2(screen->pdev, &mem); + VKSCR(GetPhysicalDeviceMemoryProperties2)(screen->pdev, &mem); for (unsigned i = 0; i < mem.memoryProperties.memoryHeapCount; i++) { if (mem.memoryProperties.memoryHeaps[i].flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) { @@ -1657,6 +1702,35 @@ zink_query_memory_info(struct pipe_screen *pscreen, struct pipe_memory_info *inf } } +static void +zink_query_dmabuf_modifiers(struct pipe_screen *pscreen, enum pipe_format format, int max, uint64_t *modifiers, unsigned int *external_only, int *count) +{ + struct zink_screen *screen = zink_screen(pscreen); + *count = screen->modifier_props[format].drmFormatModifierCount; + for (int i = 0; i < MIN2(max, *count); i++) + modifiers[i] = screen->modifier_props[format].pDrmFormatModifierProperties[i].drmFormatModifier; +} + +static bool +zink_is_dmabuf_modifier_supported(struct pipe_screen *pscreen, uint64_t modifier, enum pipe_format format, bool *external_only) +{ + struct zink_screen *screen = zink_screen(pscreen); + for (unsigned i = 0; i < screen->modifier_props[format].drmFormatModifierCount; i++) + if (screen->modifier_props[format].pDrmFormatModifierProperties[i].drmFormatModifier == modifier) + return true; + return false; +} + +static unsigned +zink_get_dmabuf_modifier_planes(struct pipe_screen *pscreen, uint64_t modifier, enum pipe_format format) +{ + struct zink_screen *screen = zink_screen(pscreen); + for (unsigned i = 0; i < screen->modifier_props[format].drmFormatModifierCount; i++) + if (screen->modifier_props[format].pDrmFormatModifierProperties[i].drmFormatModifier == modifier) + return screen->modifier_props[format].pDrmFormatModifierProperties[i].drmFormatModifierPlaneCount; + return 0; +} + static VkDevice zink_create_logical_device(struct zink_screen *screen) { @@ -1705,19 
+1779,12 @@ check_base_requirements(struct zink_screen *screen) !screen->info.feats.features.fillModeNonSolid || !screen->info.feats.features.wideLines || !screen->info.feats.features.largePoints || - !screen->info.feats.features.alphaToOne || !screen->info.feats.features.shaderClipDistance || !(screen->info.feats12.scalarBlockLayout || screen->info.have_EXT_scalar_block_layout) || !screen->info.have_KHR_maintenance1 || !screen->info.have_EXT_custom_border_color || - !screen->info.have_EXT_line_rasterization || - !screen->info.line_rast_feats.rectangularLines || - !screen->info.line_rast_feats.bresenhamLines || - !screen->info.line_rast_feats.smoothLines || - !screen->info.line_rast_feats.stippledRectangularLines || - !screen->info.line_rast_feats.stippledBresenhamLines || - !screen->info.line_rast_feats.stippledSmoothLines) { + !screen->info.have_EXT_line_rasterization) { fprintf(stderr, "WARNING: Some incorrect rendering " "might occur because the selected Vulkan device (%s) doesn't support " "base Zink requirements: ", screen->info.props.deviceName); @@ -1728,21 +1795,12 @@ check_base_requirements(struct zink_screen *screen) CHECK_OR_PRINT(feats.features.fillModeNonSolid); CHECK_OR_PRINT(feats.features.wideLines); CHECK_OR_PRINT(feats.features.largePoints); - CHECK_OR_PRINT(feats.features.alphaToOne); CHECK_OR_PRINT(feats.features.shaderClipDistance); if (!screen->info.feats12.scalarBlockLayout && !screen->info.have_EXT_scalar_block_layout) printf("scalarBlockLayout OR EXT_scalar_block_layout "); CHECK_OR_PRINT(have_KHR_maintenance1); CHECK_OR_PRINT(have_EXT_custom_border_color); CHECK_OR_PRINT(have_EXT_line_rasterization); - if (screen->info.have_EXT_line_rasterization) { - CHECK_OR_PRINT(line_rast_feats.rectangularLines); - CHECK_OR_PRINT(line_rast_feats.bresenhamLines); - CHECK_OR_PRINT(line_rast_feats.smoothLines); - CHECK_OR_PRINT(line_rast_feats.stippledRectangularLines); - CHECK_OR_PRINT(line_rast_feats.stippledBresenhamLines); - CHECK_OR_PRINT(line_rast_feats.stippledSmoothLines); - } fprintf(stderr, "\n"); } } @@ -1772,7 +1830,7 @@ zink_internal_create_screen(const struct pipe_screen_config *config) zink_debug = debug_get_option_zink_debug(); screen->descriptor_mode = debug_get_option_zink_descriptor_mode(); - if (util_bitcount(screen->descriptor_mode) > 1) { + if (screen->descriptor_mode > ZINK_DESCRIPTOR_MODE_NOTEMPLATES) { printf("Specify exactly one descriptor mode.\n"); abort(); } @@ -1854,6 +1912,9 @@ zink_internal_create_screen(const struct pipe_screen_config *config) screen->base.get_compiler_options = zink_get_compiler_options; screen->base.get_sample_pixel_grid = zink_get_sample_pixel_grid; screen->base.is_format_supported = zink_is_format_supported; + screen->base.query_dmabuf_modifiers = zink_query_dmabuf_modifiers; + screen->base.is_dmabuf_modifier_supported = zink_is_dmabuf_modifier_supported; + screen->base.get_dmabuf_modifier_planes = zink_get_dmabuf_modifier_planes; screen->base.context_create = zink_context_create; screen->base.flush_frontbuffer = zink_flush_frontbuffer; screen->base.destroy = zink_destroy_screen; @@ -1865,7 +1926,7 @@ zink_internal_create_screen(const struct pipe_screen_config *config) prop.pNext = NULL; for (unsigned i = 0; i < ARRAY_SIZE(screen->maxSampleLocationGridSize); i++) { if (screen->info.sample_locations_props.sampleLocationSampleCounts & (1 << i)) { - screen->vk.GetPhysicalDeviceMultisamplePropertiesEXT(screen->pdev, 1 << i, &prop); + VKSCR(GetPhysicalDeviceMultisamplePropertiesEXT)(screen->pdev, 1 << i, &prop); 
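/* sampleLocationSampleCounts is a VkSampleCountFlags bitmask, so bit i corresponds to a sample count of 1 << i; each per-count query returns the widest custom sample-location grid the device supports at that count */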
screen->maxSampleLocationGridSize[i] = prop.maxSampleLocationGridSize; } } @@ -1917,18 +1978,26 @@ zink_internal_create_screen(const struct pipe_screen_config *config) /* not found: use compatible heap */ if (screen->heap_map[i] == UINT8_MAX) { /* only cached mem has a failure case for now */ - assert(i == ZINK_HEAP_HOST_VISIBLE_CACHED); - screen->heap_map[i] = screen->heap_map[ZINK_HEAP_HOST_VISIBLE_ANY]; + assert(i == ZINK_HEAP_HOST_VISIBLE_CACHED || i == ZINK_HEAP_DEVICE_LOCAL_LAZY); + if (i == ZINK_HEAP_HOST_VISIBLE_CACHED) + screen->heap_map[i] = screen->heap_map[ZINK_HEAP_HOST_VISIBLE_COHERENT]; + else + screen->heap_map[i] = screen->heap_map[ZINK_HEAP_DEVICE_LOCAL]; } } + { + unsigned vis_vram = screen->heap_map[ZINK_HEAP_DEVICE_LOCAL_VISIBLE]; + unsigned vram = screen->heap_map[ZINK_HEAP_DEVICE_LOCAL]; + /* determine if vis vram is roughly equal to total vram */ + if (screen->info.mem_props.memoryHeaps[screen->info.mem_props.memoryTypes[vis_vram].heapIndex].size > + screen->info.mem_props.memoryHeaps[screen->info.mem_props.memoryTypes[vram].heapIndex].size * 0.9) + screen->resizable_bar = true; + } - simple_mtx_init(&screen->surface_mtx, mtx_plain); - simple_mtx_init(&screen->bufferview_mtx, mtx_plain); - simple_mtx_init(&screen->framebuffer_mtx, mtx_plain); - - _mesa_hash_table_init(&screen->framebuffer_cache, screen, hash_framebuffer_state, equals_framebuffer_state); - _mesa_hash_table_init(&screen->surface_cache, screen, NULL, equals_ivci); - _mesa_hash_table_init(&screen->bufferview_cache, screen, NULL, equals_bvci); + if (!screen->info.have_KHR_imageless_framebuffer) { + simple_mtx_init(&screen->framebuffer_mtx, mtx_plain); + _mesa_hash_table_init(&screen->framebuffer_cache, screen, hash_framebuffer_state, equals_framebuffer_state); + } zink_screen_init_descriptor_funcs(screen, false); util_idalloc_mt_init_tc(&screen->buffer_ids); @@ -1944,8 +2013,10 @@ struct pipe_screen * zink_create_screen(struct sw_winsys *winsys) { struct zink_screen *ret = zink_internal_create_screen(NULL); - if (ret) + if (ret) { ret->winsys = winsys; + ret->drm_fd = -1; + } return &ret->base; } @@ -1955,6 +2026,8 @@ zink_drm_create_screen(int fd, const struct pipe_screen_config *config) { struct zink_screen *ret = zink_internal_create_screen(config); + if (ret) + ret->drm_fd = os_dupfd_cloexec(fd); if (ret && !ret->info.have_KHR_external_memory_fd) { debug_printf("ZINK: KHR_external_memory_fd required!\n"); zink_destroy_screen(&ret->base); diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_screen.h b/mesa 3D driver/src/gallium/drivers/zink/zink_screen.h index 082ac2d23a..23f7fbafd5 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_screen.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_screen.h @@ -64,6 +64,7 @@ enum zink_descriptor_type; enum zink_descriptor_mode { ZINK_DESCRIPTOR_MODE_AUTO, ZINK_DESCRIPTOR_MODE_LAZY, + ZINK_DESCRIPTOR_MODE_NOFALLBACK, ZINK_DESCRIPTOR_MODE_NOTEMPLATES, }; @@ -81,15 +82,14 @@ struct zink_screen { VkSemaphore prev_sem; struct util_queue flush_queue; + unsigned buffer_rebind_counter; + bool device_lost; struct sw_winsys *winsys; + int drm_fd; struct hash_table framebuffer_cache; simple_mtx_t framebuffer_mtx; - struct hash_table surface_cache; - simple_mtx_t surface_mtx; - struct hash_table bufferview_cache; - simple_mtx_t bufferview_mtx; struct slab_parent_pool transfer_pool; struct disk_cache *disk_cache; @@ -107,6 +107,7 @@ struct zink_screen { uint32_t next_bo_unique_id; } pb; uint8_t heap_map[VK_MAX_MEMORY_TYPES]; + bool resizable_bar; uint64_t 
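/* resizable_bar (declared above) is set when the host-visible VRAM type's heap covers roughly all of the device-local heap (the 0.9 heap-size comparison in zink_internal_create_screen()), i.e. the PCI BAR has been resized to expose essentially all of VRAM for CPU mapping */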
total_video_mem; uint64_t clamp_video_mem; @@ -142,7 +143,7 @@ struct zink_screen { struct vk_dispatch_table vk; bool (*descriptor_program_init)(struct zink_context *ctx, struct zink_program *pg); - void (*descriptor_program_deinit)(struct zink_screen *screen, struct zink_program *pg); + void (*descriptor_program_deinit)(struct zink_context *ctx, struct zink_program *pg); void (*descriptors_update)(struct zink_context *ctx, bool is_compute); void (*context_update_descriptor_states)(struct zink_context *ctx, bool is_compute); void (*context_invalidate_descriptor_state)(struct zink_context *ctx, enum pipe_shader_type shader, @@ -238,6 +239,9 @@ struct mem_cache_entry { void *map; }; +#define VKCTX(fn) zink_screen(ctx->base.screen)->vk.fn +#define VKSCR(fn) screen->vk.fn + VkFormat zink_get_format(struct zink_screen *screen, enum pipe_format format); @@ -263,4 +267,16 @@ zink_screen_init_descriptor_funcs(struct zink_screen *screen, bool fallback); void zink_stub_function_not_loaded(void); + +#define warn_missing_feature(feat) \ + do { \ + static bool warned = false; \ + if (!warned) { \ + fprintf(stderr, "WARNING: Incorrect rendering will happen, " \ + "because the Vulkan device doesn't support " \ + "the %s feature\n", feat); \ + warned = true; \ + } \ + } while (0) + #endif diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_shader_keys.h b/mesa 3D driver/src/gallium/drivers/zink/zink_shader_keys.h index 867a0bff85..318728e87d 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_shader_keys.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_shader_keys.h @@ -26,12 +26,35 @@ #ifndef ZINK_SHADER_KEYS_H # define ZINK_SHADER_KEYS_H -struct zink_vs_key { +#include "compiler/shader_info.h" + +struct zink_vs_key_base { bool clip_halfz; bool push_drawid; bool last_vertex_stage; }; +struct zink_vs_key { + struct zink_vs_key_base base; + uint8_t pad; + union { + struct { + uint32_t decomposed_attrs; + uint32_t decomposed_attrs_without_w; + } u32; + struct { + uint16_t decomposed_attrs; + uint16_t decomposed_attrs_without_w; + } u16; + struct { + uint8_t decomposed_attrs; + uint8_t decomposed_attrs_without_w; + } u8; + }; + // not hashed + unsigned size; +}; + struct zink_fs_key { uint8_t coord_replace_bits; bool coord_replace_yinvert; @@ -39,11 +62,6 @@ struct zink_fs_key { bool force_dual_color_blend; }; -struct zink_tcs_key { - unsigned vertices_per_patch; - uint64_t vs_outputs_written; -}; - struct zink_shader_key_base { uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS]; }; @@ -57,24 +75,31 @@ struct zink_shader_key { union { /* reuse vs key for now with tes/gs since we only use clip_halfz */ struct zink_vs_key vs; + struct zink_vs_key_base vs_base; struct zink_fs_key fs; - struct zink_tcs_key tcs; } key; struct zink_shader_key_base base; unsigned inline_uniforms:1; uint32_t size; - bool is_default_variant; }; static inline const struct zink_fs_key * zink_fs_key(const struct zink_shader_key *key) { + assert(key); return &key->key.fs; } +static inline const struct zink_vs_key_base * +zink_vs_key_base(const struct zink_shader_key *key) +{ + return &key->key.vs_base; +} + static inline const struct zink_vs_key * zink_vs_key(const struct zink_shader_key *key) { + assert(key); return &key->key.vs; } diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_state.c b/mesa 3D driver/src/gallium/drivers/zink/zink_state.c index 4e66eb5319..68e8d413ff 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_state.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_state.c @@ -24,6 +24,8 
@@ #include "zink_state.h" #include "zink_context.h" +#include "zink_format.h" +#include "zink_program.h" #include "zink_screen.h" #include "compiler/shader_enums.h" @@ -42,12 +44,17 @@ zink_create_vertex_elements_state(struct pipe_context *pctx, struct zink_vertex_elements_state *ves = CALLOC_STRUCT(zink_vertex_elements_state); if (!ves) return NULL; + ves->hw_state.hash = _mesa_hash_pointer(ves); int buffer_map[PIPE_MAX_ATTRIBS]; for (int i = 0; i < ARRAY_SIZE(buffer_map); ++i) buffer_map[i] = -1; int num_bindings = 0; + unsigned num_decomposed = 0; + uint32_t size8 = 0; + uint32_t size16 = 0; + uint32_t size32 = 0; for (i = 0; i < num_elements; ++i) { const struct pipe_vertex_element *elem = elements + i; @@ -58,32 +65,82 @@ zink_create_vertex_elements_state(struct pipe_context *pctx, } binding = buffer_map[binding]; - ves->bindings[binding].binding = binding; ves->bindings[binding].inputRate = elem->instance_divisor ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX; assert(!elem->instance_divisor || zink_screen(pctx->screen)->info.have_EXT_vertex_attribute_divisor); - ves->divisor[binding] = elem->instance_divisor; - assert(elem->instance_divisor <= screen->info.vdiv_props.maxVertexAttribDivisor); + if (elem->instance_divisor > screen->info.vdiv_props.maxVertexAttribDivisor) + debug_printf("zink: clamping instance divisor %u to %u\n", elem->instance_divisor, screen->info.vdiv_props.maxVertexAttribDivisor); + ves->divisor[binding] = MIN2(elem->instance_divisor, screen->info.vdiv_props.maxVertexAttribDivisor); + + VkFormat format; + if (screen->format_props[elem->src_format].bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT) + format = zink_get_format(screen, elem->src_format); + else { + enum pipe_format new_format = zink_decompose_vertex_format(elem->src_format); + assert(new_format); + num_decomposed++; + assert(screen->format_props[new_format].bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT); + if (util_format_get_blocksize(new_format) == 4) + size32 |= BITFIELD_BIT(i); + else if (util_format_get_blocksize(new_format) == 2) + size16 |= BITFIELD_BIT(i); + else + size8 |= BITFIELD_BIT(i); + format = zink_get_format(screen, new_format); + unsigned size; + if (i < 8) + size = 1; + else if (i < 16) + size = 2; + else + size = 4; + if (util_format_get_nr_components(elem->src_format) == 4) { + ves->decomposed_attrs |= BITFIELD_BIT(i); + ves->decomposed_attrs_size = size; + } else { + ves->decomposed_attrs_without_w |= BITFIELD_BIT(i); + ves->decomposed_attrs_without_w_size = size; + } + } if (screen->info.have_EXT_vertex_input_dynamic_state) { ves->hw_state.dynattribs[i].sType = VK_STRUCTURE_TYPE_VERTEX_INPUT_ATTRIBUTE_DESCRIPTION_2_EXT; ves->hw_state.dynattribs[i].binding = binding; ves->hw_state.dynattribs[i].location = i; - ves->hw_state.dynattribs[i].format = zink_get_format(screen, - elem->src_format); + ves->hw_state.dynattribs[i].format = format; assert(ves->hw_state.dynattribs[i].format != VK_FORMAT_UNDEFINED); ves->hw_state.dynattribs[i].offset = elem->src_offset; } else { ves->hw_state.attribs[i].binding = binding; ves->hw_state.attribs[i].location = i; - ves->hw_state.attribs[i].format = zink_get_format(screen, - elem->src_format); + ves->hw_state.attribs[i].format = format; assert(ves->hw_state.attribs[i].format != VK_FORMAT_UNDEFINED); ves->hw_state.attribs[i].offset = elem->src_offset; } } - + assert(num_decomposed + num_elements <= PIPE_MAX_ATTRIBS); + u_foreach_bit(i, ves->decomposed_attrs | ves->decomposed_attrs_without_w) { + const struct 
pipe_vertex_element *elem = elements + i; + const struct util_format_description *desc = util_format_description(elem->src_format); + unsigned size = 1; + if (size32 & BITFIELD_BIT(i)) + size = 4; + else if (size16 & BITFIELD_BIT(i)) + size = 2; + for (unsigned j = 1; j < desc->nr_channels; j++) { + if (screen->info.have_EXT_vertex_input_dynamic_state) { + memcpy(&ves->hw_state.dynattribs[num_elements], &ves->hw_state.dynattribs[i], sizeof(VkVertexInputAttributeDescription2EXT)); + ves->hw_state.dynattribs[num_elements].location = num_elements; + ves->hw_state.dynattribs[num_elements].offset += j * size; + } else { + memcpy(&ves->hw_state.attribs[num_elements], &ves->hw_state.attribs[i], sizeof(VkVertexInputAttributeDescription)); + ves->hw_state.attribs[num_elements].location = num_elements; + ves->hw_state.attribs[num_elements].offset += j * size; + } + num_elements++; + } + } ves->hw_state.num_bindings = num_bindings; ves->hw_state.num_attribs = num_elements; if (screen->info.have_EXT_vertex_input_dynamic_state) { @@ -119,9 +176,48 @@ zink_bind_vertex_elements_state(struct pipe_context *pctx, ctx->element_state = cso; if (cso) { if (state->element_state != &ctx->element_state->hw_state) { - ctx->vertex_state_changed = true; + ctx->vertex_state_changed = !zink_screen(pctx->screen)->info.have_EXT_vertex_input_dynamic_state; ctx->vertex_buffers_dirty = ctx->element_state->hw_state.num_bindings > 0; } + const struct zink_vs_key *vs = zink_get_vs_key(ctx); + uint32_t decomposed_attrs = 0, decomposed_attrs_without_w = 0; + switch (vs->size) { + case 1: + decomposed_attrs = vs->u8.decomposed_attrs; + decomposed_attrs_without_w = vs->u8.decomposed_attrs_without_w; + break; + case 2: + decomposed_attrs = vs->u16.decomposed_attrs; + decomposed_attrs_without_w = vs->u16.decomposed_attrs_without_w; + break; + case 4: + decomposed_attrs = vs->u32.decomposed_attrs; + decomposed_attrs_without_w = vs->u32.decomposed_attrs_without_w; + break; + } + if (ctx->element_state->decomposed_attrs != decomposed_attrs || + ctx->element_state->decomposed_attrs_without_w != decomposed_attrs_without_w) { + unsigned size = MAX2(ctx->element_state->decomposed_attrs_size, ctx->element_state->decomposed_attrs_without_w_size); + struct zink_shader_key *key = (struct zink_shader_key *)zink_set_vs_key(ctx); + key->size -= 2 * key->key.vs.size; + switch (size) { + case 1: + key->key.vs.u8.decomposed_attrs = ctx->element_state->decomposed_attrs; + key->key.vs.u8.decomposed_attrs_without_w = ctx->element_state->decomposed_attrs_without_w; + break; + case 2: + key->key.vs.u16.decomposed_attrs = ctx->element_state->decomposed_attrs; + key->key.vs.u16.decomposed_attrs_without_w = ctx->element_state->decomposed_attrs_without_w; + break; + case 4: + key->key.vs.u32.decomposed_attrs = ctx->element_state->decomposed_attrs; + key->key.vs.u32.decomposed_attrs_without_w = ctx->element_state->decomposed_attrs_without_w; + break; + default: break; + } + key->key.vs.size = size; + key->size += 2 * size; + } state->element_state = &ctx->element_state->hw_state; } else { state->element_state = NULL; @@ -250,6 +346,7 @@ zink_create_blend_state(struct pipe_context *pctx, struct zink_blend_state *cso = CALLOC_STRUCT(zink_blend_state); if (!cso) return NULL; + cso->hash = _mesa_hash_pointer(cso); if (blend_state->logicop_enable) { cso->logicop_enable = VK_TRUE; @@ -310,12 +407,19 @@ zink_create_blend_state(struct pipe_context *pctx, static void zink_bind_blend_state(struct pipe_context *pctx, void *cso) { + struct zink_context *ctx =
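/* Context for the two switches above: the VS key stores its decomposed-
 * attribute masks in a union at the narrowest width that covers the highest
 * attribute index, and vs->size records which member is live. A sketch of
 * the width selection used when the attribute at index i is decomposed
 * (mirroring the size8/size16/size32 logic earlier in this function):
 *
 *    unsigned width = i < 8 ? 1 : i < 16 ? 2 : 4;   // bytes per mask
 *
 * Since only `width` bytes of the union are meaningful, key->size is
 * rebalanced by 2 * width (two masks) whenever the decomposition state
 * changes, keeping the hashed portion of the key as small as possible.
 */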
zink_context(pctx); struct zink_gfx_pipeline_state* state = &zink_context(pctx)->gfx_pipeline_state; + struct zink_blend_state *blend = cso; if (state->blend_state != cso) { state->blend_state = cso; + state->blend_id = blend ? blend->hash : 0; state->dirty = true; - zink_context(pctx)->blend_state_changed = true; + bool force_dual_color_blend = zink_screen(pctx->screen)->driconf.dual_color_blend_by_location && + blend && blend->dual_src_blend && state->blend_state->attachments[1].blendEnable; + if (force_dual_color_blend != zink_get_fs_key(ctx)->force_dual_color_blend) + zink_set_fs_key(ctx)->force_dual_color_blend = force_dual_color_blend; + ctx->blend_state_changed = true; } } @@ -417,8 +521,8 @@ zink_bind_depth_stencil_alpha_state(struct pipe_context *pctx, void *cso) if (cso) { struct zink_gfx_pipeline_state *state = &ctx->gfx_pipeline_state; - if (state->depth_stencil_alpha_state != &ctx->dsa_state->hw_state) { - state->depth_stencil_alpha_state = &ctx->dsa_state->hw_state; + if (state->dyn_state1.depth_stencil_alpha_state != &ctx->dsa_state->hw_state) { + state->dyn_state1.depth_stencil_alpha_state = &ctx->dsa_state->hw_state; state->dirty |= !zink_screen(pctx->screen)->info.have_EXT_extended_dynamic_state; ctx->dsa_state_changed = true; } @@ -454,6 +558,18 @@ line_width(float width, float granularity, const float range[2]) return CLAMP(width, range[0], range[1]); } +#define warn_line_feature(feat) \ + do { \ + static bool warned = false; \ + if (!warned) { \ + fprintf(stderr, "WARNING: Incorrect rendering will happen, " \ + "because the Vulkan device doesn't support " \ + "the %s feature of " \ + "VK_EXT_line_rasterization\n", feat); \ + warned = true; \ + } \ + } while (0) + static void * zink_create_rasterizer_state(struct pipe_context *pctx, const struct pipe_rasterizer_state *rs_state) @@ -472,13 +588,14 @@ zink_create_rasterizer_state(struct pipe_context *pctx, state->hw_state.depth_clamp = rs_state->depth_clip_near == 0; state->hw_state.rasterizer_discard = rs_state->rasterizer_discard; state->hw_state.force_persample_interp = rs_state->force_persample_interp; - state->hw_state.pv_mode = rs_state->flatshade_first ? VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT : VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT; + state->hw_state.pv_last = !rs_state->flatshade_first; + state->hw_state.clip_halfz = rs_state->clip_halfz; assert(rs_state->fill_front <= PIPE_POLYGON_MODE_POINT); if (rs_state->fill_back != rs_state->fill_front) debug_printf("BUG: vulkan doesn't support different front and back fill modes\n"); - state->hw_state.polygon_mode = (VkPolygonMode)rs_state->fill_front; // same values - state->hw_state.cull_mode = (VkCullModeFlags)rs_state->cull_face; // same bits + state->hw_state.polygon_mode = rs_state->fill_front; // same values + state->hw_state.cull_mode = rs_state->cull_face; // same bits state->front_face = rs_state->front_ccw ? 
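/* One subtlety in zink_bind_blend_state() above: with the
 * dual_color_blend_by_location driconf workaround enabled, binding a blend
 * state can invalidate the current fragment-shader variant. The pattern (a
 * condensed sketch of the code above, not new behavior) is a
 * read-compare-write on the shader key:
 *
 *    if (force_dual_color_blend != zink_get_fs_key(ctx)->force_dual_color_blend)
 *       zink_set_fs_key(ctx)->force_dual_color_blend = force_dual_color_blend;
 *
 * zink_get_fs_key() is the read-only accessor; zink_set_fs_key() is
 * presumably the mutating one that also flags the FS stage for a new
 * variant, which is why the write is guarded by the comparison.
 */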
VK_FRONT_FACE_COUNTER_CLOCKWISE : @@ -496,13 +613,19 @@ zink_create_rasterizer_state(struct pipe_context *pctx, if (line_feats->stippledSmoothLines) state->hw_state.line_mode = VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT; + else + warn_line_feature("stippledSmoothLines"); } else if (line_feats->stippledRectangularLines) state->hw_state.line_mode = VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT; + else + warn_line_feature("stippledRectangularLines"); } else if (line_feats->stippledBresenhamLines) state->hw_state.line_mode = VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT; else { + warn_line_feature("stippledBresenhamLines"); + /* no suitable mode that supports line stippling */ state->base.line_stipple_factor = 0; state->base.line_stipple_pattern = UINT16_MAX; @@ -515,12 +638,18 @@ zink_create_rasterizer_state(struct pipe_context *pctx, if (line_feats->smoothLines) state->hw_state.line_mode = VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT; + else + warn_line_feature("smoothLines"); } else if (line_feats->rectangularLines) state->hw_state.line_mode = VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT; + else + warn_line_feature("rectangularLines"); } else if (line_feats->bresenhamLines) state->hw_state.line_mode = VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT; + else + warn_line_feature("bresenhamLines"); } state->base.line_stipple_factor = 0; state->base.line_stipple_pattern = UINT16_MAX; @@ -545,35 +674,35 @@ zink_bind_rasterizer_state(struct pipe_context *pctx, void *cso) { struct zink_context *ctx = zink_context(pctx); struct zink_screen *screen = zink_screen(pctx->screen); - bool clip_halfz = ctx->rast_state ? ctx->rast_state->base.clip_halfz : false; bool point_quad_rasterization = ctx->rast_state ? ctx->rast_state->base.point_quad_rasterization : false; bool scissor = ctx->rast_state ? ctx->rast_state->base.scissor : false; + bool pv_last = ctx->rast_state ? 
ctx->rast_state->hw_state.pv_last : false; ctx->rast_state = cso; if (ctx->rast_state) { - if (ctx->gfx_pipeline_state.rast_state != &ctx->rast_state->hw_state) { - if (screen->info.have_EXT_provoking_vertex && - (!ctx->gfx_pipeline_state.rast_state || - ctx->gfx_pipeline_state.rast_state->pv_mode != ctx->rast_state->hw_state.pv_mode) && - /* without this prop, change in pv mode requires new rp */ - !screen->info.pv_props.provokingVertexModePerPipeline) - zink_batch_no_rp(ctx); - ctx->gfx_pipeline_state.rast_state = &ctx->rast_state->hw_state; - ctx->gfx_pipeline_state.dirty = true; - ctx->rast_state_changed = true; - } + if (screen->info.have_EXT_provoking_vertex && + pv_last != ctx->rast_state->hw_state.pv_last && + /* without this prop, change in pv mode requires new rp */ + !screen->info.pv_props.provokingVertexModePerPipeline) + zink_batch_no_rp(ctx); + uint32_t rast_bits = 0; + memcpy(&rast_bits, &ctx->rast_state->hw_state, sizeof(struct zink_rasterizer_hw_state)); + ctx->gfx_pipeline_state.rast_state = rast_bits & BITFIELD_MASK(ZINK_RAST_HW_STATE_SIZE); - if (clip_halfz != ctx->rast_state->base.clip_halfz) { - ctx->last_vertex_stage_dirty = true; + ctx->gfx_pipeline_state.dirty = true; + ctx->rast_state_changed = true; + + if (zink_get_last_vertex_key(ctx)->clip_halfz != ctx->rast_state->base.clip_halfz) { + zink_set_last_vertex_key(ctx)->clip_halfz = ctx->rast_state->base.clip_halfz; ctx->vp_state_changed = true; } - if (ctx->gfx_pipeline_state.front_face != ctx->rast_state->front_face) { - ctx->gfx_pipeline_state.front_face = ctx->rast_state->front_face; + if (ctx->gfx_pipeline_state.dyn_state1.front_face != ctx->rast_state->front_face) { + ctx->gfx_pipeline_state.dyn_state1.front_face = ctx->rast_state->front_face; ctx->gfx_pipeline_state.dirty |= !zink_screen(pctx->screen)->info.have_EXT_extended_dynamic_state; } if (ctx->rast_state->base.point_quad_rasterization != point_quad_rasterization) - ctx->dirty_shader_stages |= BITFIELD_BIT(PIPE_SHADER_FRAGMENT); + zink_set_fs_point_coord_key(ctx); if (ctx->rast_state->base.scissor != scissor) ctx->scissor_changed = true; } diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_state.h b/mesa 3D driver/src/gallium/drivers/zink/zink_state.h index 599514ad3f..1254498377 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_state.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_state.h @@ -29,6 +29,7 @@ #include "pipe/p_state.h" struct zink_vertex_elements_hw_state { + uint32_t hash; union { VkVertexInputAttributeDescription attribs[PIPE_MAX_ATTRIBS]; VkVertexInputAttributeDescription2EXT dynattribs[PIPE_MAX_ATTRIBS]; @@ -51,19 +52,26 @@ struct zink_vertex_elements_state { } bindings[PIPE_MAX_ATTRIBS]; uint32_t divisor[PIPE_MAX_ATTRIBS]; uint8_t binding_map[PIPE_MAX_ATTRIBS]; + uint32_t decomposed_attrs; + unsigned decomposed_attrs_size; + uint32_t decomposed_attrs_without_w; + unsigned decomposed_attrs_without_w_size; struct zink_vertex_elements_hw_state hw_state; }; struct zink_rasterizer_hw_state { - VkPolygonMode polygon_mode; - VkCullModeFlags cull_mode; - VkProvokingVertexModeEXT pv_mode; - VkLineRasterizationModeEXT line_mode; - unsigned depth_clamp : 1; - unsigned rasterizer_discard : 1; - unsigned force_persample_interp : 1; - bool line_stipple_enable; + unsigned polygon_mode : 2; //VkPolygonMode + unsigned cull_mode : 2; //VkCullModeFlags + unsigned line_mode : 2; //VkLineRasterizationModeEXT + bool depth_clamp:1; + bool rasterizer_discard:1; + bool pv_last:1; + bool line_stipple_enable:1; + bool 
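/* Because every field of zink_rasterizer_hw_state is a bitfield, the whole
 * struct can be treated as raw bits when building the pipeline state:
 * zink_bind_rasterizer_state() above memcpy()s it into a uint32_t and masks
 * to the ZINK_RAST_HW_STATE_SIZE bits actually defined (see the #define
 * just below). A sketch of that round-trip, assuming the bitfield layout
 * stays within 32 bits:
 *
 *    uint32_t rast_bits = 0;
 *    memcpy(&rast_bits, &hw_state, sizeof(struct zink_rasterizer_hw_state));
 *    rast_bits &= BITFIELD_MASK(ZINK_RAST_HW_STATE_SIZE);   // 12 used bits
 */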
force_persample_interp:1; + bool clip_halfz:1; }; +#define ZINK_RAST_HW_STATE_SIZE 12 + struct zink_rasterizer_state { struct pipe_rasterizer_state base; @@ -75,6 +83,7 @@ struct zink_rasterizer_state { }; struct zink_blend_state { + uint32_t hash; VkPipelineColorBlendAttachmentState attachments[PIPE_MAX_COLOR_BUFS]; VkBool32 logicop_enable; diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_surface.c b/mesa 3D driver/src/gallium/drivers/zink/zink_surface.c index b3fe21ad8c..e7af996352 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_surface.c +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_surface.c @@ -37,7 +37,9 @@ create_ivci(struct zink_screen *screen, const struct pipe_surface *templ, enum pipe_texture_target target) { - VkImageViewCreateInfo ivci = {0}; + VkImageViewCreateInfo ivci; + /* zero holes since this is hashed */ + memset(&ivci, 0, sizeof(VkImageViewCreateInfo)); ivci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; ivci.image = res->obj->image; @@ -78,7 +80,14 @@ create_ivci(struct zink_screen *screen, ivci.format = zink_get_format(screen, templ->format); assert(ivci.format != VK_FORMAT_UNDEFINED); - // TODO: format swizzles + /* TODO: it's currently illegal to use non-identity swizzles for framebuffer attachments, + * but if that ever changes, this will be useful + const struct util_format_description *desc = util_format_description(templ->format); + ivci.components.r = zink_component_mapping(zink_clamp_void_swizzle(desc, PIPE_SWIZZLE_X)); + ivci.components.g = zink_component_mapping(zink_clamp_void_swizzle(desc, PIPE_SWIZZLE_Y)); + ivci.components.b = zink_component_mapping(zink_clamp_void_swizzle(desc, PIPE_SWIZZLE_Z)); + ivci.components.a = zink_component_mapping(zink_clamp_void_swizzle(desc, PIPE_SWIZZLE_W)); + */ ivci.components.r = VK_COMPONENT_SWIZZLE_R; ivci.components.g = VK_COMPONENT_SWIZZLE_G; ivci.components.b = VK_COMPONENT_SWIZZLE_B; @@ -94,6 +103,18 @@ create_ivci(struct zink_screen *screen, return ivci; } +static void +init_surface_info(struct zink_surface *surface, struct zink_resource *res, VkImageViewCreateInfo *ivci) +{ + surface->info.flags = res->obj->vkflags; + surface->info.usage = res->obj->vkusage; + surface->info.width = surface->base.width; + surface->info.height = surface->base.height; + surface->info.layerCount = ivci->subresourceRange.layerCount; + surface->info.format = ivci->format; + surface->info_hash = _mesa_hash_data(&surface->info, sizeof(surface->info)); +} + static struct zink_surface * create_surface(struct pipe_context *pctx, struct pipe_resource *pres, @@ -101,6 +122,7 @@ create_surface(struct pipe_context *pctx, VkImageViewCreateInfo *ivci) { struct zink_screen *screen = zink_screen(pctx->screen); + struct zink_resource *res = zink_resource(pres); unsigned int level = templ->u.tex.level; struct zink_surface *surface = CALLOC_STRUCT(zink_surface); @@ -112,7 +134,9 @@ create_surface(struct pipe_context *pctx, surface->base.context = pctx; surface->base.format = templ->format; surface->base.width = u_minify(pres->width0, level); + assert(surface->base.width); surface->base.height = u_minify(pres->height0, level); + assert(surface->base.height); surface->base.nr_samples = templ->nr_samples; surface->base.u.tex.level = level; surface->base.u.tex.first_layer = templ->u.tex.first_layer; @@ -121,7 +145,9 @@ create_surface(struct pipe_context *pctx, util_dynarray_init(&surface->framebuffer_refs, NULL); util_dynarray_init(&surface->desc_set_refs.refs, NULL); - if (vkCreateImageView(screen->dev, ivci, NULL, + 
init_surface_info(surface, res, ivci); + + if (VKSCR(CreateImageView)(screen->dev, ivci, NULL, &surface->image_view) != VK_SUCCESS) { FREE(surface); return NULL; @@ -142,21 +168,22 @@ zink_get_surface(struct zink_context *ctx, const struct pipe_surface *templ, VkImageViewCreateInfo *ivci) { - struct zink_screen *screen = zink_screen(ctx->base.screen); struct zink_surface *surface = NULL; + struct zink_resource *res = zink_resource(pres); uint32_t hash = hash_ivci(ivci); - simple_mtx_lock(&screen->surface_mtx); - struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(&screen->surface_cache, hash, ivci); + simple_mtx_lock(&res->surface_mtx); + struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(&res->surface_cache, hash, ivci); if (!entry) { /* create a new surface */ surface = create_surface(&ctx->base, pres, templ, ivci); + surface->base.nr_samples = 0; surface->hash = hash; surface->ivci = *ivci; - entry = _mesa_hash_table_insert_pre_hashed(&screen->surface_cache, hash, &surface->ivci, surface); + entry = _mesa_hash_table_insert_pre_hashed(&res->surface_cache, hash, &surface->ivci, surface); if (!entry) { - simple_mtx_unlock(&screen->surface_mtx); + simple_mtx_unlock(&res->surface_mtx); return NULL; } @@ -165,11 +192,23 @@ zink_get_surface(struct zink_context *ctx, surface = entry->data; p_atomic_inc(&surface->base.reference.count); } - simple_mtx_unlock(&screen->surface_mtx); + simple_mtx_unlock(&res->surface_mtx); return &surface->base; } +static struct pipe_surface * +wrap_surface(struct pipe_context *pctx, struct pipe_surface *psurf) +{ + struct zink_ctx_surface *csurf = CALLOC_STRUCT(zink_ctx_surface); + csurf->base = *psurf; + pipe_reference_init(&csurf->base.reference, 1); + csurf->surf = (struct zink_surface*)psurf; + csurf->base.context = pctx; + + return &csurf->base; +} + static struct pipe_surface * zink_create_surface(struct pipe_context *pctx, struct pipe_resource *pres, @@ -181,7 +220,31 @@ zink_create_surface(struct pipe_context *pctx, if (pres->target == PIPE_TEXTURE_3D) ivci.viewType = VK_IMAGE_VIEW_TYPE_2D; - return zink_get_surface(zink_context(pctx), pres, templ, &ivci); + struct pipe_surface *psurf = zink_get_surface(zink_context(pctx), pres, templ, &ivci); + if (!psurf) + return NULL; + + struct zink_ctx_surface *csurf = (struct zink_ctx_surface*)wrap_surface(pctx, psurf); + + if (templ->nr_samples) { + /* transient fb attachment: not cached */ + struct pipe_resource rtempl = *pres; + rtempl.nr_samples = templ->nr_samples; + rtempl.bind |= ZINK_BIND_TRANSIENT; + struct zink_resource *transient = zink_resource(pctx->screen->resource_create(pctx->screen, &rtempl)); + if (!transient) + return NULL; + ivci.image = transient->obj->image; + csurf->transient = (struct zink_ctx_surface*)wrap_surface(pctx, (struct pipe_surface*)create_surface(pctx, &transient->base.b, templ, &ivci)); + if (!csurf->transient) { + pipe_resource_reference((struct pipe_resource**)&transient, NULL); + pipe_surface_release(pctx, &psurf); + return NULL; + } + pipe_resource_reference((struct pipe_resource**)&transient, NULL); + } + + return &csurf->base; } /* framebuffers are owned by their surfaces, so each time a surface that's part of a cached fb @@ -206,11 +269,6 @@ surface_clear_fb_refs(struct zink_screen *screen, struct pipe_surface *psurface) simple_mtx_unlock(&screen->framebuffer_mtx); break; } - /* null surface doesn't get a ref but it will double-free - * if the pointer isn't unset - */ - if (fb->null_surface == psurface) - fb->null_surface = NULL; } } 
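/* The surface code above switches from one screen-wide surface cache to a
 * per-resource cache (res->surface_cache guarded by res->surface_mtx),
 * which confines both locking and lookups to surfaces of a single resource.
 * The lookup-or-create shape, as a condensed sketch (create_and_insert() is
 * a hypothetical stand-in for the create_surface() +
 * _mesa_hash_table_insert_pre_hashed() path in zink_get_surface()):
 *
 *    uint32_t hash = hash_ivci(ivci);
 *    simple_mtx_lock(&res->surface_mtx);
 *    struct hash_entry *he =
 *       _mesa_hash_table_search_pre_hashed(&res->surface_cache, hash, ivci);
 *    struct zink_surface *surf = he ? he->data : create_and_insert(...);
 *    simple_mtx_unlock(&res->surface_mtx);
 */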
util_dynarray_fini(&surface->framebuffer_refs); @@ -220,19 +278,28 @@ void zink_destroy_surface(struct zink_screen *screen, struct pipe_surface *psurface) { struct zink_surface *surface = zink_surface(psurface); - simple_mtx_lock(&screen->surface_mtx); - struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&screen->surface_cache, surface->hash, &surface->ivci); - assert(he); - assert(he->data == surface); - _mesa_hash_table_remove(&screen->surface_cache, he); - simple_mtx_unlock(&screen->surface_mtx); - surface_clear_fb_refs(screen, psurface); + struct zink_resource *res = zink_resource(psurface->texture); + if (!psurface->nr_samples) { + simple_mtx_lock(&res->surface_mtx); + if (psurface->reference.count) { + /* got a cache hit during deletion */ + simple_mtx_unlock(&res->surface_mtx); + return; + } + struct hash_entry *he = _mesa_hash_table_search_pre_hashed(&res->surface_cache, surface->hash, &surface->ivci); + assert(he); + assert(he->data == surface); + _mesa_hash_table_remove(&res->surface_cache, he); + simple_mtx_unlock(&res->surface_mtx); + } + if (!screen->info.have_KHR_imageless_framebuffer) + surface_clear_fb_refs(screen, psurface); zink_descriptor_set_refs_clear(&surface->desc_set_refs, surface); util_dynarray_fini(&surface->framebuffer_refs); pipe_resource_reference(&psurface->texture, NULL); if (surface->simage_view) - vkDestroyImageView(screen->dev, surface->simage_view, NULL); - vkDestroyImageView(screen->dev, surface->image_view, NULL); + VKSCR(DestroyImageView)(screen->dev, surface->simage_view, NULL); + VKSCR(DestroyImageView)(screen->dev, surface->image_view, NULL); FREE(surface); } @@ -240,13 +307,17 @@ static void zink_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurface) { - zink_destroy_surface(zink_screen(pctx->screen), psurface); + struct zink_ctx_surface *csurf = (struct zink_ctx_surface *)psurface; + zink_surface_reference(zink_screen(pctx->screen), &csurf->surf, NULL); + pipe_surface_release(pctx, (struct pipe_surface**)&csurf->transient); + FREE(csurf); } bool zink_rebind_surface(struct zink_context *ctx, struct pipe_surface **psurface) { struct zink_surface *surface = zink_surface(*psurface); + struct zink_resource *res = zink_resource((*psurface)->texture); struct zink_screen *screen = zink_screen(ctx->base.screen); if (surface->simage_view) return false; @@ -254,8 +325,8 @@ zink_rebind_surface(struct zink_context *ctx, struct pipe_surface **psurface) zink_resource((*psurface)->texture), (*psurface), surface->base.texture->target); uint32_t hash = hash_ivci(&ivci); - simple_mtx_lock(&screen->surface_mtx); - struct hash_entry *new_entry = _mesa_hash_table_search_pre_hashed(&screen->surface_cache, hash, &ivci); + simple_mtx_lock(&res->surface_mtx); + struct hash_entry *new_entry = _mesa_hash_table_search_pre_hashed(&res->surface_cache, hash, &ivci); if (zink_batch_usage_exists(surface->batch_uses)) zink_batch_reference_surface(&ctx->batch, surface); surface_clear_fb_refs(screen, *psurface); @@ -263,29 +334,33 @@ zink_rebind_surface(struct zink_context *ctx, struct pipe_surface **psurface) if (new_entry) { /* reuse existing surface; old one will be cleaned up naturally */ struct zink_surface *new_surface = new_entry->data; - simple_mtx_unlock(&screen->surface_mtx); + simple_mtx_unlock(&res->surface_mtx); zink_batch_usage_set(&new_surface->batch_uses, ctx->batch.state); zink_surface_reference(screen, (struct zink_surface**)psurface, new_surface); return true; } - struct hash_entry *entry = 
_mesa_hash_table_search_pre_hashed(&screen->surface_cache, surface->hash, &surface->ivci); + struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(&res->surface_cache, surface->hash, &surface->ivci); assert(entry); - _mesa_hash_table_remove(&screen->surface_cache, entry); + _mesa_hash_table_remove(&res->surface_cache, entry); VkImageView image_view; - if (vkCreateImageView(screen->dev, &ivci, NULL, &image_view) != VK_SUCCESS) { + if (VKSCR(CreateImageView)(screen->dev, &ivci, NULL, &image_view) != VK_SUCCESS) { debug_printf("zink: failed to create new imageview"); - simple_mtx_unlock(&screen->surface_mtx); + simple_mtx_unlock(&res->surface_mtx); return false; } surface->hash = hash; surface->ivci = ivci; - entry = _mesa_hash_table_insert_pre_hashed(&screen->surface_cache, surface->hash, &surface->ivci, surface); + entry = _mesa_hash_table_insert_pre_hashed(&res->surface_cache, surface->hash, &surface->ivci, surface); assert(entry); surface->simage_view = surface->image_view; surface->image_view = image_view; surface->obj = zink_resource(surface->base.texture)->obj; + /* update for imageless fb */ + surface->info.flags = res->obj->vkflags; + surface->info.usage = res->obj->vkusage; + surface->info_hash = _mesa_hash_data(&surface->info, sizeof(surface->info)); zink_batch_usage_set(&surface->batch_uses, ctx->batch.state); - simple_mtx_unlock(&screen->surface_mtx); + simple_mtx_unlock(&res->surface_mtx); return true; } @@ -309,7 +384,7 @@ zink_surface_create_null(struct zink_context *ctx, enum pipe_texture_target targ return NULL; surf_templ.format = PIPE_FORMAT_R8_UINT; - surf_templ.nr_samples = samples; + surf_templ.nr_samples = 0; struct pipe_surface *psurf = ctx->base.create_surface(&ctx->base, pres, &surf_templ); pipe_resource_reference(&pres, NULL); return psurf; diff --git a/mesa 3D driver/src/gallium/drivers/zink/zink_surface.h b/mesa 3D driver/src/gallium/drivers/zink/zink_surface.h index 8b682c4de3..b6de77cd59 100644 --- a/mesa 3D driver/src/gallium/drivers/zink/zink_surface.h +++ b/mesa 3D driver/src/gallium/drivers/zink/zink_surface.h @@ -30,9 +30,20 @@ struct pipe_context; +struct zink_surface_info { + VkImageCreateFlags flags; + VkImageUsageFlags usage; + uint32_t width; + uint32_t height; + uint32_t layerCount; + VkFormat format; +}; + struct zink_surface { struct pipe_surface base; VkImageViewCreateInfo ivci; + struct zink_surface_info info; //TODO: union with fb refs + uint32_t info_hash; VkImageView image_view; VkImageView simage_view;//old iview after storage replacement/rebind void *obj; //backing resource object @@ -42,10 +53,36 @@ struct zink_surface { struct zink_descriptor_refs desc_set_refs; }; +/* wrapper object that preserves the gallium expectation of having + * pipe_surface::context match the context used to create the surface + */ +struct zink_ctx_surface { + struct pipe_surface base; + struct zink_surface *surf; + struct zink_ctx_surface *transient; //zink_ctx_surface + /* TODO: need replicate EXT */ + bool transient_init; +}; + +/* use this cast for framebuffer surfaces */ static inline struct zink_surface * -zink_surface(struct pipe_surface *pipe) +zink_csurface(struct pipe_surface *psurface) { - return (struct zink_surface *)pipe; + return psurface ? ((struct zink_ctx_surface *)psurface)->surf : NULL; +} + +/* use this cast for checking transient framebuffer surfaces */ +static inline struct zink_surface * +zink_transient_surface(struct pipe_surface *psurface) +{ + return psurface ? ((struct zink_ctx_surface *)psurface)->transient ? 
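/* The casts in this header encode an ownership split worth spelling out:
 * the pipe_surface handed back to gallium is a per-context zink_ctx_surface
 * wrapper (optionally carrying a transient multisampled twin), while the
 * cached, refcounted zink_surface stays with its resource. Choosing the
 * right view for a framebuffer attachment then looks like this sketch:
 *
 *    struct zink_surface *s = zink_csurface(psurf);          // wrapped view
 *    struct zink_surface *t = zink_transient_surface(psurf); // MSAA twin
 *    struct zink_surface *use = t ? t : s;  // prefer the transient one
 */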
((struct zink_ctx_surface *)psurface)->transient->surf : NULL : NULL; +} + +/* use this cast for internal surfaces */ +static inline struct zink_surface * +zink_surface(struct pipe_surface *psurface) +{ + return (struct zink_surface *)psurface; } void @@ -101,6 +138,13 @@ zink_surface_clamp_viewtype(VkImageViewType viewType, unsigned first_layer, unsi bool zink_rebind_surface(struct zink_context *ctx, struct pipe_surface **psurface); +static inline bool +zink_rebind_ctx_surface(struct zink_context *ctx, struct pipe_surface **psurface) +{ + struct zink_ctx_surface *csurf = (struct zink_ctx_surface*)*psurface; + return zink_rebind_surface(ctx, (struct pipe_surface**)&csurf->surf); +} + struct pipe_surface * zink_surface_create_null(struct zink_context *ctx, enum pipe_texture_target target, unsigned width, unsigned height, unsigned samples); #endif diff --git a/mesa 3D driver/src/gallium/frontends/clover/api/event.cpp b/mesa 3D driver/src/gallium/frontends/clover/api/event.cpp index e1fd9356aa..7c3b0812f2 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/api/event.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/api/event.cpp @@ -282,10 +282,10 @@ clGetEventProfilingInfo(cl_event d_ev, cl_profiling_info param, return CL_SUCCESS; -} catch (std::bad_cast &e) { +} catch (std::bad_cast &) { return CL_PROFILING_INFO_NOT_AVAILABLE; -} catch (lazy::undefined_error &e) { +} catch (lazy::undefined_error &) { return CL_PROFILING_INFO_NOT_AVAILABLE; } catch (error &e) { diff --git a/mesa 3D driver/src/gallium/frontends/clover/api/kernel.cpp b/mesa 3D driver/src/gallium/frontends/clover/api/kernel.cpp index c937642fcc..7849334206 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/api/kernel.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/api/kernel.cpp @@ -38,7 +38,7 @@ clCreateKernel(cl_program d_prog, const char *name, cl_int *r_errcode) try { ret_error(r_errcode, CL_SUCCESS); return new kernel(prog, name, range(sym.args)); -} catch (std::out_of_range &e) { +} catch (std::out_of_range &) { ret_error(r_errcode, CL_INVALID_KERNEL_NAME); return NULL; @@ -57,7 +57,7 @@ clCreateKernelsInProgram(cl_program d_prog, cl_uint count, throw error(CL_INVALID_VALUE); if (rd_kerns) - copy(map([&](const module::symbol &sym) { + copy(map([&](const binary::symbol &sym) { return desc(new kernel(prog, std::string(sym.name.begin(), sym.name.end()), @@ -100,7 +100,7 @@ clSetKernelArg(cl_kernel d_kern, cl_uint idx, size_t size, obj(d_kern).args().at(idx).set(size, value); return CL_SUCCESS; -} catch (std::out_of_range &e) { +} catch (std::out_of_range &) { return CL_INVALID_ARG_INDEX; } catch (error &e) { @@ -189,7 +189,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev, } catch (error &e) { return e.get(); -} catch (std::out_of_range &e) { +} catch (std::out_of_range &) { return CL_INVALID_DEVICE; } @@ -231,7 +231,7 @@ clGetKernelArgInfo(cl_kernel d_kern, return CL_SUCCESS; -} catch (std::out_of_range &e) { +} catch (std::out_of_range &) { return CL_INVALID_ARG_INDEX; } catch (error &e) { @@ -257,9 +257,9 @@ namespace { throw error(CL_INVALID_KERNEL_ARGS); // If the command queue's device is not associated to the program, we get - // a module, with no sections, which will also fail the following test. - auto &m = kern.program().build(q.device()).binary; - if (!any_of(type_equals(module::section::text_executable), m.secs)) + // a binary, with no sections, which will also fail the following test. 
+ auto &b = kern.program().build(q.device()).bin; + if (!any_of(type_equals(binary::section::text_executable), b.secs)) throw error(CL_INVALID_PROGRAM_EXECUTABLE); } @@ -381,7 +381,7 @@ clSetKernelArgSVMPointer(cl_kernel d_kern, obj(d_kern).args().at(arg_index).set_svm(arg_value); return CL_SUCCESS; -} catch (std::out_of_range &e) { +} catch (std::out_of_range &) { return CL_INVALID_ARG_INDEX; } catch (error &e) { diff --git a/mesa 3D driver/src/gallium/frontends/clover/api/memory.cpp b/mesa 3D driver/src/gallium/frontends/clover/api/memory.cpp index d3039d47ba..bd0c332bf9 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/api/memory.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/api/memory.cpp @@ -204,7 +204,7 @@ clCreateImageWithProperties(cl_context d_ctx, const cl_mem_flags flags = validate_flags(desc->buffer, d_flags, false); - if (!supported_formats(ctx, desc->image_type).count(*format)) + if (!supported_formats(ctx, desc->image_type, d_flags).count(*format)) throw error(CL_IMAGE_FORMAT_NOT_SUPPORTED); std::vector properties = fill_properties(d_properties); @@ -228,6 +228,41 @@ clCreateImageWithProperties(cl_context d_ctx, desc->image_width, row_pitch, host_ptr); + case CL_MEM_OBJECT_IMAGE1D_BUFFER: + if (!desc->image_width) + throw error(CL_INVALID_IMAGE_SIZE); + + if (all_of([=](const device &dev) { + const size_t max = dev.max_image_buffer_size(); + return (desc->image_width > max); + }, ctx.devices())) + throw error(CL_INVALID_IMAGE_SIZE); + + return new image1d_buffer(ctx, properties, flags, format, + desc->image_width, + row_pitch, host_ptr, desc->buffer); + + case CL_MEM_OBJECT_IMAGE1D_ARRAY: { + if (!desc->image_width) + throw error(CL_INVALID_IMAGE_SIZE); + + if (all_of([=](const device &dev) { + const size_t max = dev.max_image_size(); + const size_t amax = dev.max_image_array_number(); + return (desc->image_width > max || + desc->image_array_size > amax); + }, ctx.devices())) + throw error(CL_INVALID_IMAGE_SIZE); + + const size_t slice_pitch = desc->image_slice_pitch ? + desc->image_slice_pitch : row_pitch; + + return new image1d_array(ctx, properties, flags, format, + desc->image_width, + desc->image_array_size, slice_pitch, + host_ptr); + } + case CL_MEM_OBJECT_IMAGE2D: if (!desc->image_width || !desc->image_height) throw error(CL_INVALID_IMAGE_SIZE); @@ -243,6 +278,28 @@ clCreateImageWithProperties(cl_context d_ctx, desc->image_width, desc->image_height, row_pitch, host_ptr); + case CL_MEM_OBJECT_IMAGE2D_ARRAY: { + if (!desc->image_width || !desc->image_height || !desc->image_array_size) + throw error(CL_INVALID_IMAGE_SIZE); + + if (all_of([=](const device &dev) { + const size_t max = dev.max_image_size(); + const size_t amax = dev.max_image_array_number(); + return (desc->image_width > max || + desc->image_height > max || + desc->image_array_size > amax); + }, ctx.devices())) + throw error(CL_INVALID_IMAGE_SIZE); + + const size_t slice_pitch = desc->image_slice_pitch ? + desc->image_slice_pitch : row_pitch * desc->image_height; + + return new image2d_array(ctx, properties, flags, format, + desc->image_width, desc->image_height, + desc->image_array_size, row_pitch, + slice_pitch, host_ptr); + } + case CL_MEM_OBJECT_IMAGE3D: { if (!desc->image_width || !desc->image_height || !desc->image_depth) throw error(CL_INVALID_IMAGE_SIZE); @@ -264,12 +321,6 @@ clCreateImageWithProperties(cl_context d_ctx, slice_pitch, host_ptr); } - case CL_MEM_OBJECT_IMAGE1D_ARRAY: - case CL_MEM_OBJECT_IMAGE1D_BUFFER: - case CL_MEM_OBJECT_IMAGE2D_ARRAY: - // XXX - Not implemented. 
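/* The CL_INVALID_IMAGE_SIZE checks added above are intentionally phrased
 * with all_of() over ctx.devices(): an image size is only rejected when
 * *every* device in the context is unable to hold it, matching the CL spec
 * allowance that some devices may support larger images than others.
 * Condensed sketch of the predicate, assuming a 2D array image:
 *
 *    bool too_big = all_of([=](const device &dev) {
 *       return desc->image_width > dev.max_image_size() ||
 *              desc->image_array_size > dev.max_image_array_number();
 *    }, ctx.devices());
 *    if (too_big)
 *       throw error(CL_INVALID_IMAGE_SIZE);
 */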
- throw error(CL_IMAGE_FORMAT_NOT_SUPPORTED); - default: throw error(CL_INVALID_IMAGE_DESCRIPTOR); } @@ -317,7 +368,7 @@ clGetSupportedImageFormats(cl_context d_ctx, cl_mem_flags flags, cl_mem_object_type type, cl_uint count, cl_image_format *r_buf, cl_uint *r_count) try { auto &ctx = obj(d_ctx); - auto formats = supported_formats(ctx, type); + auto formats = supported_formats(ctx, type, flags); if (flags & CL_MEM_KERNEL_READ_AND_WRITE) { if (r_count) @@ -388,7 +439,18 @@ clGetMemObjectInfo(cl_mem d_mem, cl_mem_info param, case CL_MEM_ASSOCIATED_MEMOBJECT: { sub_buffer *sub = dynamic_cast(&mem); - buf.as_scalar() = (sub ? desc(sub->parent()) : NULL); + if (sub) { + buf.as_scalar() = desc(sub->parent()); + break; + } + + image *img = dynamic_cast(&mem); + if (img) { + buf.as_scalar() = desc(img->buffer()); + break; + } + + buf.as_scalar() = NULL; break; } case CL_MEM_OFFSET: { @@ -447,11 +509,19 @@ clGetImageInfo(cl_mem d_mem, cl_image_info param, break; case CL_IMAGE_HEIGHT: - buf.as_scalar() = img.height(); + buf.as_scalar() = img.dimensions() > 1 ? img.height() : 0; break; case CL_IMAGE_DEPTH: - buf.as_scalar() = img.depth(); + buf.as_scalar() = img.dimensions() > 2 ? img.depth() : 0; + break; + + case CL_IMAGE_ARRAY_SIZE: + buf.as_scalar() = img.array_size(); + break; + + case CL_IMAGE_BUFFER: + buf.as_scalar() = img.buffer(); break; case CL_IMAGE_NUM_MIP_LEVELS: @@ -551,7 +621,7 @@ clSVMAlloc(cl_context d_ctx, CLOVER_NOT_SUPPORTED_UNTIL("2.0"); return nullptr; -} catch (error &e) { +} catch (error &) { return nullptr; } @@ -572,5 +642,5 @@ clSVMFree(cl_context d_ctx, CLOVER_NOT_SUPPORTED_UNTIL("2.0"); -} catch (error &e) { +} catch (error &) { } diff --git a/mesa 3D driver/src/gallium/frontends/clover/api/platform.cpp b/mesa 3D driver/src/gallium/frontends/clover/api/platform.cpp index 9b38c3c487..b2077d3036 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/api/platform.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/api/platform.cpp @@ -118,7 +118,7 @@ clover::GetExtensionFunctionAddressForPlatform(cl_platform_id d_platform, obj(d_platform); return GetExtensionFunctionAddress(p_name); -} catch (error &e) { +} catch (error &) { return NULL; } diff --git a/mesa 3D driver/src/gallium/frontends/clover/api/program.cpp b/mesa 3D driver/src/gallium/frontends/clover/api/program.cpp index f5971248f4..52d8d42a84 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/api/program.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/api/program.cpp @@ -143,8 +143,8 @@ clCreateProgramWithBinary(cl_context d_ctx, cl_uint n, throw error(CL_INVALID_DEVICE); // Deserialize the provided binaries, - std::vector> result = map( - [](const unsigned char *p, size_t l) -> std::pair { + std::vector> result = map( + [](const unsigned char *p, size_t l) -> std::pair { if (!p || !l) return { CL_INVALID_VALUE, {} }; @@ -152,9 +152,9 @@ clCreateProgramWithBinary(cl_context d_ctx, cl_uint n, std::stringbuf bin( std::string{ (char*)p, l } ); std::istream s(&bin); - return { CL_SUCCESS, module::deserialize(s) }; + return { CL_SUCCESS, binary::deserialize(s) }; - } catch (std::istream::failure &e) { + } catch (std::istream::failure &) { return { CL_INVALID_BINARY, {} }; } }, @@ -333,10 +333,10 @@ clCompileProgram(cl_program d_prog, cl_uint num_devs, prog.compile(devs, opts, headers); return CL_SUCCESS; -} catch (invalid_build_options_error &e) { +} catch (invalid_build_options_error &) { return CL_INVALID_COMPILER_OPTIONS; -} catch (build_error &e) { +} catch (build_error &) { return 
CL_COMPILE_PROGRAM_FAILURE; } catch (error &e) { @@ -446,13 +446,13 @@ clLinkProgram(cl_context d_ctx, cl_uint num_devs, const cl_device_id *d_devs, prog().link(devs, opts, progs); ret_error(r_errcode, CL_SUCCESS); - } catch (build_error &e) { + } catch (build_error &) { ret_error(r_errcode, CL_LINK_PROGRAM_FAILURE); } return r_prog; -} catch (invalid_build_options_error &e) { +} catch (invalid_build_options_error &) { ret_error(r_errcode, CL_INVALID_LINKER_OPTIONS); return NULL; @@ -507,7 +507,7 @@ clGetProgramInfo(cl_program d_prog, cl_program_info param, case CL_PROGRAM_BINARY_SIZES: buf.as_vector() = map([&](const device &dev) { - return prog.build(dev).binary.size(); + return prog.build(dev).bin.size(); }, prog.devices()); break; @@ -516,7 +516,7 @@ clGetProgramInfo(cl_program d_prog, cl_program_info param, buf.as_matrix() = map([&](const device &dev) { std::stringbuf bin; std::ostream s(&bin); - prog.build(dev).binary.serialize(s); + prog.build(dev).bin.serialize(s); return bin.str(); }, prog.devices()); @@ -527,7 +527,7 @@ clGetProgramInfo(cl_program d_prog, cl_program_info param, break; case CL_PROGRAM_KERNEL_NAMES: - buf.as_string() = fold([](const std::string &a, const module::symbol &s) { + buf.as_string() = fold([](const std::string &a, const binary::symbol &s) { return ((a.empty() ? "" : a + ";") + s.name); }, std::string(), prog.symbols()); break; diff --git a/mesa 3D driver/src/gallium/frontends/clover/api/transfer.cpp b/mesa 3D driver/src/gallium/frontends/clover/api/transfer.cpp index 834c47864a..d279337b3d 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/api/transfer.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/api/transfer.cpp @@ -104,7 +104,9 @@ namespace { void validate_object(command_queue &q, image &img, const vector_t &orig, const vector_t ®ion) { - vector_t size = { img.width(), img.height(), img.depth() }; + size_t height = img.type() == CL_MEM_OBJECT_IMAGE1D_ARRAY ? img.array_size() : img.height(); + size_t depth = img.type() == CL_MEM_OBJECT_IMAGE2D_ARRAY ? 
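/* (continuing validate_object() below) Image arrays fold array_size into
 * the extent vector so one bounds check covers all layouts: a 1D array is
 * treated as (width, array_size, 1) and a 2D array as (width, height,
 * array_size). E.g. an 8-layer 64x64 2D array validates the region
 * {64, 64, 8} just like a true 3D image would.
 */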
img.array_size() : img.depth(); + vector_t size = { img.width(), height, depth }; const auto &dev = q.device(); if (!dev.image_support()) @@ -126,12 +128,26 @@ namespace { throw error(CL_INVALID_IMAGE_SIZE); break; } + case CL_MEM_OBJECT_IMAGE1D_ARRAY: { + const size_t max_size = dev.max_image_size(); + const size_t max_array = dev.max_image_array_number(); + if (img.width() > max_size || img.array_size() > max_array) + throw error(CL_INVALID_IMAGE_SIZE); + break; + } case CL_MEM_OBJECT_IMAGE2D: { const size_t max = dev.max_image_size(); if (img.width() > max || img.height() > max) throw error(CL_INVALID_IMAGE_SIZE); break; } + case CL_MEM_OBJECT_IMAGE2D_ARRAY: { + const size_t max_size = dev.max_image_size(); + const size_t max_array = dev.max_image_array_number(); + if (img.width() > max_size || img.height() > max_size || img.array_size() > max_array) + throw error(CL_INVALID_IMAGE_SIZE); + break; + } case CL_MEM_OBJECT_IMAGE3D: { const size_t max = dev.max_image_size_3d(); if (img.width() > max || img.height() > max || img.depth() > max) @@ -862,7 +878,7 @@ clEnqueueMapImage(cl_command_queue d_q, cl_mem d_mem, cl_bool blocking, if (!row_pitch) throw error(CL_INVALID_VALUE); - if (img.slice_pitch() && !slice_pitch) + if ((img.slice_pitch() || img.array_size()) && !slice_pitch) throw error(CL_INVALID_VALUE); auto *map = img.resource_in(q).add_map(q, flags, blocking, origin, region); diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/binary.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/binary.cpp new file mode 100644 index 0000000000..3bc3fe14bd --- /dev/null +++ b/mesa 3D driver/src/gallium/frontends/clover/core/binary.cpp @@ -0,0 +1,243 @@ +// +// Copyright 2012 Francisco Jerez +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#include <type_traits> +#include <vector> + +#include "core/binary.hpp" + +using namespace clover; + +namespace { + template<typename T, typename = void> + struct _serializer; + + /// Serialize the specified object. + template<typename T> + void + _proc(std::ostream &os, const T &x) { + _serializer<T>::proc(os, x); + } + + /// Deserialize the specified object. + template<typename T> + void + _proc(std::istream &is, T &x) { + _serializer<T>::proc(is, x); + } + + template<typename T> + T + _proc(std::istream &is) { + T x; + _serializer<T>::proc(is, x); + return x; + } + + /// Calculate the size of the specified object. + template<typename T> + void + _proc(binary::size_t &sz, const T &x) { + _serializer<T>::proc(sz, x); + } + + /// (De)serialize a scalar value.
+ template<typename T> + struct _serializer<T, typename std::enable_if< + std::is_scalar<T>::value>::type> { + static void + proc(std::ostream &os, const T &x) { + os.write(reinterpret_cast<const char *>(&x), sizeof(x)); + } + + static void + proc(std::istream &is, T &x) { + is.read(reinterpret_cast<char *>(&x), sizeof(x)); + } + + static void + proc(binary::size_t &sz, const T &x) { + sz += sizeof(x); + } + }; + + /// (De)serialize a vector. + template<typename T> + struct _serializer<std::vector<T>, + typename std::enable_if< + !std::is_scalar<T>::value>::type> { + static void + proc(std::ostream &os, const std::vector<T> &v) { + _proc<uint32_t>(os, v.size()); + + for (size_t i = 0; i < v.size(); i++) + _proc<T>(os, v[i]); + } + + static void + proc(std::istream &is, std::vector<T> &v) { + v.resize(_proc<uint32_t>(is)); + + for (size_t i = 0; i < v.size(); i++) + new(&v[i]) T(_proc<T>(is)); + } + + static void + proc(binary::size_t &sz, const std::vector<T> &v) { + sz += sizeof(uint32_t); + + for (size_t i = 0; i < v.size(); i++) + _proc<T>(sz, v[i]); + } + }; + + template<typename T> + struct _serializer<std::vector<T>, + typename std::enable_if< + std::is_scalar<T>::value>::type> { + static void + proc(std::ostream &os, const std::vector<T> &v) { + _proc<uint32_t>(os, v.size()); + os.write(reinterpret_cast<const char *>(&v[0]), + v.size() * sizeof(T)); + } + + static void + proc(std::istream &is, std::vector<T> &v) { + v.resize(_proc<uint32_t>(is)); + is.read(reinterpret_cast<char *>(&v[0]), + v.size() * sizeof(T)); + } + + static void + proc(binary::size_t &sz, const std::vector<T> &v) { + sz += sizeof(uint32_t) + sizeof(T) * v.size(); + } + }; + + /// (De)serialize a string. + template<> + struct _serializer<std::string> { + static void + proc(std::ostream &os, const std::string &s) { + _proc<uint32_t>(os, s.size()); + os.write(&s[0], s.size() * sizeof(std::string::value_type)); + } + + static void + proc(std::istream &is, std::string &s) { + s.resize(_proc<uint32_t>(is)); + is.read(&s[0], s.size() * sizeof(std::string::value_type)); + } + + static void + proc(binary::size_t &sz, const std::string &s) { + sz += sizeof(uint32_t) + sizeof(std::string::value_type) * s.size(); + } + }; + + /// (De)serialize a printf format + template<> + struct _serializer<binary::printf_info> { + template<typename S, typename QT> + static void + proc(S & s, QT &x) { + _proc(s, x.arg_sizes); + _proc(s, x.strings); + } + }; + + /// (De)serialize a binary::section. + template<> + struct _serializer<binary::section> { + template<typename S, typename QT> + static void + proc(S &s, QT &x) { + _proc(s, x.id); + _proc(s, x.type); + _proc(s, x.size); + _proc(s, x.data); + } + }; + + /// (De)serialize a binary::argument. + template<> + struct _serializer<binary::argument> { + template<typename S, typename QT> + static void + proc(S &s, QT &x) { + _proc(s, x.type); + _proc(s, x.size); + _proc(s, x.target_size); + _proc(s, x.target_align); + _proc(s, x.ext_type); + _proc(s, x.semantic); + } + }; + + /// (De)serialize a binary::symbol. + template<> + struct _serializer<binary::symbol> { + template<typename S, typename QT> + static void + proc(S &s, QT &x) { + _proc(s, x.name); + _proc(s, x.attributes); + _proc(s, x.reqd_work_group_size); + _proc(s, x.section); + _proc(s, x.offset); + _proc(s, x.args); + } + }; + + /// (De)serialize a binary.
+ template<> + struct _serializer<binary> { + template<typename S, typename QT> + static void + proc(S &s, QT &x) { + _proc(s, x.syms); + _proc(s, x.secs); + _proc(s, x.printf_infos); + _proc(s, x.printf_strings_in_buffer); + } + }; +}; + +namespace clover { + void + binary::serialize(std::ostream &os) const { + _proc(os, *this); + } + + binary + binary::deserialize(std::istream &is) { + return _proc<binary>(is); + } + + binary::size_t + binary::size() const { + size_t sz = 0; + _proc(sz, *this); + return sz; + } +} diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/binary.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/binary.hpp new file mode 100644 index 0000000000..951ddc48a4 --- /dev/null +++ b/mesa 3D driver/src/gallium/frontends/clover/core/binary.hpp @@ -0,0 +1,169 @@ +// +// Copyright 2012 Francisco Jerez +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE.
+// + +#ifndef CLOVER_CORE_BINARY_HPP +#define CLOVER_CORE_BINARY_HPP + +#include <vector> +#include <string> + +#include "CL/cl.h" + +namespace clover { + struct binary { + typedef uint32_t resource_id; + typedef uint32_t size_t; + + struct section { + enum type { + text_intermediate, + text_library, + text_executable, + data_constant, + data_global, + data_local, + data_private + }; + + section(resource_id id, enum type type, size_t size, + const std::vector<char> &data) : + id(id), type(type), size(size), data(data) { } + section() : id(0), type(text_intermediate), size(0), data() { } + + resource_id id; + type type; + size_t size; + std::vector<char> data; + }; + + struct printf_info { + std::vector<cl_uint> arg_sizes; + std::vector<char> strings; + }; + + struct arg_info { + arg_info(const std::string &arg_name, const std::string &type_name, + const cl_kernel_arg_type_qualifier type_qualifier, + const cl_kernel_arg_address_qualifier address_qualifier, + const cl_kernel_arg_access_qualifier access_qualifier) : + arg_name(arg_name), type_name(type_name), + type_qualifier(type_qualifier), + address_qualifier(address_qualifier), + access_qualifier(access_qualifier) { }; + arg_info() : arg_name(""), type_name(""), type_qualifier(0), + address_qualifier(0), access_qualifier(0) { }; + + std::string arg_name; + std::string type_name; + cl_kernel_arg_type_qualifier type_qualifier; + cl_kernel_arg_address_qualifier address_qualifier; + cl_kernel_arg_access_qualifier access_qualifier; + }; + + struct argument { + enum type { + scalar, + constant, + global, + local, + image_rd, + image_wr, + sampler + }; + + enum ext_type { + zero_ext, + sign_ext + }; + + enum semantic { + general, + grid_dimension, + grid_offset, + image_size, + image_format, + constant_buffer, + printf_buffer + }; + + argument(enum type type, size_t size, + size_t target_size, size_t target_align, + enum ext_type ext_type, + enum semantic semantic = general) : + type(type), size(size), + target_size(target_size), target_align(target_align), + ext_type(ext_type), semantic(semantic) { } + + argument(enum type type, size_t size) : + type(type), size(size), + target_size(size), target_align(1), + ext_type(zero_ext), semantic(general) { } + + argument() : type(scalar), size(0), + target_size(0), target_align(1), + ext_type(zero_ext), semantic(general) { } + + type type; + size_t size; + size_t target_size; + size_t target_align; // For arguments of type local, this represents + // the alignment requirement for the pointed + // type and for the pointer itself. + ext_type ext_type; + semantic semantic; + arg_info info; + }; + + struct symbol { + symbol(const std::string &name, const std::string &attributes, + const std::vector<::size_t> &reqd_work_group_size, + resource_id section, size_t offset, + const std::vector<argument> &args) : + name(name), attributes(attributes), + reqd_work_group_size(reqd_work_group_size), + section(section), + offset(offset), args(args) { } + symbol() : name(), attributes(), reqd_work_group_size({0, 0, 0}), + section(0), offset(0), args() { } + + std::string name; + std::string attributes; + std::vector<::size_t> reqd_work_group_size; + resource_id section; + size_t offset; + std::vector<argument> args; + }; + + binary() : printf_strings_in_buffer(0) { } + void serialize(std::ostream &os) const; + static binary deserialize(std::istream &is); + size_t size() const; + + std::vector<symbol> syms; + std::vector<section>
 secs; + std::vector<printf_info> printf_infos; + // printfs strings stored in output buffer + uint32_t printf_strings_in_buffer; + }; +} + +#endif diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/compiler.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/compiler.hpp index 7e2380eb25..98002edea6 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/compiler.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/compiler.hpp @@ -24,14 +24,14 @@ #define CLOVER_CORE_COMPILER_HPP #include "core/device.hpp" -#include "core/module.hpp" +#include "core/binary.hpp" #include "llvm/invocation.hpp" #include "nir/invocation.hpp" #include "spirv/invocation.hpp" namespace clover { namespace compiler { - static inline module + static inline binary compile_program(const program &prog, const header_map &headers, const device &dev, const std::string &opts, std::string &log) { @@ -59,21 +59,21 @@ namespace clover { } } - static inline module - link_program(const std::vector<module> &ms, const device &dev, + static inline binary + link_program(const std::vector<binary> &bs, const device &dev, const std::string &opts, std::string &log) { const bool create_library = opts.find("-create-library") != std::string::npos; switch (dev.ir_format()) { case PIPE_SHADER_IR_NIR_SERIALIZED: { - auto spirv_linked_module = spirv::link_program(ms, dev, opts, log); + auto spirv_linked_module = spirv::link_program(bs, dev, opts, log); if (create_library) return spirv_linked_module; return nir::spirv_to_nir(spirv_linked_module, dev, log); } case PIPE_SHADER_IR_NATIVE: - return llvm::link_program(ms, dev, opts, log); + return llvm::link_program(bs, dev, opts, log); default: unreachable("device with unsupported IR"); throw error(CL_INVALID_VALUE); diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/device.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/device.cpp index 7ef66d5049..2e3d77d2bf 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/device.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/device.cpp @@ -221,12 +221,14 @@ device::vendor_id() const { size_t device::max_images_read() const { - return PIPE_MAX_SHADER_SAMPLER_VIEWS; + return pipe->get_shader_param(pipe, PIPE_SHADER_COMPUTE, + PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS); } size_t device::max_images_write() const { - return PIPE_MAX_SHADER_IMAGES; + return pipe->get_shader_param(pipe, PIPE_SHADER_COMPUTE, + PIPE_SHADER_CAP_MAX_SHADER_IMAGES); } size_t diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/device.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/device.hpp index 311d0c97e9..4020ae9626 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/device.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/device.hpp @@ -28,7 +28,7 @@ #include "core/object.hpp" #include "core/format.hpp" -#include "core/module.hpp" +#include "core/binary.hpp" #include "util/lazy.hpp" #include "pipe-loader/pipe_loader.h" @@ -106,7 +106,7 @@ namespace clover { friend class root_resource; friend class hard_event; friend std::set<cl_image_format> - supported_formats(const context &, cl_mem_object_type); + supported_formats(const context &, cl_mem_object_type, cl_mem_flags flags); const void *get_compiler_options(enum pipe_shader_ir ir) const; clover::platform &platform; diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/format.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/format.cpp index 16a5767b93..6a0a2690cf 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/format.cpp +++ b/mesa 3D
driver/src/gallium/frontends/clover/core/format.cpp @@ -26,97 +26,73 @@ #include "pipe/p_context.h" namespace clover { + // see table 16 and 17 in the 3.0 CL spec under "5.3.1.1. Image Format Descriptor" + // TODO optional channel orders: + // * CL_Rx + // * CL_RGx + // * CL_RGBx + // * CL_sRGBx + #define _FF(c, b, g) \ + { { CL_R, c }, PIPE_FORMAT_R##b##_##g }, \ + { { CL_A, c }, PIPE_FORMAT_A##b##_##g }, \ + { { CL_RG, c }, PIPE_FORMAT_R##b##G##b##_##g }, \ + { { CL_RA, c }, PIPE_FORMAT_R##b##A##b##_##g }, \ + { { CL_RGB, c }, PIPE_FORMAT_R##b##G##b##B##b##_##g }, \ + { { CL_RGBA, c }, PIPE_FORMAT_R##b##G##b##B##b##A##b##_##g } + // broken but also optional + //{ { CL_LUMINANCE, c }, PIPE_FORMAT_L##b##_##g }, + //{ { CL_INTENSITY, c }, PIPE_FORMAT_I##b##_##g }, + + #define _FI(c, b, g) \ + _FF(c##b, b, g) + static const std::map formats { - { { CL_BGRA, CL_UNORM_INT8 }, PIPE_FORMAT_B8G8R8A8_UNORM }, - { { CL_ARGB, CL_UNORM_INT8 }, PIPE_FORMAT_A8R8G8B8_UNORM }, + //required in CL 2.0 but broken + //_FI(CL_SNORM_INT, 8, SNORM), + //_FI(CL_SNORM_INT, 16, SNORM), + _FI(CL_UNORM_INT, 8, UNORM), + _FI(CL_UNORM_INT, 16, UNORM), + _FI(CL_SIGNED_INT, 8, SINT), + _FI(CL_SIGNED_INT, 16, SINT), + _FI(CL_SIGNED_INT, 32, SINT), + _FI(CL_UNSIGNED_INT, 8, UINT), + _FI(CL_UNSIGNED_INT, 16, UINT), + _FI(CL_UNSIGNED_INT, 32, UINT), + _FF(CL_HALF_FLOAT, 16, FLOAT), + _FF(CL_FLOAT, 32, FLOAT), + + // TODO: next three can be CL_RGBx as well { { CL_RGB, CL_UNORM_SHORT_565 }, PIPE_FORMAT_B5G6R5_UNORM }, - { { CL_LUMINANCE, CL_UNORM_INT8 }, PIPE_FORMAT_L8_UNORM }, - { { CL_A, CL_UNORM_INT8 }, PIPE_FORMAT_A8_UNORM }, - { { CL_INTENSITY, CL_UNORM_INT8 }, PIPE_FORMAT_I8_UNORM }, - { { CL_LUMINANCE, CL_UNORM_INT16 }, PIPE_FORMAT_L16_UNORM }, - { { CL_R, CL_FLOAT }, PIPE_FORMAT_R32_FLOAT }, - { { CL_RG, CL_FLOAT }, PIPE_FORMAT_R32G32_FLOAT }, - { { CL_RGB, CL_FLOAT }, PIPE_FORMAT_R32G32B32_FLOAT }, - { { CL_RGBA, CL_FLOAT }, PIPE_FORMAT_R32G32B32A32_FLOAT }, - { { CL_R, CL_UNORM_INT16 }, PIPE_FORMAT_R16_UNORM }, - { { CL_RG, CL_UNORM_INT16 }, PIPE_FORMAT_R16G16_UNORM }, - { { CL_RGB, CL_UNORM_INT16 }, PIPE_FORMAT_R16G16B16_UNORM }, - { { CL_RGBA, CL_UNORM_INT16 }, PIPE_FORMAT_R16G16B16A16_UNORM }, - { { CL_R, CL_SNORM_INT16 }, PIPE_FORMAT_R16_SNORM }, - { { CL_RG, CL_SNORM_INT16 }, PIPE_FORMAT_R16G16_SNORM }, - { { CL_RGB, CL_SNORM_INT16 }, PIPE_FORMAT_R16G16B16_SNORM }, - { { CL_RGBA, CL_SNORM_INT16 }, PIPE_FORMAT_R16G16B16A16_SNORM }, - { { CL_R, CL_UNORM_INT8 }, PIPE_FORMAT_R8_UNORM }, - { { CL_RG, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8_UNORM }, - { { CL_RGB, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8_UNORM }, - { { CL_RGBA, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8A8_UNORM }, - { { CL_R, CL_SNORM_INT8 }, PIPE_FORMAT_R8_SNORM }, - { { CL_RG, CL_SNORM_INT8 }, PIPE_FORMAT_R8G8_SNORM }, - { { CL_RGB, CL_SNORM_INT8 }, PIPE_FORMAT_R8G8B8_SNORM }, - { { CL_RGBA, CL_SNORM_INT8 }, PIPE_FORMAT_R8G8B8A8_SNORM }, - { { CL_R, CL_HALF_FLOAT }, PIPE_FORMAT_R16_FLOAT }, - { { CL_RG, CL_HALF_FLOAT }, PIPE_FORMAT_R16G16_FLOAT }, - { { CL_RGB, CL_HALF_FLOAT }, PIPE_FORMAT_R16G16B16_FLOAT }, - { { CL_RGBA, CL_HALF_FLOAT }, PIPE_FORMAT_R16G16B16A16_FLOAT }, - { { CL_RGBx, CL_UNORM_SHORT_555 }, PIPE_FORMAT_B5G5R5X1_UNORM }, - { { CL_RGBx, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8X8_UNORM }, - { { CL_A, CL_UNORM_INT16 }, PIPE_FORMAT_A16_UNORM }, - { { CL_INTENSITY, CL_UNORM_INT16 }, PIPE_FORMAT_I16_UNORM }, - { { CL_LUMINANCE, CL_SNORM_INT8 }, PIPE_FORMAT_L8_SNORM }, - { { CL_INTENSITY, CL_SNORM_INT8 }, PIPE_FORMAT_I8_SNORM }, - { { CL_A, CL_SNORM_INT16 }, 
PIPE_FORMAT_A16_SNORM }, - { { CL_LUMINANCE, CL_SNORM_INT16 }, PIPE_FORMAT_L16_SNORM }, - { { CL_INTENSITY, CL_SNORM_INT16 }, PIPE_FORMAT_I16_SNORM }, - { { CL_A, CL_HALF_FLOAT }, PIPE_FORMAT_A16_FLOAT }, - { { CL_LUMINANCE, CL_HALF_FLOAT }, PIPE_FORMAT_L16_FLOAT }, - { { CL_INTENSITY, CL_HALF_FLOAT }, PIPE_FORMAT_I16_FLOAT }, - { { CL_A, CL_FLOAT }, PIPE_FORMAT_A32_FLOAT }, - { { CL_LUMINANCE, CL_FLOAT }, PIPE_FORMAT_L32_FLOAT }, - { { CL_INTENSITY, CL_FLOAT }, PIPE_FORMAT_I32_FLOAT }, - { { CL_RA, CL_UNORM_INT8 }, PIPE_FORMAT_R8A8_UNORM }, - { { CL_R, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8_UINT }, - { { CL_RG, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8G8_UINT }, - { { CL_RGB, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8G8B8_UINT }, - { { CL_RGBA, CL_UNSIGNED_INT8 }, PIPE_FORMAT_R8G8B8A8_UINT }, - { { CL_R, CL_SIGNED_INT8 }, PIPE_FORMAT_R8_SINT }, - { { CL_RG, CL_SIGNED_INT8 }, PIPE_FORMAT_R8G8_SINT }, - { { CL_RGB, CL_SIGNED_INT8 }, PIPE_FORMAT_R8G8B8_SINT }, - { { CL_RGBA, CL_SIGNED_INT8 }, PIPE_FORMAT_R8G8B8A8_SINT }, - { { CL_R, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16_UINT }, - { { CL_RG, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16G16_UINT }, - { { CL_RGB, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16G16B16_UINT }, - { { CL_RGBA, CL_UNSIGNED_INT16 }, PIPE_FORMAT_R16G16B16A16_UINT }, - { { CL_R, CL_SIGNED_INT16 }, PIPE_FORMAT_R16_SINT }, - { { CL_RG, CL_SIGNED_INT16 }, PIPE_FORMAT_R16G16_SINT }, - { { CL_RGB, CL_SIGNED_INT16 }, PIPE_FORMAT_R16G16B16_SINT }, - { { CL_RGBA, CL_SIGNED_INT16 }, PIPE_FORMAT_R16G16B16A16_SINT }, - { { CL_R, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32_UINT }, - { { CL_RG, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32G32_UINT }, - { { CL_RGB, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32G32B32_UINT }, - { { CL_RGBA, CL_UNSIGNED_INT32 }, PIPE_FORMAT_R32G32B32A32_UINT }, - { { CL_R, CL_SIGNED_INT32 }, PIPE_FORMAT_R32_SINT }, - { { CL_RG, CL_SIGNED_INT32 }, PIPE_FORMAT_R32G32_SINT }, - { { CL_RGB, CL_SIGNED_INT32 }, PIPE_FORMAT_R32G32B32_SINT }, - { { CL_RGBA, CL_SIGNED_INT32 }, PIPE_FORMAT_R32G32B32A32_SINT }, - { { CL_A, CL_UNSIGNED_INT8 }, PIPE_FORMAT_A8_UINT }, - { { CL_INTENSITY, CL_UNSIGNED_INT8 }, PIPE_FORMAT_I8_UINT }, - { { CL_LUMINANCE, CL_UNSIGNED_INT8 }, PIPE_FORMAT_L8_UINT }, - { { CL_A, CL_SIGNED_INT8 }, PIPE_FORMAT_A8_SINT }, - { { CL_INTENSITY, CL_SIGNED_INT8 }, PIPE_FORMAT_I8_SINT }, - { { CL_LUMINANCE, CL_SIGNED_INT8 }, PIPE_FORMAT_L8_SINT }, - { { CL_A, CL_UNSIGNED_INT16 }, PIPE_FORMAT_A16_UINT }, - { { CL_INTENSITY, CL_UNSIGNED_INT16 }, PIPE_FORMAT_I16_UINT }, - { { CL_LUMINANCE, CL_UNSIGNED_INT16 }, PIPE_FORMAT_L16_UINT }, - { { CL_A, CL_SIGNED_INT16 }, PIPE_FORMAT_A16_SINT }, - { { CL_INTENSITY, CL_SIGNED_INT16 }, PIPE_FORMAT_I16_SINT }, - { { CL_LUMINANCE, CL_SIGNED_INT16 }, PIPE_FORMAT_L16_SINT }, - { { CL_A, CL_UNSIGNED_INT32 }, PIPE_FORMAT_A32_UINT }, - { { CL_INTENSITY, CL_UNSIGNED_INT32 }, PIPE_FORMAT_I32_UINT }, - { { CL_LUMINANCE, CL_UNSIGNED_INT32 }, PIPE_FORMAT_L32_UINT }, - { { CL_A, CL_SIGNED_INT32 }, PIPE_FORMAT_A32_SINT }, - { { CL_INTENSITY, CL_SIGNED_INT32 }, PIPE_FORMAT_I32_SINT }, - { { CL_LUMINANCE, CL_SIGNED_INT32 }, PIPE_FORMAT_L32_SINT } + { { CL_RGB, CL_UNORM_SHORT_555 }, PIPE_FORMAT_B5G5R5A1_UNORM }, + { { CL_RGB, CL_UNORM_INT_101010 }, PIPE_FORMAT_B10G10R10X2_UNORM }, + + { { CL_RGBA, CL_UNORM_INT_101010_2 }, PIPE_FORMAT_B10G10R10A2_UNORM }, + + { { CL_ARGB, CL_UNORM_INT8 }, PIPE_FORMAT_A8R8G8B8_UNORM }, + { { CL_ARGB, CL_UNSIGNED_INT8 }, PIPE_FORMAT_A8R8G8B8_UINT }, + + { { CL_BGRA, CL_SNORM_INT8 }, PIPE_FORMAT_B8G8R8A8_SNORM }, + { { CL_BGRA, CL_UNORM_INT8 }, 
PIPE_FORMAT_B8G8R8A8_UNORM }, + { { CL_BGRA, CL_SIGNED_INT8 }, PIPE_FORMAT_B8G8R8A8_SINT }, + { { CL_BGRA, CL_UNSIGNED_INT8 }, PIPE_FORMAT_B8G8R8A8_UINT }, + + { { CL_ABGR, CL_SNORM_INT8 }, PIPE_FORMAT_A8B8G8R8_SNORM }, + { { CL_ABGR, CL_UNORM_INT8 }, PIPE_FORMAT_A8B8G8R8_UNORM }, + { { CL_ABGR, CL_SIGNED_INT8 }, PIPE_FORMAT_A8B8G8R8_SINT }, + { { CL_ABGR, CL_UNSIGNED_INT8 }, PIPE_FORMAT_A8B8G8R8_UINT }, + + // disable for now as it needs CL C 2.0 support + //{ { CL_DEPTH, CL_UNORM_INT16 }, PIPE_FORMAT_Z16_UNORM }, + //{ { CL_DEPTH, CL_FLOAT }, PIPE_FORMAT_Z32_FLOAT }, + + // required in CL 2.0 but broken + //{ { CL_sRGBA, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8A8_SRGB }, + // optional but broken + //{ { CL_sRGB, CL_UNORM_INT8 }, PIPE_FORMAT_R8G8B8_SRGB }, + //{ { CL_sBGRA, CL_UNORM_INT8 }, PIPE_FORMAT_B8G8R8A8_SRGB }, }; + #undef _FF + #undef _FI pipe_texture_target translate_target(cl_mem_object_type type) { @@ -150,11 +126,15 @@ namespace clover { } std::set - supported_formats(const context &ctx, cl_mem_object_type type) { + supported_formats(const context &ctx, cl_mem_object_type type, cl_mem_flags flags) { std::set s; pipe_texture_target target = translate_target(type); - unsigned bindings = (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_COMPUTE_RESOURCE); + unsigned bindings = 0; + + if (flags & (CL_MEM_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) + bindings |= PIPE_BIND_SAMPLER_VIEW; + if (flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_KERNEL_READ_AND_WRITE)) + bindings |= PIPE_BIND_SHADER_IMAGE; for (auto f : formats) { if (all_of([=](const device &dev) { diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/format.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/format.hpp index a8b7053c5d..a66a817e92 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/format.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/format.hpp @@ -38,7 +38,8 @@ namespace clover { /// the given memory object type. 
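/// For example (illustrative only), querying the formats usable for a
/// read-only 2D image checks the devices' sampler-view support:
///
///    std::set<cl_image_format> s =
///       supported_formats(ctx, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_ONLY);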
/// std::set supported_formats(const context &ctx, - cl_mem_object_type type); + cl_mem_object_type type, + cl_mem_flags flags); } static inline bool diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/kernel.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/kernel.cpp index 894b3bfffe..3fe698bec4 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/kernel.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/kernel.cpp @@ -29,21 +29,21 @@ using namespace clover; kernel::kernel(clover::program &prog, const std::string &name, - const std::vector &margs) : + const std::vector &bargs) : program(prog), _name(name), exec(*this), program_ref(prog._kernel_ref_counter) { - for (auto &marg : margs) { - if (marg.semantic == module::argument::general) - _args.emplace_back(argument::create(marg)); + for (auto &barg : bargs) { + if (barg.semantic == binary::argument::general) + _args.emplace_back(argument::create(barg)); } for (auto &dev : prog.devices()) { - auto &m = prog.build(dev).binary; - auto msym = find(name_equals(name), m.syms); - const auto f = id_type_equals(msym.section, module::section::data_constant); - if (!any_of(f, m.secs)) + auto &b = prog.build(dev).bin; + auto bsym = find(name_equals(name), b.syms); + const auto f = id_type_equals(bsym.section, binary::section::data_constant); + if (!any_of(f, b.secs)) continue; - auto mconst = find(f, m.secs); + auto mconst = find(f, b.secs); auto rb = std::make_unique(prog.context(), std::vector(), CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, mconst.size, mconst.data.data()); @@ -64,7 +64,7 @@ kernel::launch(command_queue &q, const std::vector &grid_offset, const std::vector &grid_size, const std::vector &block_size) { - const auto m = program().build(q.device()).binary; + const auto b = program().build(q.device()).bin; const auto reduced_grid_size = map(divides(), grid_size, block_size); @@ -86,7 +86,7 @@ kernel::launch(command_queue &q, exec.samplers.data()); q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0, - exec.sviews.size(), 0, exec.sviews.data()); + exec.sviews.size(), 0, false, exec.sviews.data()); q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0, exec.iviews.size(), 0, exec.iviews.data()); q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), @@ -98,7 +98,7 @@ kernel::launch(command_queue &q, info.work_dim = grid_size.size(); copy(pad_vector(q, block_size, 1), info.block); copy(pad_vector(q, reduced_grid_size, 1), info.grid); - info.pc = find(name_equals(_name), m.syms).offset; + info.pc = find(name_equals(_name), b.syms).offset; info.input = exec.input.data(); q.pipe->launch_grid(q.pipe, &info); @@ -108,7 +108,7 @@ kernel::launch(command_queue &q, q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0, 0, exec.iviews.size(), NULL); q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0, - 0, exec.sviews.size(), NULL); + 0, exec.sviews.size(), false, NULL); q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0, exec.samplers.size(), NULL); @@ -164,19 +164,19 @@ kernel::args() const { return map(derefs(), _args); } -std::vector +std::vector kernel::args_infos() { - std::vector infos; - for (auto &marg: find(name_equals(_name), program().symbols()).args) - if (marg.semantic == clover::module::argument::general) - infos.emplace_back(marg.info); + std::vector infos; + for (auto &barg: find(name_equals(_name), program().symbols()).args) + if (barg.semantic == clover::binary::argument::general) + infos.emplace_back(barg.info); return infos; } -const module & -kernel::module(const 
command_queue &q) const { - return program().build(q.device()).binary; +const binary & +kernel::binary(const command_queue &q) const { + return program().build(q.device()).bin; } kernel::exec_context::exec_context(kernel &kern) : @@ -194,79 +194,79 @@ kernel::exec_context::bind(intrusive_ptr _q, std::swap(q, _q); // Bind kernel arguments. - auto &m = kern.program().build(q->device()).binary; - auto msym = find(name_equals(kern.name()), m.syms); - auto margs = msym.args; - auto msec = find(id_type_equals(msym.section, module::section::text_executable), m.secs); + auto &b = kern.program().build(q->device()).bin; + auto bsym = find(name_equals(kern.name()), b.syms); + auto bargs = bsym.args; + auto msec = find(id_type_equals(bsym.section, binary::section::text_executable), b.secs); auto explicit_arg = kern._args.begin(); - for (auto &marg : margs) { - switch (marg.semantic) { - case module::argument::general: - (*(explicit_arg++))->bind(*this, marg); + for (auto &barg : bargs) { + switch (barg.semantic) { + case binary::argument::general: + (*(explicit_arg++))->bind(*this, barg); break; - case module::argument::grid_dimension: { + case binary::argument::grid_dimension: { const cl_uint dimension = grid_offset.size(); - auto arg = argument::create(marg); + auto arg = argument::create(barg); arg->set(sizeof(dimension), &dimension); - arg->bind(*this, marg); + arg->bind(*this, barg); break; } - case module::argument::grid_offset: { + case binary::argument::grid_offset: { for (cl_uint x : pad_vector(*q, grid_offset, 0)) { - auto arg = argument::create(marg); + auto arg = argument::create(barg); arg->set(sizeof(x), &x); - arg->bind(*this, marg); + arg->bind(*this, barg); } break; } - case module::argument::image_size: { + case binary::argument::image_size: { auto img = dynamic_cast(**(explicit_arg - 1)).get(); std::vector image_size{ static_cast(img->width()), static_cast(img->height()), static_cast(img->depth())}; for (auto x : image_size) { - auto arg = argument::create(marg); + auto arg = argument::create(barg); arg->set(sizeof(x), &x); - arg->bind(*this, marg); + arg->bind(*this, barg); } break; } - case module::argument::image_format: { + case binary::argument::image_format: { auto img = dynamic_cast(**(explicit_arg - 1)).get(); cl_image_format fmt = img->format(); std::vector image_format{ static_cast(fmt.image_channel_data_type), static_cast(fmt.image_channel_order)}; for (auto x : image_format) { - auto arg = argument::create(marg); + auto arg = argument::create(barg); arg->set(sizeof(x), &x); - arg->bind(*this, marg); + arg->bind(*this, barg); } break; } - case module::argument::constant_buffer: { - auto arg = argument::create(marg); + case binary::argument::constant_buffer: { + auto arg = argument::create(barg); cl_mem buf = kern._constant_buffers.at(&q->device()).get(); arg->set(sizeof(buf), &buf); - arg->bind(*this, marg); + arg->bind(*this, barg); break; } - case module::argument::printf_buffer: { - print_handler = printf_handler::create(q, m.printf_infos, - m.printf_strings_in_buffer, + case binary::argument::printf_buffer: { + print_handler = printf_handler::create(q, b.printf_infos, + b.printf_strings_in_buffer, q->device().max_printf_buffer_size()); cl_mem print_mem = print_handler->get_mem(); - auto arg = argument::create(marg); + auto arg = argument::create(barg); arg->set(sizeof(cl_mem), &print_mem); - arg->bind(*this, marg); + arg->bind(*this, barg); break; } } @@ -352,9 +352,9 @@ namespace { /// template void - extend(T &v, enum module::argument::ext_type ext, size_t n) { + 
extend(T &v, enum binary::argument::ext_type ext, size_t n) { const size_t m = std::min(v.size(), n); - const bool sign_ext = (ext == module::argument::sign_ext); + const bool sign_ext = (ext == binary::argument::sign_ext); const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0); T w(n, fill); @@ -388,27 +388,27 @@ namespace { } std::unique_ptr -kernel::argument::create(const module::argument &marg) { - switch (marg.type) { - case module::argument::scalar: - return std::unique_ptr(new scalar_argument(marg.size)); +kernel::argument::create(const binary::argument &barg) { + switch (barg.type) { + case binary::argument::scalar: + return std::unique_ptr(new scalar_argument(barg.size)); - case module::argument::global: + case binary::argument::global: return std::unique_ptr(new global_argument); - case module::argument::local: + case binary::argument::local: return std::unique_ptr(new local_argument); - case module::argument::constant: + case binary::argument::constant: return std::unique_ptr(new constant_argument); - case module::argument::image_rd: + case binary::argument::image_rd: return std::unique_ptr(new image_rd_argument); - case module::argument::image_wr: + case binary::argument::image_wr: return std::unique_ptr(new image_wr_argument); - case module::argument::sampler: + case binary::argument::sampler: return std::unique_ptr(new sampler_argument); } @@ -445,12 +445,12 @@ kernel::scalar_argument::set(size_t size, const void *value) { void kernel::scalar_argument::bind(exec_context &ctx, - const module::argument &marg) { + const binary::argument &barg) { auto w = v; - extend(w, marg.ext_type, marg.target_size); + extend(w, barg.ext_type, barg.target_size); byteswap(w, ctx.q->device().endianness()); - align(ctx.input, marg.target_align); + align(ctx.input, barg.target_align); insert(ctx.input, w); } @@ -480,8 +480,8 @@ kernel::global_argument::set_svm(const void *value) { void kernel::global_argument::bind(exec_context &ctx, - const module::argument &marg) { - align(ctx.input, marg.target_align); + const binary::argument &barg) { + align(ctx.input, barg.target_align); if (buf) { const resource &r = buf->resource_in(*ctx.q); @@ -492,17 +492,17 @@ kernel::global_argument::bind(exec_context &ctx, // We don't need to. Buffer offsets are always // one-dimensional. auto v = bytes(r.offset[0]); - extend(v, marg.ext_type, marg.target_size); + extend(v, barg.ext_type, barg.target_size); byteswap(v, ctx.q->device().endianness()); insert(ctx.input, v); } else if (svm) { auto v = bytes(svm); - extend(v, marg.ext_type, marg.target_size); + extend(v, barg.ext_type, barg.target_size); byteswap(v, ctx.q->device().endianness()); insert(ctx.input, v); } else { // Null pointer. 
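// (A null pointer still has to occupy barg.target_size bytes of input
// space so that the following arguments keep their offsets; allocate()
// just reserves that zero-filled slot without binding any resource.)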
- allocate(ctx.input, marg.target_size); + allocate(ctx.input, barg.target_size); } } @@ -529,11 +529,11 @@ kernel::local_argument::set(size_t size, const void *value) { void kernel::local_argument::bind(exec_context &ctx, - const module::argument &marg) { - ctx.mem_local = ::align(ctx.mem_local, marg.target_align); + const binary::argument &barg) { + ctx.mem_local = ::align(ctx.mem_local, barg.target_align); auto v = bytes(ctx.mem_local); - extend(v, module::argument::zero_ext, marg.target_size); + extend(v, binary::argument::zero_ext, barg.target_size); byteswap(v, ctx.q->device().endianness()); align(ctx.input, ctx.q->device().address_bits() / 8); insert(ctx.input, v); @@ -559,14 +559,14 @@ kernel::constant_argument::set(size_t size, const void *value) { void kernel::constant_argument::bind(exec_context &ctx, - const module::argument &marg) { - align(ctx.input, marg.target_align); + const binary::argument &barg) { + align(ctx.input, barg.target_align); if (buf) { resource &r = buf->resource_in(*ctx.q); auto v = bytes(ctx.resources.size() << 24 | r.offset[0]); - extend(v, module::argument::zero_ext, marg.target_size); + extend(v, binary::argument::zero_ext, barg.target_size); byteswap(v, ctx.q->device().endianness()); insert(ctx.input, v); @@ -574,7 +574,7 @@ kernel::constant_argument::bind(exec_context &ctx, ctx.resources.push_back(st); } else { // Null pointer. - allocate(ctx.input, marg.target_size); + allocate(ctx.input, barg.target_size); } } @@ -598,12 +598,12 @@ kernel::image_rd_argument::set(size_t size, const void *value) { void kernel::image_rd_argument::bind(exec_context &ctx, - const module::argument &marg) { + const binary::argument &barg) { auto v = bytes(ctx.sviews.size()); - extend(v, module::argument::zero_ext, marg.target_size); + extend(v, binary::argument::zero_ext, barg.target_size); byteswap(v, ctx.q->device().endianness()); - align(ctx.input, marg.target_align); + align(ctx.input, barg.target_align); insert(ctx.input, v); st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q); @@ -629,12 +629,12 @@ kernel::image_wr_argument::set(size_t size, const void *value) { void kernel::image_wr_argument::bind(exec_context &ctx, - const module::argument &marg) { + const binary::argument &barg) { auto v = bytes(ctx.iviews.size()); - extend(v, module::argument::zero_ext, marg.target_size); + extend(v, binary::argument::zero_ext, barg.target_size); byteswap(v, ctx.q->device().endianness()); - align(ctx.input, marg.target_align); + align(ctx.input, barg.target_align); insert(ctx.input, v); ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q)); } @@ -660,7 +660,7 @@ kernel::sampler_argument::set(size_t size, const void *value) { void kernel::sampler_argument::bind(exec_context &ctx, - const module::argument &marg) { + const binary::argument &barg) { st = s->bind(*ctx.q); ctx.samplers.push_back(st); } diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/kernel.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/kernel.hpp index a2264376cc..e9d3bc9276 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/kernel.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/kernel.hpp @@ -74,7 +74,7 @@ namespace clover { class argument { public: static std::unique_ptr - create(const module::argument &marg); + create(const binary::argument &barg); argument(const argument &arg) = delete; argument & @@ -97,7 +97,7 @@ namespace clover { /// Allocate the necessary resources to bind the specified /// object to this argument, and update \a ctx accordingly. 
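/// (Implementations typically extend()/byteswap() the encoded value
/// according to barg.ext_type and the device endianness, align
/// ctx.input to barg.target_align and append the bytes; see
/// scalar_argument::bind() in kernel.cpp.)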
virtual void bind(exec_context &ctx, - const module::argument &marg) = 0; + const binary::argument &barg) = 0; /// Free any resources that were allocated in bind(). virtual void unbind(exec_context &ctx) = 0; @@ -120,7 +120,7 @@ namespace clover { public: kernel(clover::program &prog, const std::string &name, - const std::vector &margs); + const std::vector &bargs); kernel(const kernel &kern) = delete; kernel & @@ -144,12 +144,12 @@ namespace clover { argument_range args(); const_argument_range args() const; - std::vector args_infos(); + std::vector args_infos(); const intrusive_ref program; private: - const clover::module &module(const command_queue &q) const; + const clover::binary &binary(const command_queue &q) const; class scalar_argument : public argument { public: @@ -157,7 +157,7 @@ namespace clover { virtual void set(size_t size, const void *value); virtual void bind(exec_context &ctx, - const module::argument &marg); + const binary::argument &barg); virtual void unbind(exec_context &ctx); private: @@ -172,7 +172,7 @@ namespace clover { virtual void set(size_t size, const void *value); virtual void set_svm(const void *value); virtual void bind(exec_context &ctx, - const module::argument &marg); + const binary::argument &barg); virtual void unbind(exec_context &ctx); private: @@ -186,7 +186,7 @@ namespace clover { virtual void set(size_t size, const void *value); virtual void bind(exec_context &ctx, - const module::argument &marg); + const binary::argument &barg); virtual void unbind(exec_context &ctx); private: @@ -199,7 +199,7 @@ namespace clover { virtual void set(size_t size, const void *value); virtual void bind(exec_context &ctx, - const module::argument &marg); + const binary::argument &barg); virtual void unbind(exec_context &ctx); private: @@ -220,7 +220,7 @@ namespace clover { public: virtual void set(size_t size, const void *value); virtual void bind(exec_context &ctx, - const module::argument &marg); + const binary::argument &barg); virtual void unbind(exec_context &ctx); private: @@ -231,7 +231,7 @@ namespace clover { public: virtual void set(size_t size, const void *value); virtual void bind(exec_context &ctx, - const module::argument &marg); + const binary::argument &barg); virtual void unbind(exec_context &ctx); }; @@ -241,7 +241,7 @@ namespace clover { virtual void set(size_t size, const void *value); virtual void bind(exec_context &ctx, - const module::argument &marg); + const binary::argument &barg); virtual void unbind(exec_context &ctx); private: diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/memory.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/memory.cpp index e722af22b5..6270107e94 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/memory.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/memory.cpp @@ -171,12 +171,13 @@ image::image(clover::context &ctx, std::vector properties, cl_mem_flags flags, const cl_image_format *format, - size_t width, size_t height, size_t depth, + size_t width, size_t height, size_t depth, size_t array_size, size_t row_pitch, size_t slice_pitch, size_t size, - void *host_ptr) : + void *host_ptr, cl_mem buffer) : memory_obj(ctx, properties, flags, size, host_ptr), _format(*format), _width(width), _height(height), _depth(depth), - _row_pitch(row_pitch), _slice_pitch(slice_pitch) { + _row_pitch(row_pitch), _slice_pitch(slice_pitch), _array_size(array_size), + _buffer(buffer) { } resource & @@ -249,19 +250,45 @@ image::slice_pitch() const { return _slice_pitch; } +size_t +image::array_size() 
const { + return _array_size; +} + +cl_mem +image::buffer() const { + return _buffer; +} + image1d::image1d(clover::context &ctx, std::vector properties, cl_mem_flags flags, const cl_image_format *format, size_t width, size_t row_pitch, void *host_ptr) : - image(ctx, properties, flags, format, width, 1, 1, - row_pitch, 0, row_pitch, host_ptr) { + basic_image(ctx, properties, flags, format, width, 1, 1, 0, + row_pitch, 0, row_pitch, host_ptr, nullptr) { } -cl_mem_object_type -image1d::type() const { - return CL_MEM_OBJECT_IMAGE1D; +image1d_buffer::image1d_buffer(clover::context &ctx, + std::vector properties, + cl_mem_flags flags, + const cl_image_format *format, + size_t width, size_t row_pitch, + void *host_ptr, cl_mem buffer) : + basic_image(ctx, properties, flags, format, width, 1, 1, 0, + row_pitch, 0, row_pitch, host_ptr, buffer) { +} + +image1d_array::image1d_array(clover::context &ctx, + std::vector properties, + cl_mem_flags flags, + const cl_image_format *format, + size_t width, + size_t array_size, size_t slice_pitch, + void *host_ptr) : + basic_image(ctx, properties, flags, format, width, 1, 1, array_size, + 0, slice_pitch, slice_pitch * array_size, host_ptr, nullptr) { } image2d::image2d(clover::context &ctx, @@ -270,13 +297,19 @@ image2d::image2d(clover::context &ctx, const cl_image_format *format, size_t width, size_t height, size_t row_pitch, void *host_ptr) : - image(ctx, properties, flags, format, width, height, 1, - row_pitch, 0, height * row_pitch, host_ptr) { + basic_image(ctx, properties, flags, format, width, height, 1, 0, + row_pitch, 0, height * row_pitch, host_ptr, nullptr) { } -cl_mem_object_type -image2d::type() const { - return CL_MEM_OBJECT_IMAGE2D; +image2d_array::image2d_array(clover::context &ctx, + std::vector properties, + cl_mem_flags flags, + const cl_image_format *format, + size_t width, size_t height, size_t array_size, + size_t row_pitch, size_t slice_pitch, + void *host_ptr) : + basic_image(ctx, properties, flags, format, width, height, 1, array_size, + row_pitch, slice_pitch, slice_pitch * array_size, host_ptr, nullptr) { } image3d::image3d(clover::context &ctx, @@ -286,12 +319,7 @@ image3d::image3d(clover::context &ctx, size_t width, size_t height, size_t depth, size_t row_pitch, size_t slice_pitch, void *host_ptr) : - image(ctx, properties, flags, format, width, height, depth, - row_pitch, slice_pitch, depth * slice_pitch, - host_ptr) { -} - -cl_mem_object_type -image3d::type() const { - return CL_MEM_OBJECT_IMAGE3D; + basic_image(ctx, properties, flags, format, width, height, depth, 0, + row_pitch, slice_pitch, depth * slice_pitch, + host_ptr, nullptr) { } diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/memory.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/memory.hpp index 19d6e2ad45..d6a170bcfb 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/memory.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/memory.hpp @@ -138,18 +138,21 @@ namespace clover { std::vector properties, cl_mem_flags flags, const cl_image_format *format, - size_t width, size_t height, size_t depth, + size_t width, size_t height, size_t depth, size_t array_size, size_t row_pitch, size_t slice_pitch, size_t size, - void *host_ptr); + void *host_ptr, cl_mem buffer); public: cl_image_format format() const; + virtual cl_uint dimensions() const = 0; size_t width() const; size_t height() const; size_t depth() const; size_t pixel_size() const; size_t row_pitch() const; size_t slice_pitch() const; + size_t array_size() const; + cl_mem 
buffer() const; virtual clover::resource & resource_in(command_queue &q); virtual clover::resource & @@ -167,12 +170,26 @@ namespace clover { size_t _depth; size_t _row_pitch; size_t _slice_pitch; + size_t _array_size; + cl_mem _buffer; std::map> resources; std::mutex resources_mtx; }; - class image1d : public image { + template + class basic_image : public image { + public: + using image::image; + virtual cl_mem_object_type type() const { + return Type; + } + virtual cl_uint dimensions() const { + return Dim; + } + }; + + class image1d : public basic_image { public: image1d(clover::context &ctx, std::vector properties, @@ -180,11 +197,30 @@ namespace clover { const cl_image_format *format, size_t width, size_t row_pitch, void *host_ptr); - - virtual cl_mem_object_type type() const; }; - class image2d : public image { + class image1d_buffer : public basic_image { + public: + image1d_buffer(clover::context &ctx, + std::vector properties, + cl_mem_flags flags, + const cl_image_format *format, + size_t width, size_t row_pitch, + void *host_ptr, cl_mem buffer); + }; + + class image1d_array : public basic_image { + public: + image1d_array(clover::context &ctx, + std::vector properties, + cl_mem_flags flags, + const cl_image_format *format, + size_t width, + size_t array_size, size_t slice_pitch, + void *host_ptr); + }; + + class image2d : public basic_image { public: image2d(clover::context &ctx, std::vector properties, @@ -192,11 +228,20 @@ namespace clover { const cl_image_format *format, size_t width, size_t height, size_t row_pitch, void *host_ptr); - - virtual cl_mem_object_type type() const; }; - class image3d : public image { + class image2d_array : public basic_image { + public: + image2d_array(clover::context &ctx, + std::vector properties, + cl_mem_flags flags, + const cl_image_format *format, + size_t width, size_t height, size_t array_size, + size_t row_pitch, size_t slice_pitch, + void *host_ptr); + }; + + class image3d : public basic_image{ public: image3d(clover::context &ctx, std::vector properties, @@ -205,8 +250,6 @@ namespace clover { size_t width, size_t height, size_t depth, size_t row_pitch, size_t slice_pitch, void *host_ptr); - - virtual cl_mem_object_type type() const; }; } diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/printf.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/printf.cpp index d4ec96bd58..bf131c6416 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/printf.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/printf.cpp @@ -40,7 +40,7 @@ namespace { const std::string clc_printf_whitelist = "%0123456789-+ #.AacdeEfFgGhilopsuvxX"; void - print_formatted(const std::vector &formatters, + print_formatted(const std::vector &formatters, bool _strings_in_buffer, const std::vector &buffer) { @@ -51,7 +51,7 @@ namespace { for (size_t buf_pos = 0; buf_pos < buffer.size(); ) { cl_uint fmt_idx = *(cl_uint*)&buffer[buf_pos]; assert(fmt_idx > 0); - module::printf_info fmt = formatters[fmt_idx-1]; + binary::printf_info fmt = formatters[fmt_idx-1]; std::string format = (char *)fmt.strings.data(); buf_pos += sizeof(cl_uint); @@ -175,7 +175,7 @@ namespace { std::unique_ptr printf_handler::create(const intrusive_ptr &q, - const std::vector &infos, + const std::vector &infos, bool strings_in_buffer, cl_uint size) { return std::unique_ptr( @@ -183,7 +183,7 @@ printf_handler::create(const intrusive_ptr &q, } printf_handler::printf_handler(const intrusive_ptr &q, - const std::vector &infos, + const std::vector &infos, bool strings_in_buffer, 
cl_uint size) : _q(q), _formatters(infos), _strings_in_buffer(strings_in_buffer), _size(size), _buffer() { diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/printf.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/printf.hpp index 3fc740c1ae..4d57a7150b 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/printf.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/printf.hpp @@ -32,7 +32,7 @@ namespace clover { public: static std::unique_ptr create(const intrusive_ptr &q, - const std::vector &info, + const std::vector &info, bool strings_in_buffer, cl_uint size); printf_handler(const printf_handler &arg) = delete; @@ -46,11 +46,11 @@ namespace clover { private: printf_handler(const intrusive_ptr &q, - const std::vector &infos, + const std::vector &infos, bool strings_in_buffer, cl_uint size); intrusive_ptr _q; - std::vector _formatters; + std::vector _formatters; bool _strings_in_buffer; cl_uint _size; std::unique_ptr _buffer; diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/program.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/program.cpp index 0d97904d41..43609a2524 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/program.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/program.cpp @@ -33,10 +33,10 @@ program::program(clover::context &ctx, std::string &&source, program::program(clover::context &ctx, const ref_vector &devs, - const std::vector &binaries) : + const std::vector &binaries) : context(ctx), _devices(devs), _kernel_ref_counter(0), _il_type(il_type::none) { - for_each([&](device &dev, const module &bin) { + for_each([&](device &dev, const binary &bin) { _builds[&dev] = { bin }; }, devs, binaries); @@ -52,11 +52,11 @@ program::compile(const ref_vector &devs, const std::string &opts, std::string log; try { - const module m = + const binary b = compiler::compile_program(*this, headers, dev, opts, log); - _builds[&dev] = { m, opts, log }; + _builds[&dev] = { b, opts, log }; } catch (...) { - _builds[&dev] = { module(), opts, log }; + _builds[&dev] = { binary(), opts, log }; throw; } } @@ -69,16 +69,16 @@ program::link(const ref_vector &devs, const std::string &opts, _devices = devs; for (auto &dev : devs) { - const std::vector ms = map([&](const program &prog) { - return prog.build(dev).binary; + const std::vector bs = map([&](const program &prog) { + return prog.build(dev).bin; }, progs); std::string log = _builds[&dev].log; try { - const module m = compiler::link_program(ms, dev, opts, log); - _builds[&dev] = { m, opts, log }; + const binary b = compiler::link_program(bs, dev, opts, log); + _builds[&dev] = { b, opts, log }; } catch (...) 
{ - _builds[&dev] = { module(), opts, log }; + _builds[&dev] = { binary(), opts, log }; throw; } } @@ -101,7 +101,7 @@ program::devices() const { cl_build_status program::build::status() const { - if (!binary.secs.empty()) + if (!bin.secs.empty()) return CL_BUILD_SUCCESS; else if (log.size()) return CL_BUILD_ERROR; @@ -111,11 +111,11 @@ program::build::status() const { cl_program_binary_type program::build::binary_type() const { - if (any_of(type_equals(module::section::text_intermediate), binary.secs)) + if (any_of(type_equals(binary::section::text_intermediate), bin.secs)) return CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; - else if (any_of(type_equals(module::section::text_library), binary.secs)) + else if (any_of(type_equals(binary::section::text_library), bin.secs)) return CL_PROGRAM_BINARY_TYPE_LIBRARY; - else if (any_of(type_equals(module::section::text_executable), binary.secs)) + else if (any_of(type_equals(binary::section::text_executable), bin.secs)) return CL_PROGRAM_BINARY_TYPE_EXECUTABLE; else return CL_PROGRAM_BINARY_TYPE_NONE; @@ -127,12 +127,12 @@ program::build(const device &dev) const { return _builds.count(&dev) ? _builds.find(&dev)->second : null; } -const std::vector & +const std::vector & program::symbols() const { if (_builds.empty()) throw error(CL_INVALID_PROGRAM_EXECUTABLE); - return _builds.begin()->second.binary.syms; + return _builds.begin()->second.bin.syms; } unsigned diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/program.hpp b/mesa 3D driver/src/gallium/frontends/clover/core/program.hpp index 5cc80d2b62..3969f4fd29 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/program.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/program.hpp @@ -27,7 +27,7 @@ #include "core/object.hpp" #include "core/context.hpp" -#include "core/module.hpp" +#include "core/binary.hpp" namespace clover { typedef std::vector> header_map; @@ -45,7 +45,7 @@ namespace clover { enum il_type il_type); program(clover::context &ctx, const ref_vector &devs = {}, - const std::vector &binaries = {}); + const std::vector &binaries = {}); program(const program &prog) = delete; program & @@ -62,20 +62,20 @@ namespace clover { device_range devices() const; struct build { - build(const module &m = {}, const std::string &opts = {}, - const std::string &log = {}) : binary(m), opts(opts), log(log) {} + build(const binary &b = {}, const std::string &opts = {}, + const std::string &log = {}) : bin(b), opts(opts), log(log) {} cl_build_status status() const; cl_program_binary_type binary_type() const; - module binary; + binary bin; std::string opts; std::string log; }; const build &build(const device &dev) const; - const std::vector &symbols() const; + const std::vector &symbols() const; unsigned kernel_ref_count() const; diff --git a/mesa 3D driver/src/gallium/frontends/clover/core/resource.cpp b/mesa 3D driver/src/gallium/frontends/clover/core/resource.cpp index 6ace714a86..484e579863 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/core/resource.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/core/resource.cpp @@ -89,8 +89,8 @@ resource::add_map(command_queue &q, cl_map_flags flags, bool blocking, void resource::del_map(void *p) { - erase_if([&](const mapping &m) { - return static_cast(m) == p; + erase_if([&](const mapping &b) { + return static_cast(b) == p; }, maps); } @@ -164,13 +164,14 @@ root_resource::root_resource(clover::device &dev, memory_obj &obj, info.width0 = img->width(); info.height0 = img->height(); info.depth0 = img->depth(); + 
info.array_size = MAX2(1, img->array_size()); } else { info.width0 = obj.size(); info.height0 = 1; info.depth0 = 1; + info.array_size = 1; } - info.array_size = 1; info.target = translate_target(obj.type()); info.bind = (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_COMPUTE_RESOURCE | diff --git a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen.hpp b/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen.hpp index e35627c472..c07debe031 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen.hpp @@ -23,14 +23,14 @@ /// /// \file /// Tools to generate various forms of binary code from existing LLVM IR in -/// the given llvm::Module object and output the result as a clover::module. +/// the given llvm::Module object and output the result as a clover::binary. /// #ifndef CLOVER_LLVM_CODEGEN_HPP #define CLOVER_LLVM_CODEGEN_HPP #include "llvm/util.hpp" -#include "core/module.hpp" +#include "core/binary.hpp" #include @@ -41,15 +41,15 @@ namespace clover { std::string print_module_bitcode(const ::llvm::Module &mod); - module + binary build_module_library(const ::llvm::Module &mod, - enum module::section::type section_type); + enum binary::section::type section_type); std::unique_ptr< ::llvm::Module> - parse_module_library(const module &m, ::llvm::LLVMContext &ctx, + parse_module_library(const binary &b, ::llvm::LLVMContext &ctx, std::string &r_log); - module + binary build_module_native(::llvm::Module &mod, const target &target, const clang::CompilerInstance &c, std::string &r_log); @@ -57,7 +57,7 @@ namespace clover { std::string print_module_native(const ::llvm::Module &mod, const target &target); - module + binary build_module_common(const ::llvm::Module &mod, const std::vector &code, const std::map &offsets, diff --git a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/bitcode.cpp b/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/bitcode.cpp index 8ea3e57168..102df00014 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/bitcode.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/bitcode.cpp @@ -49,7 +49,7 @@ #endif #include -using clover::module; +using clover::binary; using namespace clover::llvm; namespace { @@ -70,20 +70,20 @@ clover::llvm::print_module_bitcode(const ::llvm::Module &mod) { return os.str(); } -module +binary clover::llvm::build_module_library(const ::llvm::Module &mod, - enum module::section::type section_type) { - module m; + enum binary::section::type section_type) { + binary b; const auto code = emit_code(mod); - m.secs.emplace_back(0, section_type, code.size(), code); - return m; + b.secs.emplace_back(0, section_type, code.size(), code); + return b; } std::unique_ptr< ::llvm::Module> -clover::llvm::parse_module_library(const module &m, ::llvm::LLVMContext &ctx, +clover::llvm::parse_module_library(const binary &b, ::llvm::LLVMContext &ctx, std::string &r_log) { auto mod = ::llvm::parseBitcodeFile(::llvm::MemoryBufferRef( - as_string(m.secs[0].data), " "), ctx); + as_string(b.secs[0].data), " "), ctx); if (::llvm::Error err = mod.takeError()) { ::llvm::handleAllErrors(std::move(err), [&](::llvm::ErrorInfoBase &eib) { diff --git a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/common.cpp b/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/common.cpp index 2ed46794de..ff87d9c2a0 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/common.cpp +++ b/mesa 3D 
driver/src/gallium/frontends/clover/llvm/codegen/common.cpp @@ -25,7 +25,7 @@ /// /// \file /// Codegen back-end-independent part of the construction of an executable -/// clover::module, including kernel argument metadata extraction and +/// clover::binary, including kernel argument metadata extraction and /// formatting of the pre-generated binary code in a form that can be /// understood by pipe drivers. /// @@ -42,7 +42,7 @@ #include -using clover::module; +using clover::binary; using clover::detokenize; using namespace clover::llvm; @@ -54,20 +54,20 @@ using ::llvm::cast; using ::llvm::dyn_cast; namespace { - enum module::argument::type + enum binary::argument::type get_image_type(const std::string &type, const std::string &qual) { if (type == "image1d_t" || type == "image2d_t" || type == "image3d_t") { if (qual == "read_only") - return module::argument::image_rd; + return binary::argument::image_rd; else if (qual == "write_only") - return module::argument::image_wr; + return binary::argument::image_wr; } unreachable("Unsupported image type"); } - module::arg_info create_arg_info(const std::string &arg_name, + binary::arg_info create_arg_info(const std::string &arg_name, const std::string &type_name, const std::string &type_qualifier, const uint64_t address_qualifier, @@ -100,7 +100,7 @@ namespace { else if (access_qualifier == "read_write") cl_access_qualifier = CL_KERNEL_ARG_ACCESS_READ_WRITE; - return module::arg_info(arg_name, type_name, cl_type_qualifier, + return binary::arg_info(arg_name, type_name, cl_type_qualifier, cl_address_qualifier, cl_access_qualifier); } @@ -147,10 +147,10 @@ namespace { return detokenize(attributes, " "); } - std::vector + std::vector make_kernel_args(const Module &mod, const std::string &kernel_name, const clang::CompilerInstance &c) { - std::vector args; + std::vector args; const Function &f = *mod.getFunction(kernel_name); ::llvm::DataLayout dl(&mod); const auto size_type = @@ -176,28 +176,28 @@ namespace { f, arg, "kernel_arg_access_qual"); args.emplace_back(get_image_type(type_name, access_qual), target_size, target_size, - target_align, module::argument::zero_ext); + target_align, binary::argument::zero_ext); } else if (type_name == "sampler_t") { - args.emplace_back(module::argument::sampler, arg_api_size, + args.emplace_back(binary::argument::sampler, arg_api_size, target_size, target_align, - module::argument::zero_ext); + binary::argument::zero_ext); } else if (type_name == "__llvm_image_size") { // Image size implicit argument. - args.emplace_back(module::argument::scalar, sizeof(cl_uint), + args.emplace_back(binary::argument::scalar, sizeof(cl_uint), dl.getTypeStoreSize(size_type), dl.getABITypeAlignment(size_type), - module::argument::zero_ext, - module::argument::image_size); + binary::argument::zero_ext, + binary::argument::image_size); } else if (type_name == "__llvm_image_format") { // Image format implicit argument. - args.emplace_back(module::argument::scalar, sizeof(cl_uint), + args.emplace_back(binary::argument::scalar, sizeof(cl_uint), dl.getTypeStoreSize(size_type), dl.getABITypeAlignment(size_type), - module::argument::zero_ext, - module::argument::image_format); + binary::argument::zero_ext, + binary::argument::image_format); } else { // Other types. 
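As a concrete illustration of the records this function accumulates (the file's `using clover::binary;` is in scope; the sizes below are made up, since the real values come from the LLVM DataLayout):

   // Hypothetical result of make_kernel_args() for:
   //    kernel void f(global int *p, short x);
   // on a 64-bit target.
   std::vector<binary::argument> args {
      // global pointer: 8-byte buffer handle, zero-extended
      { binary::argument::global, 8, 8, 8, binary::argument::zero_ext },
      // short: 2 bytes, sign-extended because clang tags the parameter SExt
      { binary::argument::scalar, 2, 2, 2, binary::argument::sign_ext }
   };

The implicit grid_dimension and grid_offset scalars appended in the following hunks then land at the end of this vector.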
@@ -215,10 +215,10 @@ namespace { if (address_space == map[offset]) { const auto pointee_type = cast< ::llvm::PointerType>(actual_type)->getElementType(); - args.emplace_back(module::argument::local, arg_api_size, + args.emplace_back(binary::argument::local, arg_api_size, target_size, dl.getABITypeAlignment(pointee_type), - module::argument::zero_ext); + binary::argument::zero_ext); } else { // XXX: Correctly handle constant address space. There is no // way for r600g to pass a handle for constant buffers back @@ -227,19 +227,19 @@ namespace { // continue treating constant buffers as global buffers // until we can come up with a way to create handles for // constant buffers. - args.emplace_back(module::argument::global, arg_api_size, + args.emplace_back(binary::argument::global, arg_api_size, target_size, target_align, - module::argument::zero_ext); + binary::argument::zero_ext); } } else { - const bool needs_sign_ext = f.getAttributes().hasAttribute( - arg.getArgNo() + 1, ::llvm::Attribute::SExt); + const bool needs_sign_ext = f.getAttributes().hasParamAttr( + arg.getArgNo(), ::llvm::Attribute::SExt); - args.emplace_back(module::argument::scalar, arg_api_size, + args.emplace_back(binary::argument::scalar, arg_api_size, target_size, target_align, - (needs_sign_ext ? module::argument::sign_ext : - module::argument::zero_ext)); + (needs_sign_ext ? binary::argument::sign_ext : + binary::argument::zero_ext)); } // Add kernel argument infos if built with -cl-kernel-arg-info. @@ -257,25 +257,25 @@ namespace { // Append implicit arguments. XXX - The types, ordering and // vector size of the implicit arguments should depend on the // target according to the selected calling convention. - args.emplace_back(module::argument::scalar, sizeof(cl_uint), + args.emplace_back(binary::argument::scalar, sizeof(cl_uint), dl.getTypeStoreSize(size_type), dl.getABITypeAlignment(size_type), - module::argument::zero_ext, - module::argument::grid_dimension); + binary::argument::zero_ext, + binary::argument::grid_dimension); - args.emplace_back(module::argument::scalar, sizeof(cl_uint), + args.emplace_back(binary::argument::scalar, sizeof(cl_uint), dl.getTypeStoreSize(size_type), dl.getABITypeAlignment(size_type), - module::argument::zero_ext, - module::argument::grid_offset); + binary::argument::zero_ext, + binary::argument::grid_offset); return args; } - module::section + binary::section make_text_section(const std::vector &code) { const pipe_binary_program_header header { uint32_t(code.size()) }; - module::section text { 0, module::section::text_executable, + binary::section text { 0, binary::section::text_executable, header.num_bytes, {} }; text.data.insert(text.data.end(), reinterpret_cast(&header), @@ -286,24 +286,24 @@ namespace { } } -module +binary clover::llvm::build_module_common(const Module &mod, const std::vector &code, const std::map &offsets, const clang::CompilerInstance &c) { - module m; + binary b; for (const auto &llvm_name : map(std::mem_fn(&Function::getName), get_kernels(mod))) { const ::std::string name(llvm_name); if (offsets.count(name)) - m.syms.emplace_back(name, kernel_attributes(mod, name), + b.syms.emplace_back(name, kernel_attributes(mod, name), get_reqd_work_group_size(mod, name), 0, offsets.at(name), make_kernel_args(mod, name, c)); } - m.secs.push_back(make_text_section(code)); - return m; + b.secs.push_back(make_text_section(code)); + return b; } diff --git a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/native.cpp b/mesa 3D 
driver/src/gallium/frontends/clover/llvm/codegen/native.cpp index 683e966ea5..03166b26d2 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/native.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/llvm/codegen/native.cpp @@ -28,7 +28,6 @@ /// #include -#include #include #include "llvm/codegen.hpp" @@ -36,7 +35,7 @@ #include "llvm/util.hpp" #include "core/error.hpp" -using clover::module; +using clover::binary; using clover::build_error; using namespace clover::llvm; using ::llvm::TargetMachine; @@ -143,7 +142,7 @@ namespace { } } -module +binary clover::llvm::build_module_native(::llvm::Module &mod, const target &target, const clang::CompilerInstance &c, std::string &r_log) { @@ -167,7 +166,7 @@ clover::llvm::print_module_native(const ::llvm::Module &mod, #else -module +binary clover::llvm::build_module_native(::llvm::Module &mod, const target &target, const clang::CompilerInstance &c, std::string &r_log) { diff --git a/mesa 3D driver/src/gallium/frontends/clover/llvm/compat.hpp b/mesa 3D driver/src/gallium/frontends/clover/llvm/compat.hpp index 89aa0dfbf5..8c369b2cac 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/llvm/compat.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/llvm/compat.hpp @@ -56,6 +56,12 @@ #include #endif +#if LLVM_VERSION_MAJOR >= 14 +#include +#else +#include +#endif + namespace clover { namespace llvm { namespace compat { diff --git a/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.cpp b/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.cpp index c3f54c1ed2..1e5f3266c2 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.cpp @@ -60,7 +60,7 @@ #include "util/algorithm.hpp" -using clover::module; +using clover::binary; using clover::device; using clover::build_error; using clover::invalid_build_options_error; @@ -389,7 +389,7 @@ namespace { #endif } -module +binary clover::llvm::compile_program(const std::string &source, const header_map &headers, const device &dev, @@ -407,7 +407,7 @@ clover::llvm::compile_program(const std::string &source, if (has_flag(debug::llvm)) debug::log(".ll", print_module_bitcode(*mod)); - return build_module_library(*mod, module::section::text_intermediate); + return build_module_library(*mod, binary::section::text_intermediate); } namespace { @@ -421,10 +421,10 @@ namespace { // functions as internal enables the optimizer to perform optimizations // like function inlining and global dead-code elimination. // - // When there is no "main" function in a module, the internalize pass will - // treat the module like a library, and it won't internalize any functions. + // When there is no "main" function in a binary, the internalize pass will + // treat the binary like a library, and it won't internalize any functions. // Since there is no "main" function in our kernels, we need to tell - // the internalizer pass that this module is not a library by passing a + // the internalizer pass that this binary is not a library by passing a // list of kernel functions to the internalizer. The internalizer will // treat the functions in the list as "main" functions and internalize // all of the other functions. 
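The idea in the comment above, sketched against LLVM's public internalize API (llvm::internalizeModule from llvm/Transforms/IPO/Internalize.h); clover's actual pass setup differs in detail:

   #include <set>
   #include <string>
   #include "llvm/IR/Module.h"
   #include "llvm/Transforms/IPO/Internalize.h"

   // Sketch only: preserve the CL kernels, internalize everything else so
   // that inlining and global dead-code elimination can do their job.
   static void
   internalize_all_but_kernels(llvm::Module &mod,
                               const std::set<std::string> &kernels) {
      llvm::internalizeModule(mod, [&](const llvm::GlobalValue &gv) {
         // Returning true keeps the symbol externally visible, i.e. the
         // kernels play the role of "main" functions here.
         return kernels.count(gv.getName().str()) != 0;
      });
   }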
@@ -448,12 +448,12 @@ namespace { std::unique_ptr link(LLVMContext &ctx, const clang::CompilerInstance &c, - const std::vector &modules, std::string &r_log) { + const std::vector &binaries, std::string &r_log) { std::unique_ptr mod { new Module("link", ctx) }; std::unique_ptr< ::llvm::Linker> linker { new ::llvm::Linker(*mod) }; - for (auto &m : modules) { - if (linker->linkInModule(parse_module_library(m, ctx, r_log))) + for (auto &b : binaries) { + if (linker->linkInModule(parse_module_library(b, ctx, r_log))) throw build_error(); } @@ -461,8 +461,8 @@ namespace { } } -module -clover::llvm::link_program(const std::vector &modules, +binary +clover::llvm::link_program(const std::vector &binaries, const device &dev, const std::string &opts, std::string &r_log) { std::vector options = tokenize(opts + " input.cl"); @@ -471,7 +471,7 @@ clover::llvm::link_program(const std::vector &modules, auto ctx = create_context(r_log); auto c = create_compiler_instance(dev, dev.ir_target(), options, r_log); - auto mod = link(*ctx, *c, modules, r_log); + auto mod = link(*ctx, *c, binaries, r_log); optimize(*mod, c->getCodeGenOpts().OptimizationLevel, !create_library); @@ -483,7 +483,7 @@ clover::llvm::link_program(const std::vector &modules, debug::log(id + ".ll", print_module_bitcode(*mod)); if (create_library) { - return build_module_library(*mod, module::section::text_library); + return build_module_library(*mod, binary::section::text_library); } else if (dev.ir_format() == PIPE_SHADER_IR_NATIVE) { if (has_flag(debug::native)) @@ -497,7 +497,7 @@ clover::llvm::link_program(const std::vector &modules, } #ifdef HAVE_CLOVER_SPIRV -module +binary clover::llvm::compile_to_spirv(const std::string &source, const header_map &headers, const device &dev, diff --git a/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.hpp b/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.hpp index 1f0e9db2cf..f6304978f6 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/llvm/invocation.hpp @@ -24,25 +24,25 @@ #define CLOVER_LLVM_INVOCATION_HPP #include "core/error.hpp" -#include "core/module.hpp" +#include "core/binary.hpp" #include "core/program.hpp" #include "pipe/p_defines.h" namespace clover { namespace llvm { - module compile_program(const std::string &source, + binary compile_program(const std::string &source, const header_map &headers, const device &device, const std::string &opts, std::string &r_log); - module link_program(const std::vector &modules, + binary link_program(const std::vector &binaries, const device &device, const std::string &opts, std::string &r_log); #ifdef HAVE_CLOVER_SPIRV - module compile_to_spirv(const std::string &source, + binary compile_to_spirv(const std::string &source, const header_map &headers, const device &dev, const std::string &opts, diff --git a/mesa 3D driver/src/gallium/frontends/clover/meson.build b/mesa 3D driver/src/gallium/frontends/clover/meson.build index b6a231f979..b7240c9c03 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/meson.build +++ b/mesa 3D driver/src/gallium/frontends/clover/meson.build @@ -113,6 +113,8 @@ clover_files = files( 'api/sampler.cpp', 'api/transfer.cpp', 'api/util.hpp', + 'core/binary.cpp', + 'core/binary.hpp', 'core/compiler.hpp', 'core/context.cpp', 'core/context.hpp', @@ -127,8 +129,6 @@ clover_files = files( 'core/kernel.hpp', 'core/memory.cpp', 'core/memory.hpp', - 'core/module.cpp', - 'core/module.hpp', 'core/object.hpp', 'core/platform.cpp', 
'core/platform.hpp', @@ -148,6 +148,7 @@ clover_files = files( 'util/adaptor.hpp', 'util/algebra.hpp', 'util/algorithm.hpp', + 'util/compat.hpp', 'util/factor.hpp', 'util/functional.hpp', 'util/lazy.hpp', diff --git a/mesa 3D driver/src/gallium/frontends/clover/nir/invocation.cpp b/mesa 3D driver/src/gallium/frontends/clover/nir/invocation.cpp index 70d47d964a..f7802aa40f 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/nir/invocation.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/nir/invocation.cpp @@ -26,7 +26,7 @@ #include "core/device.hpp" #include "core/error.hpp" -#include "core/module.hpp" +#include "core/binary.hpp" #include "pipe/p_state.h" #include "util/algorithm.hpp" #include "util/functional.hpp" @@ -72,49 +72,71 @@ static void debug_function(void *private_data, static void clover_arg_size_align(const glsl_type *type, unsigned *size, unsigned *align) { - if (type == glsl_type::sampler_type) { + if (type == glsl_type::sampler_type || type->is_image()) { *size = 0; *align = 1; - } else if (type->is_image()) { - *size = *align = sizeof(cl_mem); } else { *size = type->cl_size(); *align = type->cl_alignment(); } } +static void +clover_nir_add_image_uniforms(nir_shader *shader) +{ + /* Clover expects each image variable to take up a cl_mem worth of space in + * the arguments data. Add uniforms as needed to match this expectation. + */ + nir_foreach_image_variable_safe(var, shader) { + nir_variable *uniform = rzalloc(shader, nir_variable); + uniform->name = ralloc_strdup(uniform, var->name); + uniform->type = glsl_uintN_t_type(sizeof(cl_mem) * 8); + uniform->data.mode = nir_var_uniform; + uniform->data.read_only = true; + uniform->data.location = var->data.location; + + exec_node_insert_node_before(&var->node, &uniform->node); + } +} + static bool clover_nir_lower_images(nir_shader *shader) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); ASSERTED int last_loc = -1; - int num_rd_images = 0, num_wr_images = 0, num_samplers = 0; + int num_rd_images = 0, num_wr_images = 0; + nir_foreach_image_variable(var, shader) { + /* Assume they come in order */ + assert(var->data.location > last_loc); + last_loc = var->data.location; + + if (var->data.access & ACCESS_NON_WRITEABLE) + var->data.driver_location = num_rd_images++; + else + var->data.driver_location = num_wr_images++; + } + shader->info.num_textures = num_rd_images; + BITSET_ZERO(shader->info.textures_used); + if (num_rd_images) + BITSET_SET_RANGE_INSIDE_WORD(shader->info.textures_used, 0, num_rd_images - 1); + shader->info.num_images = num_wr_images; + + last_loc = -1; + int num_samplers = 0; nir_foreach_uniform_variable(var, shader) { - if (glsl_type_is_image(var->type) || glsl_type_is_sampler(var->type)) { + if (var->type == glsl_bare_sampler_type()) { /* Assume they come in order */ assert(var->data.location > last_loc); last_loc = var->data.location; - } - /* TODO: Constant samplers */ - if (var->type == glsl_bare_sampler_type()) { + /* TODO: Constant samplers */ var->data.driver_location = num_samplers++; - } else if (glsl_type_is_image(var->type)) { - if (var->data.access & ACCESS_NON_WRITEABLE) - var->data.driver_location = num_rd_images++; - else - var->data.driver_location = num_wr_images++; } else { /* CL shouldn't have any sampled images */ assert(!glsl_type_is_sampler(var->type)); } } - shader->info.num_textures = num_rd_images; - BITSET_ZERO(shader->info.textures_used); - if (num_rd_images) - BITSET_SET_RANGE(shader->info.textures_used, 0, num_rd_images - 1); - shader->info.num_images = 
num_wr_images; nir_builder b; nir_builder_init(&b, impl); @@ -238,7 +260,7 @@ clover_nir_lower_images(nir_shader *shader) } struct clover_lower_nir_state { - std::vector &args; + std::vector &args; uint32_t global_dims; nir_variable *constant_var; nir_variable *printf_buffer; @@ -261,9 +283,9 @@ clover_lower_nir_instr(nir_builder *b, nir_instr *instr, void *_state) case nir_intrinsic_load_printf_buffer_address: { if (!state->printf_buffer) { unsigned location = state->args.size(); - state->args.emplace_back(module::argument::global, sizeof(size_t), - 8, 8, module::argument::zero_ext, - module::argument::printf_buffer); + state->args.emplace_back(binary::argument::global, sizeof(size_t), + 8, 8, binary::argument::zero_ext, + binary::argument::printf_buffer); const glsl_type *type = glsl_uint64_t_type(); state->printf_buffer = nir_variable_create(b->shader, nir_var_uniform, @@ -282,9 +304,9 @@ clover_lower_nir_instr(nir_builder *b, nir_instr *instr, void *_state) * three 32 bit values */ unsigned location = state->args.size(); - state->args.emplace_back(module::argument::scalar, 4, 4, 4, - module::argument::zero_ext, - module::argument::grid_offset); + state->args.emplace_back(binary::argument::scalar, 4, 4, 4, + binary::argument::zero_ext, + binary::argument::grid_offset); const glsl_type *type = glsl_uint_type(); for (uint32_t i = 0; i < 3; i++) { @@ -313,7 +335,7 @@ clover_lower_nir_instr(nir_builder *b, nir_instr *instr, void *_state) } static bool -clover_lower_nir(nir_shader *nir, std::vector &args, +clover_lower_nir(nir_shader *nir, std::vector &args, uint32_t dims, uint32_t pointer_bit_size) { nir_variable *constant_var = NULL; @@ -324,10 +346,10 @@ clover_lower_nir(nir_shader *nir, std::vector &args, "constant_buffer_addr"); constant_var->data.location = args.size(); - args.emplace_back(module::argument::global, sizeof(cl_mem), + args.emplace_back(binary::argument::global, sizeof(cl_mem), pointer_bit_size / 8, pointer_bit_size / 8, - module::argument::zero_ext, - module::argument::constant_buffer); + binary::argument::zero_ext, + binary::argument::constant_buffer); } clover_lower_nir_state state = { args, dims, constant_var }; @@ -396,19 +418,27 @@ nir_shader *clover::nir::load_libclc_nir(const device &dev, std::string &r_log) &spirv_options, compiler_options); } -module clover::nir::spirv_to_nir(const module &mod, const device &dev, +static bool +can_remove_var(nir_variable *var, void *data) +{ + return !(var->type->is_sampler() || + var->type->is_texture() || + var->type->is_image()); +} + +binary clover::nir::spirv_to_nir(const binary &mod, const device &dev, std::string &r_log) { spirv_to_nir_options spirv_options = create_spirv_options(dev, r_log); std::shared_ptr nir = dev.clc_nir; spirv_options.clc_shader = nir.get(); - module m; + binary b; // We only insert one section. 
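/* The hidden-argument pattern clover_lower_nir uses above, as a
 * self-contained sketch; argument here is a stand-in for binary::argument:
 *
 *    struct argument { int semantic; unsigned size; };
 *
 *    static unsigned add_hidden_arg(std::vector<argument> &args,
 *                                   int semantic, unsigned size) {
 *       unsigned location = args.size();  // becomes the uniform's location
 *       args.push_back({semantic, size});
 *       return location;
 *    }
 *
 * printf_buffer, grid_offset and constant_buffer are all appended this way,
 * so the hidden arguments simply trail the user-visible ones.
 */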
assert(mod.secs.size() == 1); auto §ion = mod.secs[0]; - module::resource_id section_id = 0; + binary::resource_id section_id = 0; for (const auto &sym : mod.syms) { assert(sym.section == 0); @@ -496,6 +526,7 @@ module clover::nir::spirv_to_nir(const module &mod, const device &dev, NIR_PASS_V(nir, clover_lower_nir, args, dev.max_block_size().size(), dev.address_bits()); + NIR_PASS_V(nir, clover_nir_add_image_uniforms); NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_uniform, clover_arg_size_align); NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, @@ -525,7 +556,10 @@ module clover::nir::spirv_to_nir(const module &mod, const device &dev, NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global, spirv_options.global_addr_format); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL); + struct nir_remove_dead_variables_options remove_dead_variables_options = { + .can_remove_var = can_remove_var, + }; + NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, &remove_dead_variables_options); if (compiler_options->lower_int64_options) NIR_PASS_V(nir, nir_lower_int64); @@ -534,15 +568,15 @@ module clover::nir::spirv_to_nir(const module &mod, const device &dev, if (nir->constant_data_size) { const char *ptr = reinterpret_cast(nir->constant_data); - const module::section constants { + const binary::section constants { section_id, - module::section::data_constant, + binary::section::data_constant, nir->constant_data_size, { ptr, ptr + nir->constant_data_size } }; nir->constant_data = NULL; nir->constant_data_size = 0; - m.secs.push_back(constants); + b.secs.push_back(constants); } void *mem_ctx = ralloc_context(NULL); @@ -558,17 +592,17 @@ module clover::nir::spirv_to_nir(const module &mod, const device &dev, ralloc_free(nir); const pipe_binary_program_header header { uint32_t(blob.size) }; - module::section text { section_id, module::section::text_executable, header.num_bytes, {} }; + binary::section text { section_id, binary::section::text_executable, header.num_bytes, {} }; text.data.insert(text.data.end(), reinterpret_cast(&header), reinterpret_cast(&header) + sizeof(header)); text.data.insert(text.data.end(), blob.data, blob.data + blob.size); free(blob.data); - m.printf_strings_in_buffer = false; - m.printf_infos.reserve(printf_info_count); + b.printf_strings_in_buffer = false; + b.printf_infos.reserve(printf_info_count); for (unsigned i = 0; i < printf_info_count; i++) { - module::printf_info info; + binary::printf_info info; info.arg_sizes.reserve(printf_infos[i].num_args); for (unsigned j = 0; j < printf_infos[i].num_args; j++) @@ -576,20 +610,20 @@ module clover::nir::spirv_to_nir(const module &mod, const device &dev, info.strings.resize(printf_infos[i].string_size); memcpy(info.strings.data(), printf_infos[i].strings, printf_infos[i].string_size); - m.printf_infos.push_back(info); + b.printf_infos.push_back(info); } ralloc_free(mem_ctx); - m.syms.emplace_back(sym.name, sym.attributes, + b.syms.emplace_back(sym.name, sym.attributes, sym.reqd_work_group_size, section_id, 0, args); - m.secs.push_back(text); + b.secs.push_back(text); section_id++; } - return m; + return b; } #else -module clover::nir::spirv_to_nir(const module &mod, const device &dev, std::string &r_log) +binary clover::nir::spirv_to_nir(const binary &mod, const device &dev, std::string &r_log) { r_log += "SPIR-V support in clover is not enabled.\n"; throw error(CL_LINKER_NOT_AVAILABLE); diff --git a/mesa 3D driver/src/gallium/frontends/clover/nir/invocation.hpp b/mesa 3D 
driver/src/gallium/frontends/clover/nir/invocation.hpp index 4d2acedc68..873f625ef9 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/nir/invocation.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/nir/invocation.hpp @@ -23,7 +23,7 @@ #ifndef CLOVER_NIR_INVOCATION_HPP #define CLOVER_NIR_INVOCATION_HPP -#include "core/module.hpp" +#include "core/binary.hpp" #include struct nir_shader; @@ -38,8 +38,8 @@ namespace clover { struct disk_cache *create_clc_disk_cache(void); - // converts a given spirv module to nir - module spirv_to_nir(const module &mod, const device &dev, std::string &r_log); + // converts a given spirv binary to nir + binary spirv_to_nir(const binary &bin, const device &dev, std::string &r_log); } } diff --git a/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.cpp b/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.cpp index fc6cc4bfb4..48f0d9e1fa 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.cpp +++ b/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.cpp @@ -62,17 +62,17 @@ namespace { return static_cast(word_ptr[index]); } - enum module::argument::type + enum binary::argument::type convert_storage_class(SpvStorageClass storage_class, std::string &err) { switch (storage_class) { case SpvStorageClassFunction: - return module::argument::scalar; + return binary::argument::scalar; case SpvStorageClassUniformConstant: - return module::argument::global; + return binary::argument::global; case SpvStorageClassWorkgroup: - return module::argument::local; + return binary::argument::local; case SpvStorageClassCrossWorkgroup: - return module::argument::global; + return binary::argument::global; default: err += "Invalid storage type " + std::to_string(storage_class) + "\n"; throw build_error(); @@ -94,7 +94,7 @@ namespace { } } - enum module::argument::type + enum binary::argument::type convert_image_type(SpvId id, SpvDim dim, SpvAccessQualifier access, std::string &err) { switch (dim) { @@ -104,9 +104,9 @@ namespace { case SpvDimBuffer: switch (access) { case SpvAccessQualifierReadOnly: - return module::argument::image_rd; + return binary::argument::image_rd; case SpvAccessQualifierWriteOnly: - return module::argument::image_wr; + return binary::argument::image_wr; default: err += "Unknown access qualifier " + std::to_string(access) + " for image " + std::to_string(id) + ".\n"; @@ -119,11 +119,11 @@ namespace { } } - module::section + binary::section make_text_section(const std::string &code, - enum module::section::type section_type) { + enum binary::section::type section_type) { const pipe_binary_program_header header { uint32_t(code.size()) }; - module::section text { 0, section_type, header.num_bytes, {} }; + binary::section text { 0, section_type, header.num_bytes, {} }; text.data.insert(text.data.end(), reinterpret_cast(&header), reinterpret_cast(&header) + sizeof(header)); @@ -132,8 +132,8 @@ namespace { return text; } - module - create_module_from_spirv(const std::string &source, + binary + create_binary_from_spirv(const std::string &source, size_t pointer_byte_size, std::string &err) { const size_t length = source.size() / sizeof(uint32_t); @@ -141,15 +141,15 @@ namespace { std::string kernel_name; size_t kernel_nb = 0u; - std::vector args; + std::vector args; std::vector req_local_size; - module m; + binary b; std::vector attributes; std::unordered_map > req_local_sizes; std::unordered_map kernels; - std::unordered_map types; + std::unordered_map types; std::unordered_map pointer_types; std::unordered_map 
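/* Sketch of the make_text_section payload built above: a
 * pipe_binary_program_header (a single uint32_t byte count) followed
 * directly by the program text:
 *
 *    std::vector<char> pack_text(const std::string &code) {
 *       const uint32_t num_bytes = uint32_t(code.size());
 *       const char *p = reinterpret_cast<const char *>(&num_bytes);
 *       std::vector<char> data(p, p + sizeof(num_bytes));
 *       data.insert(data.end(), code.begin(), code.end());
 *       return data;
 *    }
 */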
constants; std::unordered_set packed_structures; @@ -291,7 +291,7 @@ namespace { case SpvOpConstant: // We only care about constants that represent the size of arrays. // If they are passed as argument, they will never be more than - // 4GB-wide, and even if they did, a clover::module::argument size + // 4GB-wide, and even if they did, a clover::binary::argument size // is represented by an int. constants[get(inst, 2)] = get(inst, 3u); break; @@ -300,8 +300,8 @@ namespace { case SpvOpTypeFloat: { const auto size = get(inst, 2) / 8u; const auto id = get(inst, 1); - types[id] = { module::argument::scalar, size, size, size, - module::argument::zero_ext }; + types[id] = { binary::argument::scalar, size, size, size, + binary::argument::zero_ext }; types[id].info.address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break; } @@ -323,9 +323,9 @@ namespace { const auto elem_size = types_iter->second.size; const auto elem_nbs = constants_iter->second; const auto size = elem_size * elem_nbs; - types[id] = { module::argument::scalar, size, size, + types[id] = { binary::argument::scalar, size, size, types_iter->second.target_align, - module::argument::zero_ext }; + binary::argument::zero_ext }; break; } @@ -340,7 +340,7 @@ namespace { const auto types_iter = types.find(type_id); // If a type was not found, that means it is not one of the - // types allowed as kernel arguments. And since the module has + // types allowed as kernel arguments. And since the binary has // been validated, this means this type is not used for kernel // arguments, and therefore can be ignored. if (types_iter == types.end()) @@ -353,8 +353,8 @@ namespace { struct_align = std::max(struct_align, alignment); } struct_size += (-struct_size) & (struct_align - 1u); - types[id] = { module::argument::scalar, struct_size, struct_size, - struct_align, module::argument::zero_ext }; + types[id] = { binary::argument::scalar, struct_size, struct_size, + struct_align, binary::argument::zero_ext }; break; } @@ -364,7 +364,7 @@ namespace { const auto types_iter = types.find(type_id); // If a type was not found, that means it is not one of the - // types allowed as kernel arguments. And since the module has + // types allowed as kernel arguments. And since the binary has // been validated, this means this type is not used for kernel // arguments, and therefore can be ignored. if (types_iter == types.end()) @@ -373,8 +373,8 @@ namespace { const auto elem_size = types_iter->second.size; const auto elem_nbs = get(inst, 3); const auto size = elem_size * (elem_nbs != 3 ? elem_nbs : 4); - types[id] = { module::argument::scalar, size, size, size, - module::argument::zero_ext }; + types[id] = { binary::argument::scalar, size, size, size, + binary::argument::zero_ext }; types[id].info.address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break; } @@ -391,7 +391,7 @@ namespace { if (opcode == SpvOpTypePointer) pointer_types[id] = get(inst, 3); - module::size_t alignment; + binary::size_t alignment; if (storage_class == SpvStorageClassWorkgroup) alignment = opcode == SpvOpTypePointer ? 
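/* Two sizing rules from the type hunks above, spelled out:
 *
 *    // struct padding: round size up to a power-of-two alignment
 *    uint32_t align_up(uint32_t size, uint32_t align) {
 *       return size + ((-size) & (align - 1u));   // align_up(13, 8) == 16
 *    }
 *
 *    // OpenCL 3-component vectors occupy four components' worth of space
 *    uint32_t vec_size(uint32_t elem_size, uint32_t n) {
 *       return elem_size * (n != 3 ? n : 4);      // vec_size(4, 3) == 16
 *    }
 */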
types[pointer_types[id]].target_align : 0; else @@ -399,15 +399,15 @@ namespace { types[id] = { convert_storage_class(storage_class, err), sizeof(cl_mem), - static_cast(pointer_byte_size), + static_cast(pointer_byte_size), alignment, - module::argument::zero_ext }; + binary::argument::zero_ext }; types[id].info.address_qualifier = convert_storage_class_to_cl(storage_class); break; } case SpvOpTypeSampler: - types[get(inst, 1)] = { module::argument::sampler, + types[get(inst, 1)] = { binary::argument::sampler, sizeof(cl_sampler) }; break; @@ -417,7 +417,7 @@ namespace { const auto access = get(inst, 9); types[id] = { convert_image_type(id, dim, access, err), sizeof(cl_mem), sizeof(cl_mem), sizeof(cl_mem), - module::argument::zero_ext }; + binary::argument::zero_ext }; break; } @@ -457,10 +457,10 @@ namespace { for (auto &i : func_param_attr_iter->second) { switch (i) { case SpvFunctionParameterAttributeSext: - arg.ext_type = module::argument::sign_ext; + arg.ext_type = binary::argument::sign_ext; break; case SpvFunctionParameterAttributeZext: - arg.ext_type = module::argument::zero_ext; + arg.ext_type = binary::argument::zero_ext; break; case SpvFunctionParameterAttributeByVal: { const SpvId ptr_type_id = @@ -498,7 +498,7 @@ namespace { for (size_t i = 0; i < param_type_names[kernel_name].size(); i++) args[i].info.type_name = param_type_names[kernel_name][i]; - m.syms.emplace_back(kernel_name, detokenize(attributes, " "), + b.syms.emplace_back(kernel_name, detokenize(attributes, " "), req_local_size, 0, kernel_nb, args); ++kernel_nb; kernel_name.clear(); @@ -513,9 +513,9 @@ namespace { i += num_operands; } - m.secs.push_back(make_text_section(source, - module::section::text_intermediate)); - return m; + b.secs.push_back(make_text_section(source, + binary::section::text_intermediate)); + return b; } bool @@ -773,7 +773,7 @@ clover::spirv::version_to_string(uint32_t version) { std::to_string(minor_version); } -module +binary clover::spirv::compile_program(const std::string &binary, const device &dev, std::string &r_log, bool validate) { @@ -791,12 +791,12 @@ clover::spirv::compile_program(const std::string &binary, if (!check_memory_model(dev, source, r_log)) throw build_error(); - return create_module_from_spirv(source, + return create_binary_from_spirv(source, dev.address_bits() == 32 ? 4u : 8u, r_log); } -module -clover::spirv::link_program(const std::vector &modules, +binary +clover::spirv::link_program(const std::vector &binaries, const device &dev, const std::string &opts, std::string &r_log) { std::vector options = tokenize(opts); @@ -819,15 +819,15 @@ clover::spirv::link_program(const std::vector &modules, spvtools::LinkerOptions linker_options; linker_options.SetCreateLibrary(create_library); - module m; + binary b; - const auto section_type = create_library ? module::section::text_library : - module::section::text_executable; + const auto section_type = create_library ? 
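/* The choice being completed on this line, together with the linker setup
 * just above, in short:
 *
 *    spvtools::LinkerOptions linker_options;
 *    linker_options.SetCreateLibrary(create_library);
 *    const auto section_type = create_library
 *       ? binary::section::text_library      // stays linkable later
 *       : binary::section::text_executable;  // fully resolved now
 */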
binary::section::text_library : + binary::section::text_executable; std::vector sections; - sections.reserve(modules.size()); + sections.reserve(binaries.size()); std::vector lengths; - lengths.reserve(modules.size()); + lengths.reserve(binaries.size()); auto const validator_consumer = [&r_log](spv_message_level_t level, const char *source, @@ -836,14 +836,14 @@ clover::spirv::link_program(const std::vector &modules, r_log += format_validator_msg(level, source, position, message); }; - for (const auto &mod : modules) { - const auto &msec = find([](const module::section &sec) { - return sec.type == module::section::text_intermediate || - sec.type == module::section::text_library; - }, mod.secs); + for (const auto &bin : binaries) { + const auto &bsec = find([](const binary::section &sec) { + return sec.type == binary::section::text_intermediate || + sec.type == binary::section::text_library; + }, bin.secs); - const auto c_il = ((struct pipe_binary_program_header*)msec.data.data())->blob; - const auto length = msec.size; + const auto c_il = ((struct pipe_binary_program_header*)bsec.data.data())->blob; + const auto length = bsec.size; if (!check_spirv_version(dev, c_il, r_log)) throw error(CL_LINK_PROGRAM_FAILURE); @@ -876,12 +876,12 @@ clover::spirv::link_program(const std::vector &modules, if (has_flag(llvm::debug::spirv)) llvm::debug::log(".spvasm", spirv::print_module(final_binary, dev.device_version())); - for (const auto &mod : modules) - m.syms.insert(m.syms.end(), mod.syms.begin(), mod.syms.end()); + for (const auto &bin : binaries) + b.syms.insert(b.syms.end(), bin.syms.begin(), bin.syms.end()); - m.secs.emplace_back(make_text_section(final_binary, section_type)); + b.secs.emplace_back(make_text_section(final_binary, section_type)); - return m; + return b; } bool @@ -915,7 +915,7 @@ clover::spirv::print_module(const std::string &binary, spvtools::SpirvTools spvTool(target_env); spv_context spvContext = spvContextCreate(target_env); if (!spvContext) - return "Failed to create an spv_context for disassembling the module."; + return "Failed to create an spv_context for disassembling the binary."; spv_text disassembly; spvBinaryToText(spvContext, @@ -974,7 +974,7 @@ clover::spirv::version_to_string(uint32_t version) { return ""; } -module +binary clover::spirv::compile_program(const std::string &binary, const device &dev, std::string &r_log, bool validate) { @@ -982,8 +982,8 @@ clover::spirv::compile_program(const std::string &binary, throw build_error(); } -module -clover::spirv::link_program(const std::vector &/*modules*/, +binary +clover::spirv::link_program(const std::vector &/*binaries*/, const device &/*dev*/, const std::string &/*opts*/, std::string &r_log) { r_log += "SPIR-V support in clover is not enabled.\n"; diff --git a/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.hpp b/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.hpp index ab79c5c388..50b0c085b4 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/spirv/invocation.hpp @@ -26,7 +26,7 @@ #include #include "core/context.hpp" -#include "core/module.hpp" +#include "core/binary.hpp" #include "core/program.hpp" namespace clover { @@ -49,14 +49,14 @@ namespace clover { // Converts an integer SPIR-V version into its textual representation. std::string version_to_string(uint32_t version); - // Creates a clover module out of the given SPIR-V binary. 
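/* Sketch of the per-input section lookup link_program performs above, with
 * std::find_if standing in for clover's find() helper:
 *
 *    auto it = std::find_if(bin.secs.begin(), bin.secs.end(),
 *                           [](const binary::section &sec) {
 *       return sec.type == binary::section::text_intermediate ||
 *              sec.type == binary::section::text_library;
 *    });
 *    // clover's find() throws when nothing matches; std::find_if would
 *    // need an explicit it == bin.secs.end() check here.
 */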
- module compile_program(const std::string &binary, + // Creates a clover binary out of the given SPIR-V binary. + binary compile_program(const std::string &binary, const device &dev, std::string &r_log, bool validate = true); - // Combines multiple clover modules into a single one, resolving + // Combines multiple clover objects into a single one, resolving // link dependencies between them. - module link_program(const std::vector &modules, const device &dev, + binary link_program(const std::vector &objects, const device &dev, const std::string &opts, std::string &r_log); // Returns a textual representation of the given binary. diff --git a/mesa 3D driver/src/gallium/frontends/clover/util/adaptor.hpp b/mesa 3D driver/src/gallium/frontends/clover/util/adaptor.hpp index e903596857..08601fc416 100644 --- a/mesa 3D driver/src/gallium/frontends/clover/util/adaptor.hpp +++ b/mesa 3D driver/src/gallium/frontends/clover/util/adaptor.hpp @@ -25,6 +25,7 @@ #include +#include "util/compat.hpp" #include "util/tuple.hpp" #include "util/pointer.hpp" #include "util/functional.hpp" @@ -43,8 +44,8 @@ namespace clover { class iterator_adaptor { public: typedef std::forward_iterator_tag iterator_category; - typedef typename std::result_of< - F(typename std::iterator_traits::reference...) + typedef typename invoke_result< + F, typename std::iterator_traits::reference... >::type reference; typedef typename std::remove_reference::type value_type; typedef pseudo_ptr pointer; diff --git a/mesa 3D driver/src/gallium/frontends/clover/util/compat.hpp b/mesa 3D driver/src/gallium/frontends/clover/util/compat.hpp new file mode 100644 index 0000000000..dd8db8d2c5 --- /dev/null +++ b/mesa 3D driver/src/gallium/frontends/clover/util/compat.hpp @@ -0,0 +1,43 @@ +// +// Copyright © Microsoft Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#ifndef CLOVER_UTIL_COMPAT_HPP +#define CLOVER_UTIL_COMPAT_HPP + +#include + +namespace clover { + template + struct invoke_result { +#if __cplusplus >= 201703L + typedef typename std::invoke_result< + F, Args... + >::type type; +#else + typedef typename std::result_of< + F(Args...) 
+ >::type type; +#endif + }; +} + +#endif diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/D3DKMT.cpp b/mesa 3D driver/src/gallium/frontends/d3d10umd/D3DKMT.cpp index 1649eddc41..dc31e99318 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/D3DKMT.cpp +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/D3DKMT.cpp @@ -244,7 +244,7 @@ D3DKMTGetMultisampleMethodList(D3DKMT_GETMULTISAMPLEMETHODLIST *pData) EXTERN_C NTSTATUS APIENTRY -D3DKMTPresent(CONST D3DKMT_PRESENT *pData) +D3DKMTPresent(D3DKMT_PRESENT *pData) { LOG_UNSUPPORTED_ENTRYPOINT(); return STATUS_NOT_IMPLEMENTED; @@ -409,7 +409,7 @@ D3DKMTSetVidPnSourceOwner(CONST D3DKMT_SETVIDPNSOURCEOWNER *pData) EXTERN_C NTSTATUS APIENTRY -D3DKMTSetVidPnSourceOwner1(const void *pData) +D3DKMTSetVidPnSourceOwner1(CONST D3DKMT_SETVIDPNSOURCEOWNER1 *pData) { LOG_UNSUPPORTED_ENTRYPOINT(); return STATUS_NOT_IMPLEMENTED; diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/Device.cpp b/mesa 3D driver/src/gallium/frontends/d3d10umd/Device.cpp index afca18819c..e347ce0132 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/Device.cpp +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/Device.cpp @@ -32,7 +32,7 @@ #include "Draw.h" -#include "Dxgi.h" +#include "DxgiFns.h" #include "InputAssembly.h" #include "OutputMerger.h" #include "Query.h" @@ -354,11 +354,11 @@ DestroyDevice(D3D10DDI_HDEVICE hDevice) // IN static struct pipe_sampler_view * sampler_views[PIPE_MAX_SHADER_SAMPLER_VIEWS]; memset(sampler_views, 0, sizeof sampler_views); pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, - PIPE_MAX_SHADER_SAMPLER_VIEWS, 0, sampler_views); + PIPE_MAX_SHADER_SAMPLER_VIEWS, 0, false, sampler_views); pipe->set_sampler_views(pipe, PIPE_SHADER_VERTEX, 0, - PIPE_MAX_SHADER_SAMPLER_VIEWS, 0, sampler_views); + PIPE_MAX_SHADER_SAMPLER_VIEWS, 0, false, sampler_views); pipe->set_sampler_views(pipe, PIPE_SHADER_GEOMETRY, 0, - PIPE_MAX_SHADER_SAMPLER_VIEWS, 0, sampler_views); + PIPE_MAX_SHADER_SAMPLER_VIEWS, 0, false, sampler_views); pipe->destroy(pipe); } diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/DriverIncludes.h b/mesa 3D driver/src/gallium/frontends/d3d10umd/DriverIncludes.h index e78d46fafc..c7df64042b 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/DriverIncludes.h +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/DriverIncludes.h @@ -43,12 +43,21 @@ #include -#include "winddk/winddk_compat.h" +#include "winddk_compat.h" //typedef LARGE_INTEGER PHYSICAL_ADDRESS; //typedef __success(return >= 0) LONG NTSTATUS; -#define D3D10DDI_MINOR_HEADER_VERSION 1 +#define D3D10DDI_MINOR_HEADER_VERSION 2 + +/* Unfortunately WinDDK's d3d10umddi.h defines D3D10.x constants as global + * const variables instead of preprocessor defines, causing LINK to fail due + * to duplicate symbols. Include d3d10_1.h to avoid the issue. + */ +#ifdef _MSC_VER +#include +#endif + #include #include "Debug.h" diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.cpp b/mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.cpp new file mode 100644 index 0000000000..cd844a2b0f --- /dev/null +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.cpp @@ -0,0 +1,374 @@ +/************************************************************************** + * + * Copyright 2012-2021 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + +/* + * DxgiFns.cpp -- + * DXGI related functions. + */ + +#include + +#include "DxgiFns.h" +#include "Format.h" +#include "State.h" + +#include "Debug.h" + +#include "util/format/u_format.h" + + +/* + * ---------------------------------------------------------------------- + * + * _Present -- + * + * This is turned into kernel callbacks rather than directly emitted + * as fifo packets. + * + * ---------------------------------------------------------------------- + */ + +HRESULT APIENTRY +_Present(DXGI_DDI_ARG_PRESENT *pPresentData) +{ + + LOG_ENTRYPOINT(); + + struct pipe_context *pipe = CastPipeDevice(pPresentData->hDevice); + Resource *pSrcResource = CastResource(pPresentData->hSurfaceToPresent); + + D3DKMT_PRESENT *pPresentInfo = (D3DKMT_PRESENT *)pPresentData->pDXGIContext; + + HWND hWnd = pPresentInfo->hWindow; + + if (0) { + DebugPrintf(" hWindow = 0x%08lx\n", pPresentInfo->hWindow); + if (pPresentInfo->Flags.SrcRectValid) { + DebugPrintf(" SrcRect.left = %li\n", pPresentInfo->SrcRect.left); + DebugPrintf(" SrcRect.top = %li\n", pPresentInfo->SrcRect.top); + DebugPrintf(" SrcRect.right = %li\n", pPresentInfo->SrcRect.right); + DebugPrintf(" SrcRect.bottom = %li\n", pPresentInfo->SrcRect.bottom); + } + if (pPresentInfo->Flags.DstRectValid) { + DebugPrintf(" DstRect.left = %li\n", pPresentInfo->DstRect.left); + DebugPrintf(" DstRect.top = %li\n", pPresentInfo->DstRect.top); + DebugPrintf(" DstRect.right = %li\n", pPresentInfo->DstRect.right); + DebugPrintf(" DstRect.bottom = %li\n", pPresentInfo->DstRect.bottom); + } + } + + RECT rect; + if (!GetClientRect(hWnd, &rect)) { + DebugPrintf("Invalid window.\n"); + return S_OK; + } + + int windowWidth = rect.right - rect.left; + int windowHeight = rect.bottom - rect.top; + + HDC hDC = GetDC(hWnd); + + unsigned w = pSrcResource->resource->width0; + unsigned h = pSrcResource->resource->height0; + + void *map; + struct pipe_transfer *transfer; + map = pipe_texture_map(pipe, + pSrcResource->resource, + 0, 0, PIPE_MAP_READ, + 0, 0, w, h, + &transfer); + if (map) { + + BITMAPINFO bmi; + + memset(&bmi, 0, sizeof bmi); + bmi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER); + bmi.bmiHeader.biWidth = w; + bmi.bmiHeader.biHeight= -(long)h; + bmi.bmiHeader.biPlanes = 1; + bmi.bmiHeader.biBitCount = 32; + bmi.bmiHeader.biCompression = BI_RGB; 
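/* Note on the BITMAPINFOHEADER being filled in here: biHeight is set
 * negative (-(long)h above) to request a top-down DIB, so row 0 of the
 * mapped gallium texture lands on the top scanline without an extra flip;
 * a positive height would make GDI read the rows bottom-up. */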
+ bmi.bmiHeader.biSizeImage = 0; + bmi.bmiHeader.biXPelsPerMeter = 0; + bmi.bmiHeader.biYPelsPerMeter = 0; + bmi.bmiHeader.biClrUsed = 0; + bmi.bmiHeader.biClrImportant = 0; + + DWORD *pixels = NULL; + + // http://www.daniweb.com/software-development/cpp/code/241875/fast-animation-with-the-windows-gdi + + HBITMAP hBmp = CreateDIBSection(hDC, &bmi, DIB_RGB_COLORS, (void**)&pixels, NULL, 0); + + util_format_translate( + PIPE_FORMAT_B8G8R8X8_UNORM, + (void *)pixels, w * 4, + 0, 0, + pSrcResource->resource->format, + map, transfer->stride, + 0, 0, w, h); + + if (0) { + /* + * Save a BMP for debugging. + */ + + FILE *fp = fopen("present.bmp", "wb"); + if (fp) { + BITMAPFILEHEADER bmf; + bmf.bfType = 0x4d42; + bmf.bfSize = sizeof bmf + sizeof bmi + h * w * 4; + bmf.bfReserved1 = 0; + bmf.bfReserved2 = 0; + bmf.bfOffBits = sizeof bmf + sizeof bmi; + + fwrite(&bmf, sizeof bmf, 1, fp); + fwrite(&bmi, sizeof bmi, 1, fp); + fwrite(pixels, h, w * 4, fp); + fclose(fp); + } + } + + HDC hdcMem; + hdcMem = CreateCompatibleDC(hDC); + HBITMAP hbmOld = (HBITMAP)SelectObject(hdcMem, hBmp); + + int iStretchMode = SetStretchBltMode(hDC, HALFTONE); + + StretchBlt(hDC, 0, 0, windowWidth, windowHeight, + hdcMem, 0, 0, w, h, + SRCCOPY); + + if (iStretchMode) { + SetStretchBltMode(hDC, iStretchMode); + } + + SelectObject(hdcMem, hbmOld); + DeleteDC(hdcMem); + DeleteObject(hBmp); + + pipe_texture_unmap(pipe, transfer); + } + + ReleaseDC(hWnd, hDC); + + return S_OK; +} + + +/* + * ---------------------------------------------------------------------- + * + * _GetGammaCaps -- + * + * Return gamma capabilities. + * + * ---------------------------------------------------------------------- + */ + +HRESULT APIENTRY +_GetGammaCaps( DXGI_DDI_ARG_GET_GAMMA_CONTROL_CAPS *GetCaps ) +{ + LOG_ENTRYPOINT(); + + DXGI_GAMMA_CONTROL_CAPABILITIES *pCaps; + + pCaps = GetCaps->pGammaCapabilities; + + pCaps->ScaleAndOffsetSupported = FALSE; + pCaps->MinConvertedValue = 0.0; + pCaps->MaxConvertedValue = 1.0; + pCaps->NumGammaControlPoints = 17; + + for (UINT i = 0; i < pCaps->NumGammaControlPoints; i++) { + pCaps->ControlPointPositions[i] = (float)i / (float)(pCaps->NumGammaControlPoints - 1); + } + + return S_OK; +} + + +/* + * ---------------------------------------------------------------------- + * + * _SetDisplayMode -- + * + * Set the resource that is used to scan out to the display. 
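 * (For reference, _GetGammaCaps above spaces its control points evenly,
 * ControlPointPositions[i] = i / (N - 1), i.e. 0, 1/16, ..., 1 for the
 * N == 17 points it reports.)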
+ * + * ---------------------------------------------------------------------- + */ + +HRESULT APIENTRY +_SetDisplayMode( DXGI_DDI_ARG_SETDISPLAYMODE *SetDisplayMode ) +{ + LOG_UNSUPPORTED_ENTRYPOINT(); + + return S_OK; +} + + +/* + * ---------------------------------------------------------------------- + * + * _SetResourcePriority -- + * + * ---------------------------------------------------------------------- + */ + +HRESULT APIENTRY +_SetResourcePriority( DXGI_DDI_ARG_SETRESOURCEPRIORITY *SetResourcePriority ) +{ + LOG_ENTRYPOINT(); + + /* ignore */ + + return S_OK; +} + + +/* + * ---------------------------------------------------------------------- + * + * _QueryResourceResidency -- + * + * ---------------------------------------------------------------------- + */ + +HRESULT APIENTRY +_QueryResourceResidency( DXGI_DDI_ARG_QUERYRESOURCERESIDENCY *QueryResourceResidency ) +{ + LOG_ENTRYPOINT(); + + for (UINT i = 0; i < QueryResourceResidency->Resources; ++i) { + QueryResourceResidency->pStatus[i] = DXGI_DDI_RESIDENCY_FULLY_RESIDENT; + } + + return S_OK; +} + + +/* + * ---------------------------------------------------------------------- + * + * _RotateResourceIdentities -- + * + * Rotate a list of resources by recreating their views with + * the updated rotations. + * + * ---------------------------------------------------------------------- + */ + +HRESULT APIENTRY +_RotateResourceIdentities( DXGI_DDI_ARG_ROTATE_RESOURCE_IDENTITIES *RotateResourceIdentities ) +{ + LOG_ENTRYPOINT(); + + if (RotateResourceIdentities->Resources <= 1) { + return S_OK; + } + + struct pipe_context *pipe = CastPipeDevice(RotateResourceIdentities->hDevice); + struct pipe_screen *screen = pipe->screen; + + struct pipe_resource *resource0 = CastPipeResource(RotateResourceIdentities->pResources[0]); + + assert(resource0); + LOG_UNSUPPORTED(resource0->last_level); + + /* + * XXX: Copying is not very efficient, but it is much simpler than the + * alternative of recreating all views. + */ + + struct pipe_resource *temp_resource; + temp_resource = screen->resource_create(screen, resource0); + assert(temp_resource); + if (!temp_resource) { + return E_OUTOFMEMORY; + } + + struct pipe_box src_box; + src_box.x = 0; + src_box.y = 0; + src_box.z = 0; + src_box.width = resource0->width0; + src_box.height = resource0->height0; + src_box.depth = resource0->depth0; + + for (UINT i = 0; i < RotateResourceIdentities->Resources + 1; ++i) { + struct pipe_resource *src_resource; + struct pipe_resource *dst_resource; + + if (i < RotateResourceIdentities->Resources) { + src_resource = CastPipeResource(RotateResourceIdentities->pResources[i]); + } else { + src_resource = temp_resource; + } + + if (i > 0) { + dst_resource = CastPipeResource(RotateResourceIdentities->pResources[i - 1]); + } else { + dst_resource = temp_resource; + } + + assert(dst_resource); + assert(src_resource); + + pipe->resource_copy_region(pipe, + dst_resource, + 0, // dst_level + 0, 0, 0, // dst_x,y,z + src_resource, + 0, // src_level + &src_box); + } + + pipe_resource_reference(&temp_resource, NULL); + + return S_OK; +} + + +/* + * ---------------------------------------------------------------------- + * + * _Blt -- + * + * Do a blt between two subresources. Apply MSAA resolve, format + * conversion and stretching. 
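 * (Worked example for _RotateResourceIdentities above with Resources == 3,
 * resources A, B, C and the temporary T: the loop runs Resources + 1 times
 * and copies A->T, B->A, C->B, then T->C, so every resource ends up with
 * its successor's contents at the cost of one scratch allocation instead
 * of recreating all views.)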
+ * + * ---------------------------------------------------------------------- + */ + +HRESULT APIENTRY +_Blt(DXGI_DDI_ARG_BLT *Blt) +{ + LOG_UNSUPPORTED_ENTRYPOINT(); + + return S_OK; +} diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.h b/mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.h new file mode 100644 index 0000000000..3689faa307 --- /dev/null +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/DxgiFns.h @@ -0,0 +1,46 @@ +/************************************************************************** + * + * Copyright 2012-2021 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + +/* + * DxgiFns.h -- + * DXGI related functions + */ + +#ifndef WRAP_DXGI_H +#define WRAP_DXGI_H + +#include "DriverIncludes.h" + +HRESULT APIENTRY _Present( DXGI_DDI_ARG_PRESENT * ); +HRESULT APIENTRY _GetGammaCaps( DXGI_DDI_ARG_GET_GAMMA_CONTROL_CAPS * ); +HRESULT APIENTRY _SetDisplayMode( DXGI_DDI_ARG_SETDISPLAYMODE * ); +HRESULT APIENTRY _SetResourcePriority( DXGI_DDI_ARG_SETRESOURCEPRIORITY * ); +HRESULT APIENTRY _QueryResourceResidency( DXGI_DDI_ARG_QUERYRESOURCERESIDENCY * ); +HRESULT APIENTRY _RotateResourceIdentities( DXGI_DDI_ARG_ROTATE_RESOURCE_IDENTITIES * ); +HRESULT APIENTRY _Blt( DXGI_DDI_ARG_BLT * ); + +#endif diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/Rasterizer.cpp b/mesa 3D driver/src/gallium/frontends/d3d10umd/Rasterizer.cpp index df127c793c..48c9f75a4e 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/Rasterizer.cpp +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/Rasterizer.cpp @@ -226,6 +226,7 @@ CreateRasterizerState( state.clip_halfz = 1; state.depth_clip_near = pRasterizerDesc->DepthClipEnable ? 1 : 0; state.depth_clip_far = pRasterizerDesc->DepthClipEnable ? 
1 : 0; + state.depth_clamp = 1; state.point_quad_rasterization = 1; state.point_size = 1.0f; diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/Resource.cpp b/mesa 3D driver/src/gallium/frontends/d3d10umd/Resource.cpp index a7910d2e3c..3dd91365b2 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/Resource.cpp +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/Resource.cpp @@ -270,6 +270,7 @@ CreateResource(D3D10DDI_HDEVICE hDevice, // IN templat.target = translate_texture_target( pCreateResource->ResourceDimension, pCreateResource->ArraySize ); + pResource->buffer = templat.target == PIPE_BUFFER; if (pCreateResource->Format == DXGI_FORMAT_UNKNOWN) { assert(pCreateResource->ResourceDimension == D3D10DDIRESOURCE_BUFFER); @@ -315,37 +316,62 @@ CreateResource(D3D10DDI_HDEVICE hDevice, // IN sizeof *pResource->transfers); if (pCreateResource->pInitialDataUP) { - for (UINT SubResource = 0; SubResource < pResource->NumSubResources; ++SubResource) { + if (pResource->buffer) { + assert(pResource->NumSubResources == 1); const D3D10_DDIARG_SUBRESOURCE_UP* pInitialDataUP = - &pCreateResource->pInitialDataUP[SubResource]; + &pCreateResource->pInitialDataUP[0]; unsigned level; struct pipe_box box; - subResourceBox(pResource->resource, SubResource, &level, &box); + subResourceBox(pResource->resource, 0, &level, &box); struct pipe_transfer *transfer; void *map; - map = pipe->transfer_map(pipe, - pResource->resource, - level, - PIPE_MAP_WRITE | - PIPE_MAP_UNSYNCHRONIZED, - &box, - &transfer); + map = pipe->buffer_map(pipe, + pResource->resource, + level, + PIPE_MAP_WRITE | + PIPE_MAP_UNSYNCHRONIZED, + &box, + &transfer); assert(map); if (map) { - for (int z = 0; z < box.depth; ++z) { - ubyte *dst = (ubyte*)map + z*transfer->layer_stride; - const ubyte *src = (const ubyte*)pInitialDataUP->pSysMem + z*pInitialDataUP->SysMemSlicePitch; - util_copy_rect(dst, - templat.format, - transfer->stride, - 0, 0, box.width, box.height, - src, - pInitialDataUP->SysMemPitch, - 0, 0); + memcpy(map, pInitialDataUP->pSysMem, box.width); + pipe_buffer_unmap(pipe, transfer); + } + } else { + for (UINT SubResource = 0; SubResource < pResource->NumSubResources; ++SubResource) { + const D3D10_DDIARG_SUBRESOURCE_UP* pInitialDataUP = + &pCreateResource->pInitialDataUP[SubResource]; + + unsigned level; + struct pipe_box box; + subResourceBox(pResource->resource, SubResource, &level, &box); + + struct pipe_transfer *transfer; + void *map; + map = pipe->texture_map(pipe, + pResource->resource, + level, + PIPE_MAP_WRITE | + PIPE_MAP_UNSYNCHRONIZED, + &box, + &transfer); + assert(map); + if (map) { + for (int z = 0; z < box.depth; ++z) { + ubyte *dst = (ubyte*)map + z*transfer->layer_stride; + const ubyte *src = (const ubyte*)pInitialDataUP->pSysMem + z*pInitialDataUP->SysMemSlicePitch; + util_copy_rect(dst, + templat.format, + transfer->stride, + 0, 0, box.width, box.height, + src, + pInitialDataUP->SysMemPitch, + 0, 0); + } + pipe_texture_unmap(pipe, transfer); } - pipe_transfer_unmap(pipe, transfer); } } } @@ -423,7 +449,11 @@ DestroyResource(D3D10DDI_HDEVICE hDevice, // IN for (UINT SubResource = 0; SubResource < pResource->NumSubResources; ++SubResource) { if (pResource->transfers[SubResource]) { - pipe_transfer_unmap(pipe, pResource->transfers[SubResource]); + if (pResource->buffer) { + pipe_buffer_unmap(pipe, pResource->transfers[SubResource]); + } else { + pipe_texture_unmap(pipe, pResource->transfers[SubResource]); + } pResource->transfers[SubResource] = NULL; } } @@ -493,12 +523,21 @@ 
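/* The pattern the Resource.cpp hunks above and below follow, in short: the
 * new Resource::buffer flag selects the right entry point now that the
 * unified pipe->transfer_map has been split in two:
 *
 *    void *map = pResource->buffer
 *       ? pipe->buffer_map(pipe, res, level, usage, &box, &transfer)
 *       : pipe->texture_map(pipe, res, level, usage, &box, &transfer);
 *    // ... use map ...
 *    if (pResource->buffer)
 *       pipe_buffer_unmap(pipe, transfer);
 *    else
 *       pipe_texture_unmap(pipe, transfer);
 */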
ResourceMap(D3D10DDI_HDEVICE hDevice, // IN assert(!pResource->transfers[SubResource]); void *map; - map = pipe->transfer_map(pipe, - resource, - level, - usage, - &box, - &pResource->transfers[SubResource]); + if (pResource->buffer) { + map = pipe->buffer_map(pipe, + resource, + level, + usage, + &box, + &pResource->transfers[SubResource]); + } else { + map = pipe->texture_map(pipe, + resource, + level, + usage, + &box, + &pResource->transfers[SubResource]); + } if (!map) { DebugPrintf("%s: failed to map resource\n", __FUNCTION__); SetError(hDevice, E_FAIL); @@ -534,7 +573,11 @@ ResourceUnmap(D3D10DDI_HDEVICE hDevice, // IN assert(SubResource < pResource->NumSubResources); if (pResource->transfers[SubResource]) { - pipe_transfer_unmap(pipe, pResource->transfers[SubResource]); + if (pResource->buffer) { + pipe_buffer_unmap(pipe, pResource->transfers[SubResource]); + } else { + pipe_texture_unmap(pipe, pResource->transfers[SubResource]); + } pResource->transfers[SubResource] = NULL; } } @@ -834,7 +877,8 @@ ResourceUpdateSubResourceUP(D3D10DDI_HDEVICE hDevice, // IN } struct pipe_context *pipe = pDevice->pipe; - struct pipe_resource *dst_resource = CastPipeResource(hDstResource); + Resource *pDstResource = CastResource(hDstResource); + struct pipe_resource *dst_resource = pDstResource->resource; unsigned level; struct pipe_box box; @@ -855,12 +899,21 @@ ResourceUpdateSubResourceUP(D3D10DDI_HDEVICE hDevice, // IN struct pipe_transfer *transfer; void *map; - map = pipe->transfer_map(pipe, - dst_resource, - level, - PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE, - &box, - &transfer); + if (pDstResource->buffer) { + map = pipe->buffer_map(pipe, + dst_resource, + level, + PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE, + &box, + &transfer); + } else { + map = pipe->texture_map(pipe, + dst_resource, + level, + PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE, + &box, + &transfer); + } assert(map); if (map) { for (int z = 0; z < box.depth; ++z) { @@ -874,7 +927,11 @@ ResourceUpdateSubResourceUP(D3D10DDI_HDEVICE hDevice, // IN RowPitch, 0, 0); } - pipe_transfer_unmap(pipe, transfer); + if (pDstResource->buffer) { + pipe_buffer_unmap(pipe, transfer); + } else { + pipe_texture_unmap(pipe, transfer); + } } } diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/Shader.cpp b/mesa 3D driver/src/gallium/frontends/d3d10umd/Shader.cpp index ce5d0edd33..e70d58a9f1 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/Shader.cpp +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/Shader.cpp @@ -251,7 +251,7 @@ SetShaderResources(enum pipe_shader_type shader_type, // IN * probably think about not updating all always... It should just work. */ pipe->set_sampler_views(pipe, shader_type, 0, PIPE_MAX_SHADER_SAMPLER_VIEWS, - 0, sampler_views); + 0, false, sampler_views); } diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.c b/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.c index e4b6b88aa7..1995d38188 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.c +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.c @@ -392,7 +392,7 @@ Shader_parse_opcode(struct Shader_parser *parser, if (opcode_is_extended) { /* NOTE: DECODE_IS_D3D10_SB_OPCODE_DOUBLE_EXTENDED is broken. 
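 * The replacement line below pairs D3D10_SB_OPERAND_DOUBLE_EXTENDED_MASK
 * with D3D10_SB_OPERAND_DOUBLE_EXTENDED_SHIFT; the removed line masked
 * with the OPCODE constant but shifted by the OPERAND one, mixing two
 * different bit layouts.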
*/ - assert(!((*curr & D3D10_SB_OPCODE_DOUBLE_EXTENDED_MASK) >> D3D10_SB_OPERAND_DOUBLE_EXTENDED_SHIFT)); + assert(!((*curr & D3D10_SB_OPERAND_DOUBLE_EXTENDED_MASK) >> D3D10_SB_OPERAND_DOUBLE_EXTENDED_SHIFT)); switch (DECODE_D3D10_SB_EXTENDED_OPCODE_TYPE(*curr)) { case D3D10_SB_EXTENDED_OPCODE_EMPTY: diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.h b/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.h index 5e8fba044a..df95536f2b 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.h +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/ShaderParse.h @@ -35,9 +35,6 @@ #include "DriverIncludes.h" -//#include "winddk/winddk_compat.h" -#include "winddk/d3d10tokenizedprogramformat.hpp" - #ifdef __cplusplus extern "C" { #endif diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/State.h b/mesa 3D driver/src/gallium/frontends/d3d10umd/State.h index 5cb865ce69..3e45cb7974 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/State.h +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/State.h @@ -150,6 +150,7 @@ struct Resource DXGI_FORMAT Format; UINT MipLevels; UINT NumSubResources; + bool buffer; struct pipe_resource *resource; struct pipe_transfer **transfers; struct pipe_stream_output_target *so_target; diff --git a/mesa 3D driver/src/gallium/frontends/d3d10umd/meson.build b/mesa 3D driver/src/gallium/frontends/d3d10umd/meson.build index 90774d9217..7141c8da17 100644 --- a/mesa 3D driver/src/gallium/frontends/d3d10umd/meson.build +++ b/mesa 3D driver/src/gallium/frontends/d3d10umd/meson.build @@ -14,7 +14,7 @@ libd3d10umd = static_library( 'Debug.cpp', 'Device.cpp', 'Draw.cpp', - 'Dxgi.cpp', + 'DxgiFns.cpp', 'Format.cpp', 'InputAssembly.cpp', 'OutputMerger.cpp', diff --git a/mesa 3D driver/src/gallium/frontends/dri/dri2.c b/mesa 3D driver/src/gallium/frontends/dri/dri2.c index 6dcbd3365c..1cad733e41 100644 --- a/mesa 3D driver/src/gallium/frontends/dri/dri2.c +++ b/mesa 3D driver/src/gallium/frontends/dri/dri2.c @@ -801,7 +801,7 @@ static __DRIimage * dri2_create_image_from_winsys(__DRIscreen *_screen, int width, int height, const struct dri2_format_mapping *map, int num_handles, struct winsys_handle *whandle, - bool is_protected_content, + unsigned bind, void *loaderPrivate) { struct dri_screen *screen = dri_screen(_screen); @@ -860,15 +860,12 @@ dri2_create_image_from_winsys(__DRIscreen *_screen, if (!tex_usage) return NULL; - if (is_protected_content) - tex_usage |= PIPE_BIND_PROTECTED; - img = CALLOC_STRUCT(__DRIimageRec); if (!img) return NULL; memset(&templ, 0, sizeof(templ)); - templ.bind = tex_usage; + templ.bind = tex_usage | bind; templ.target = screen->target; templ.last_level = 0; templ.depth0 = 1; @@ -916,7 +913,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen, */ const struct driOptionCache *optionCache = &screen->dev->option_cache; if (!driQueryOptionb(optionCache, "disable_protected_content_check") && - (bool)(tex->bind & PIPE_BIND_PROTECTED) != is_protected_content) { + (tex->bind & PIPE_BIND_PROTECTED) != (bind & PIPE_BIND_PROTECTED)) { pipe_resource_reference(&img->texture, NULL); pipe_resource_reference(&tex, NULL); FREE(img); @@ -956,7 +953,7 @@ dri2_create_image_from_name(__DRIscreen *_screen, whandle.stride = pitch * util_format_get_blocksize(map->pipe_format); img = dri2_create_image_from_winsys(_screen, width, height, map, - 1, &whandle, false, loaderPrivate); + 1, &whandle, 0, loaderPrivate); if (!img) return NULL; @@ -1003,8 +1000,8 @@ static __DRIimage * dri2_create_image_from_fd(__DRIscreen 
*_screen, int width, int height, int fourcc, uint64_t modifier, int *fds, int num_fds, - int *strides, int *offsets, bool protected_content, - unsigned *error, void *loaderPrivate) + int *strides, int *offsets, + unsigned bind, unsigned *error, void *loaderPrivate) { struct winsys_handle whandles[4]; const struct dri2_format_mapping *map = dri2_get_mapping_by_fourcc(fourcc); @@ -1041,7 +1038,7 @@ dri2_create_image_from_fd(__DRIscreen *_screen, } img = dri2_create_image_from_winsys(_screen, width, height, map, - num_fds, whandles, protected_content, + num_fds, whandles, bind, loaderPrivate); if(img == NULL) { err = __DRI_IMAGE_ERROR_BAD_ALLOC; @@ -1461,7 +1458,7 @@ dri2_from_names(__DRIscreen *screen, int width, int height, int format, whandle.modifier = DRM_FORMAT_MOD_INVALID; img = dri2_create_image_from_winsys(screen, width, height, map, - 1, &whandle, false, loaderPrivate); + 1, &whandle, 0, loaderPrivate); if (img == NULL) return NULL; @@ -1518,7 +1515,23 @@ dri2_from_fds(__DRIscreen *screen, int width, int height, int fourcc, { return dri2_create_image_from_fd(screen, width, height, fourcc, DRM_FORMAT_MOD_INVALID, fds, num_fds, - strides, offsets, false, NULL, loaderPrivate); + strides, offsets, 0, NULL, loaderPrivate); +} + +static __DRIimage * +dri2_from_fds2(__DRIscreen *screen, int width, int height, int fourcc, + int *fds, int num_fds, uint32_t flags, int *strides, + int *offsets, void *loaderPrivate) +{ + unsigned bind = 0; + if (flags & __DRI_IMAGE_PROTECTED_CONTENT_FLAG) + bind |= PIPE_BIND_PROTECTED; + if (flags & __DRI_IMAGE_PRIME_LINEAR_BUFFER) + bind |= PIPE_BIND_DRI_PRIME; + + return dri2_create_image_from_fd(screen, width, height, fourcc, + DRM_FORMAT_MOD_INVALID, fds, num_fds, + strides, offsets, bind, NULL, loaderPrivate); } static boolean @@ -1536,16 +1549,24 @@ dri2_query_dma_buf_modifiers(__DRIscreen *_screen, int fourcc, int max, format = map->pipe_format; + bool native_sampling = pscreen->is_format_supported(pscreen, format, screen->target, 0, 0, + PIPE_BIND_SAMPLER_VIEW); if (pscreen->is_format_supported(pscreen, format, screen->target, 0, 0, - PIPE_BIND_RENDER_TARGET) || - pscreen->is_format_supported(pscreen, format, screen->target, 0, 0, - PIPE_BIND_SAMPLER_VIEW) || - dri2_yuv_dma_buf_supported(screen, map)) { - if (pscreen->query_dmabuf_modifiers != NULL) + PIPE_BIND_RENDER_TARGET) || + native_sampling || + dri2_yuv_dma_buf_supported(screen, map)) { + if (pscreen->query_dmabuf_modifiers != NULL) { pscreen->query_dmabuf_modifiers(pscreen, format, max, modifiers, external_only, count); - else + if (!native_sampling && external_only) { + /* To support it using YUV lowering, we need it to be samplerExternalOES. 
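 * (In other words: when the format lacks native PIPE_BIND_SAMPLER_VIEW
 * support, every modifier the driver reports is forced to external-only,
 * so imports go through samplerExternalOES and the YUV lowering path
 * rather than regular GL sampling.)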
+ */ + for (int i = 0; i < *count; i++) + external_only[i] = true; + } + } else { *count = 0; + } return true; } return false; @@ -1591,7 +1612,7 @@ dri2_from_dma_bufs(__DRIscreen *screen, img = dri2_create_image_from_fd(screen, width, height, fourcc, DRM_FORMAT_MOD_INVALID, fds, num_fds, - strides, offsets, false, error, loaderPrivate); + strides, offsets, 0, error, loaderPrivate); if (img == NULL) return NULL; @@ -1620,7 +1641,7 @@ dri2_from_dma_bufs2(__DRIscreen *screen, img = dri2_create_image_from_fd(screen, width, height, fourcc, modifier, fds, num_fds, strides, offsets, - false, error, loaderPrivate); + 0, error, loaderPrivate); if (img == NULL) return NULL; @@ -1650,7 +1671,8 @@ dri2_from_dma_bufs3(__DRIscreen *screen, img = dri2_create_image_from_fd(screen, width, height, fourcc, modifier, fds, num_fds, strides, offsets, - flags & __DRI_IMAGE_PROTECTED_CONTENT_FLAG, + (flags & __DRI_IMAGE_PROTECTED_CONTENT_FLAG) ? + PIPE_BIND_PROTECTED : 0, error, loaderPrivate); if (img == NULL) return NULL; @@ -1767,7 +1789,7 @@ dri2_get_capabilities(__DRIscreen *_screen) /* The extension is modified during runtime if DRI_PRIME is detected */ static const __DRIimageExtension dri2ImageExtensionTempl = { - .base = { __DRI_IMAGE, 19 }, + .base = { __DRI_IMAGE, 20 }, .createImageFromName = dri2_create_image_from_name, .createImageFromRenderbuffer = dri2_create_image_from_renderbuffer, @@ -1780,6 +1802,7 @@ static const __DRIimageExtension dri2ImageExtensionTempl = { .fromPlanar = dri2_from_planar, .createImageFromTexture = dri2_create_from_texture, .createImageFromFds = NULL, + .createImageFromFds2 = NULL, .createImageFromDmaBufs = NULL, .blitImage = dri2_blit_image, .getCapabilities = dri2_get_capabilities, @@ -2272,6 +2295,7 @@ dri2_init_screen_extensions(struct dri_screen *screen, if (drmGetCap(screen->sPriv->fd, DRM_CAP_PRIME, &cap) == 0 && (cap & DRM_PRIME_CAP_IMPORT)) { screen->image_extension.createImageFromFds = dri2_from_fds; + screen->image_extension.createImageFromFds2 = dri2_from_fds2; screen->image_extension.createImageFromDmaBufs = dri2_from_dma_bufs; screen->image_extension.createImageFromDmaBufs2 = dri2_from_dma_bufs2; screen->image_extension.createImageFromDmaBufs3 = dri2_from_dma_bufs3; @@ -2350,6 +2374,15 @@ dri2_init_screen(__DRIscreen * sPriv) screen->broken_invalidate = !sPriv->dri2.useInvalidate; screen->lookup_egl_image = dri2_lookup_egl_image; + const __DRIimageLookupExtension *loader = sPriv->dri2.image; + if (loader && + loader->base.version >= 2 && + loader->validateEGLImage && + loader->lookupEGLImageValidated) { + screen->validate_egl_image = dri2_validate_egl_image; + screen->lookup_egl_image_validated = dri2_lookup_egl_image_validated; + } + return configs; destroy_screen: @@ -2404,6 +2437,15 @@ dri_kms_init_screen(__DRIscreen * sPriv) screen->broken_invalidate = !sPriv->dri2.useInvalidate; screen->lookup_egl_image = dri2_lookup_egl_image; + const __DRIimageLookupExtension *loader = sPriv->dri2.image; + if (loader && + loader->base.version >= 2 && + loader->validateEGLImage && + loader->lookupEGLImageValidated) { + screen->validate_egl_image = dri2_validate_egl_image; + screen->lookup_egl_image_validated = dri2_lookup_egl_image_validated; + } + return configs; destroy_screen: diff --git a/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.c b/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.c index 583ce67978..a2c864ae10 100644 --- a/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.c +++ b/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.c @@ -253,6 
+253,22 @@ dri2_lookup_egl_image(struct dri_screen *screen, void *handle) return img; } +boolean +dri2_validate_egl_image(struct dri_screen *screen, void *handle) +{ + const __DRIimageLookupExtension *loader = screen->sPriv->dri2.image; + + return loader->validateEGLImage(handle, screen->sPriv->loaderPrivate); +} + +__DRIimage * +dri2_lookup_egl_image_validated(struct dri_screen *screen, void *handle) +{ + const __DRIimageLookupExtension *loader = screen->sPriv->dri2.image; + + return loader->lookupEGLImageValidated(handle, screen->sPriv->loaderPrivate); +} + __DRIimage * dri2_create_image_from_renderbuffer2(__DRIcontext *context, int renderbuffer, void *loaderPrivate, diff --git a/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.h b/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.h index d7daad83f1..37a27772a8 100644 --- a/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.h +++ b/mesa 3D driver/src/gallium/frontends/dri/dri_helpers.h @@ -62,6 +62,12 @@ dri2_yuv_dma_buf_supported(struct dri_screen *screen, __DRIimage * dri2_lookup_egl_image(struct dri_screen *screen, void *handle); +boolean +dri2_validate_egl_image(struct dri_screen *screen, void *handle); + +__DRIimage * +dri2_lookup_egl_image_validated(struct dri_screen *screen, void *handle); + __DRIimage * dri2_create_image_from_renderbuffer(__DRIcontext *context, int renderbuffer, void *loaderPrivate); diff --git a/mesa 3D driver/src/gallium/frontends/dri/dri_query_renderer.c b/mesa 3D driver/src/gallium/frontends/dri/dri_query_renderer.c index d9d2984bd6..eeebdb7657 100644 --- a/mesa 3D driver/src/gallium/frontends/dri/dri_query_renderer.c +++ b/mesa 3D driver/src/gallium/frontends/dri/dri_query_renderer.c @@ -74,6 +74,11 @@ dri2_query_renderer_integer(__DRIscreen *_screen, int param, if (!value[0]) return -1; return 0; + case __DRI2_RENDERER_PREFER_BACK_BUFFER_REUSE: + value[0] = + screen->base.screen->get_param(screen->base.screen, + PIPE_CAP_PREFER_BACK_BUFFER_REUSE); + return 0; default: return driQueryRendererIntegerCommon(_screen, param, value); } diff --git a/mesa 3D driver/src/gallium/frontends/dri/dri_screen.c b/mesa 3D driver/src/gallium/frontends/dri/dri_screen.c index 0ec2b259e9..ea7f7996be 100644 --- a/mesa 3D driver/src/gallium/frontends/dri/dri_screen.c +++ b/mesa 3D driver/src/gallium/frontends/dri/dri_screen.c @@ -176,8 +176,7 @@ dri_fill_in_modes(struct dri_screen *screen) allow_rgba_ordering = dri_loader_get_cap(screen, DRI_LOADER_CAP_RGBA_ORDERING); allow_rgb10 = driQueryOptionb(&screen->dev->option_cache, "allow_rgb10_configs"); - allow_fp16 = driQueryOptionb(&screen->dev->option_cache, "allow_fp16_configs"); - allow_fp16 &= dri_loader_get_cap(screen, DRI_LOADER_CAP_FP16); + allow_fp16 = dri_loader_get_cap(screen, DRI_LOADER_CAP_FP16); msaa_samples_max = (screen->st_api->feature_mask & ST_API_FEATURE_MS_VISUALS_MASK) ? 
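/* The two-step EGLImage path added above, as pseudocode; both loader entry
 * points must exist (image-lookup extension version >= 2) before the
 * screen hooks are installed:
 *
 *    if (loader->validateEGLImage(handle, screen->sPriv->loaderPrivate))
 *       img = loader->lookupEGLImageValidated(handle,
 *                                             screen->sPriv->loaderPrivate);
 *
 * so the lookup itself no longer has to re-validate the handle.
 */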
MSAA_VISUAL_MAX_SAMPLES : 1; @@ -378,7 +377,10 @@ dri_fill_st_visual(struct st_visual *stvis, } if (mode->samples > 0) { - stvis->samples = mode->samples; + if (debug_get_bool_option("DRI_NO_MSAA", false)) + stvis->samples = 0; + else + stvis->samples = mode->samples; } switch (mode->depthBits) { @@ -432,7 +434,9 @@ dri_get_egl_image(struct st_manager *smapi, __DRIimage *img = NULL; const struct dri2_format_mapping *map; - if (screen->lookup_egl_image) { + if (screen->lookup_egl_image_validated) { + img = screen->lookup_egl_image_validated(screen, egl_image); + } else if (screen->lookup_egl_image) { img = screen->lookup_egl_image(screen, egl_image); } @@ -457,6 +461,15 @@ dri_get_egl_image(struct st_manager *smapi, return TRUE; } +static bool +dri_validate_egl_image(struct st_manager *smapi, + void *egl_image) +{ + struct dri_screen *screen = (struct dri_screen *)smapi; + + return screen->validate_egl_image(screen, egl_image); +} + static int dri_get_param(struct st_manager *smapi, enum st_manager_param param) @@ -543,6 +556,9 @@ dri_init_screen_helper(struct dri_screen *screen, screen->base.get_param = dri_get_param; screen->base.set_background_context = dri_set_background_context; + if (screen->validate_egl_image) + screen->base.validate_egl_image = dri_validate_egl_image; + screen->st_api = st_gl_api_create(); if (!screen->st_api) return NULL; diff --git a/mesa 3D driver/src/gallium/frontends/dri/dri_screen.h b/mesa 3D driver/src/gallium/frontends/dri/dri_screen.h index c6c08f9e3e..0b60cb68bc 100644 --- a/mesa 3D driver/src/gallium/frontends/dri/dri_screen.h +++ b/mesa 3D driver/src/gallium/frontends/dri/dri_screen.h @@ -81,6 +81,8 @@ struct dri_screen /* hooks filled in by dri2 & drisw */ __DRIimage * (*lookup_egl_image)(struct dri_screen *ctx, void *handle); + boolean (*validate_egl_image)(struct dri_screen *ctx, void *handle); + __DRIimage * (*lookup_egl_image_validated)(struct dri_screen *ctx, void *handle); /* DRI exts that vary based on gallium pipe_screen caps. 
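The hunks above split EGLImage resolution into two phases. When the loader advertises __DRIimageLookupExtension version >= 2 with both validateEGLImage and lookupEGLImageValidated non-NULL, the screen gains the validate_egl_image / lookup_egl_image_validated hooks and dri_get_egl_image prefers the validated lookup; otherwise the old single-step lookup_egl_image path is kept. A condensed sketch that folds the st_manager-side validation and the lookup into one hypothetical helper (illustrative only; the patch keeps the two phases in separate callbacks):

static boolean
get_egl_image_checked(struct dri_screen *screen, void *handle,
                      __DRIimage **img)
{
   /* phase 1: cheap validation (dri2_validate_egl_image) rejects
    * bogus handles before any lookup work is done */
   if (screen->validate_egl_image &&
       !screen->validate_egl_image(screen, handle))
      return FALSE;

   /* phase 2: the real lookup, preferring the validated variant */
   *img = screen->lookup_egl_image_validated
        ? screen->lookup_egl_image_validated(screen, handle)
        : screen->lookup_egl_image(screen, handle);
   return *img != NULL;
}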
*/ __DRIimageExtension image_extension; diff --git a/mesa 3D driver/src/gallium/frontends/dri/drisw.c b/mesa 3D driver/src/gallium/frontends/dri/drisw.c index 5f5d475c45..adb486ceb0 100644 --- a/mesa 3D driver/src/gallium/frontends/dri/drisw.c +++ b/mesa 3D driver/src/gallium/frontends/dri/drisw.c @@ -541,6 +541,15 @@ drisw_init_screen(__DRIscreen * sPriv) sPriv->extensions = drisw_screen_extensions; screen->lookup_egl_image = dri2_lookup_egl_image; + const __DRIimageLookupExtension *image = sPriv->dri2.image; + if (image && + image->base.version >= 2 && + image->validateEGLImage && + image->lookupEGLImageValidated) { + screen->validate_egl_image = dri2_validate_egl_image; + screen->lookup_egl_image_validated = dri2_lookup_egl_image_validated; + } + return configs; fail: dri_destroy_screen_helper(screen); diff --git a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-fails.txt b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-fails.txt new file mode 100644 index 0000000000..45d071823a --- /dev/null +++ b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-fails.txt @@ -0,0 +1,2 @@ +dEQP-VK.glsl.builtin.precision.pow.highp.vec2,Fail +dEQP-VK.glsl.texture_functions.query.texturequerylod.sampler2d_fixed_fragment,Fail diff --git a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-skips.txt b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-skips.txt new file mode 100644 index 0000000000..c50ec15827 --- /dev/null +++ b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-asan-skips.txt @@ -0,0 +1,9 @@ +# Lots of timeouts +dEQP-VK.texture.filtering.*.combinations.* +dEQP-VK.binding_model.buffer_device_address.set3.depth3.* +dEQP-VK.ssbo.layout.*instance_array +dEQP-VK.tessellation.invariance.* +dEQP-VK.subgroups.ballot_broadcast.compute.subgroupbroadcast.* +dEQP-VK.ssbo.layout.3_level_unsized_array.scalar.* +dEQP-VK.ssbo.layout.3_level_array.scalar.* +dEQP-VK.ssbo.phys.layout.unsized_struct_array.per_block_buffer.*_instance_array_comp_access_store_cols diff --git a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-fails.txt b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-fails.txt index c62d5385a2..be2274e5d7 100644 --- a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-fails.txt +++ b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-fails.txt @@ -1,31 +1,16 @@ -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.a8b8g8r8_unorm_pack32.r16_unorm.general_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.a8b8g8r8_unorm_pack32.r16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.a8b8g8r8_unorm_pack32.r16g16_unorm.optimal_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.a8b8g8r8_unorm_pack32.r16g16_unorm.optimal_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.a8b8g8r8_unorm_pack32.r16g16b16a16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.b8g8r8a8_unorm.r16_unorm.general_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.b8g8r8a8_unorm.r16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.b8g8r8a8_unorm.r16g16_unorm.optimal_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.b8g8r8a8_unorm.r16g16_unorm.optimal_optimal_linear,Fail 
-dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.b8g8r8a8_unorm.r16g16b16a16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8_unorm.r16_unorm.general_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8_unorm.r16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8_unorm.r16g16_unorm.optimal_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8_unorm.r16g16_unorm.optimal_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8_unorm.r16g16b16a16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8_unorm.r16_unorm.general_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8_unorm.r16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8_unorm.r16g16_unorm.optimal_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8_unorm.r16g16_unorm.optimal_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8_unorm.r16g16b16a16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.a2b10g10r10_unorm_pack32.general_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.a2b10g10r10_unorm_pack32.general_linear_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.a2b10g10r10_unorm_pack32.optimal_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.r16_unorm.optimal_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.r16_unorm.optimal_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.r16g16_unorm.general_optimal_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.r16g16b16a16_unorm.linear_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.r16g16b16a16_unorm.optimal_general_linear,Fail -dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.2d.r8g8b8a8_unorm.r16g16b16a16_unorm.optimal_linear_linear,Fail +dEQP-VK.api.driver_properties.conformance_version,Fail +dEQP-VK.pipeline.sampler.exact_sampling.r16g16b16a16_sfloat.solid_color.normalized_coords.edge_right,Fail +dEQP-VK.pipeline.sampler.exact_sampling.r32g32_sfloat.solid_color.normalized_coords.centered,Fail +dEQP-VK.pipeline.sampler.exact_sampling.r32g32_sfloat.solid_color.normalized_coords.edge_left,Fail +dEQP-VK.pipeline.sampler.exact_sampling.r32g32_sfloat.solid_color.normalized_coords.edge_right,Fail dEQP-VK.rasterization.primitives.static_stipple.rectangular_line_strip_wide,Fail +dEQP-VK.rasterization.primitives.dynamic_stipple.line_strip_wide,Fail +dEQP-VK.rasterization.primitives.dynamic_stipple.rectangular_line_strip_wide,Fail +dEQP-VK.rasterization.primitives.dynamic_stipple.smooth_line_strip_wide,Fail dEQP-VK.rasterization.primitives_multisample_4_bit.dynamic_stipple.line_strip_wide,Fail +dEQP-VK.rasterization.primitives_multisample_4_bit.dynamic_stipple.rectangular_line_strip_wide,Fail +dEQP-VK.rasterization.primitives_multisample_4_bit.static_stipple.line_strip_wide,Fail +dEQP-VK.rasterization.primitives_multisample_4_bit.static_stipple.rectangular_line_strip_wide,Fail 
+dEQP-VK.rasterization.primitives.static_stipple.line_strip_wide,Fail
+dEQP-VK.rasterization.primitives.static_stipple.smooth_line_strip_wide,Fail
+dEQP-VK.spirv_assembly.instruction.spirv1p4.entrypoint.comp_workgroup_entry_point,Fail
diff --git a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-skips.txt b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-skips.txt
index c35ee1f72c..41ccb37a8d 100644
--- a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-skips.txt
+++ b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/deqp-lvp-skips.txt
@@ -5,5 +5,19 @@
 # TODO: fix me
 dEQP-VK.texture.filtering.3d.sizes.3x7x5.linear_mipmap_linear
-# Timeout (VK-GL-CTS 1.2.7.0)
+# Timeouts (VK-GL-CTS 1.2.7.0)
 dEQP-VK.tessellation.invariance.outer_edge_division.quads_fractional_even_spacing
+dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
+dEQP-VK.graphicsfuzz.spv-stable-pillars-O-op-select-to-op-phi
+dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store
+dEQP-VK.graphicsfuzz.spv-composites
+dEQP-VK.ssbo.layout.random.16bit.all_shared_buffer.40
+dEQP-VK.ssbo.phys.layout.random.16bit.scalar.13
+dEQP-VK.ssbo.phys.layout.random.all_shared_buffer.47
+dEQP-VK.tessellation.invariance.outer_edge_division.quads_equal_spacing
+dEQP-VK.tessellation.invariance.outer_triangle_set.quads_equal_spacing
+dEQP-VK.tessellation.invariance.outer_triangle_set.quads_fractional_even_spacing
+dEQP-VK.tessellation.invariance.outer_triangle_set.quads_fractional_odd_spacing
+dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_equal_spacing
+dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_even_spacing
+dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_odd_spacing
diff --git a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/gitlab-ci.yml b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/gitlab-ci.yml
index 77920b2cbe..8a243cb422 100644
--- a/mesa 3D driver/src/gallium/frontends/lavapipe/ci/gitlab-ci.yml
+++ b/mesa 3D driver/src/gallium/frontends/lavapipe/ci/gitlab-ci.yml
@@ -1,25 +1,39 @@
+.lavapipe-test:
+  stage: software-renderer
+  extends:
+    - .test-vk
+    - .lavapipe-rules
+    - .deqp-test-vk
+  variables:
+    GPU_VERSION: lvp
+    VK_DRIVER: lvp
+    DEQP_EXPECTED_RENDERER: llvmpipe
+
 lavapipe-vk:
   stage: software-renderer
   extends:
-    - .test-vk
-    - .lavapipe-rules
-    - .deqp-test-vk
+    - .lavapipe-test
   variables:
-    GPU_VERSION: lvp
-    VK_DRIVER: lvp
     DEQP_FRACTION: 10
-    DEQP_EXPECTED_RENDERER: llvmpipe
 lavapipe-nir-stress:
-  stage: software-renderer
   extends:
-    - .test-vk
-    - .lavapipe-rules
-    - .deqp-test-vk
+    - .lavapipe-test
   variables:
-    GPU_VERSION: lvp
-    VK_DRIVER: lvp
     DEQP_FRACTION: 100
-    DEQP_EXPECTED_RENDERER: llvmpipe
     NIR_TEST_CLONE: "true"
     NIR_TEST_SERIALIZE: "true"
+
+lavapipe-vk-asan:
+  extends:
+    - .lavapipe-test
+  variables:
+    GPU_VERSION: lvp-asan
+    DEQP_FRACTION: 50
+    DEQP_RUNNER_OPTIONS: "--env LD_PRELOAD=libasan.so.6"
+    # Disable the leak checks, since the library gets dlclose()d and we thus get
+    # totally useless leak reports. We can still catch buffer overflows.
+ ASAN_OPTIONS: "detect_leaks=0" + needs: + - debian/x86_test-vk + - debian-testing-asan diff --git a/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_cmd_buffer.c b/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_cmd_buffer.c index a04f7442fd..cb2d58226b 100644 --- a/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_cmd_buffer.c +++ b/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_cmd_buffer.c @@ -36,14 +36,20 @@ static VkResult lvp_create_cmd_buffer( cmd_buffer = vk_alloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (cmd_buffer == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + VkResult result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (result != VK_SUCCESS) { + vk_free(&pool->alloc, cmd_buffer); + return result; + } - vk_object_base_init(&device->vk, &cmd_buffer->base, - VK_OBJECT_TYPE_COMMAND_BUFFER); cmd_buffer->device = device; cmd_buffer->pool = pool; - list_inithead(&cmd_buffer->cmds); - cmd_buffer->last_emit = &cmd_buffer->cmds; + + cmd_buffer->queue.alloc = &pool->alloc; + list_inithead(&cmd_buffer->queue.cmds); + cmd_buffer->status = LVP_CMD_BUFFER_STATUS_INITIAL; if (pool) { list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers); @@ -58,21 +64,12 @@ static VkResult lvp_create_cmd_buffer( return VK_SUCCESS; } -static void -lvp_cmd_buffer_free_all_cmds(struct lvp_cmd_buffer *cmd_buffer) -{ - struct lvp_cmd_buffer_entry *tmp, *cmd; - LIST_FOR_EACH_ENTRY_SAFE(cmd, tmp, &cmd_buffer->cmds, cmd_link) { - list_del(&cmd->cmd_link); - vk_free(&cmd_buffer->pool->alloc, cmd); - } -} - static VkResult lvp_reset_cmd_buffer(struct lvp_cmd_buffer *cmd_buffer) { - lvp_cmd_buffer_free_all_cmds(cmd_buffer); - list_inithead(&cmd_buffer->cmds); - cmd_buffer->last_emit = &cmd_buffer->cmds; + vk_command_buffer_reset(&cmd_buffer->vk); + + vk_free_queue(&cmd_buffer->queue); + list_inithead(&cmd_buffer->queue.cmds); cmd_buffer->status = LVP_CMD_BUFFER_STATUS_INITIAL; return VK_SUCCESS; } @@ -98,7 +95,11 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_AllocateCommandBuffers( result = lvp_reset_cmd_buffer(cmd_buffer); cmd_buffer->level = pAllocateInfo->level; - vk_object_base_reset(&cmd_buffer->base); + vk_command_buffer_finish(&cmd_buffer->vk); + VkResult init_result = + vk_command_buffer_init(&cmd_buffer->vk, &device->vk); + if (init_result != VK_SUCCESS) + result = init_result; pCommandBuffers[i] = lvp_cmd_buffer_to_handle(cmd_buffer); } else { @@ -122,9 +123,9 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_AllocateCommandBuffers( static void lvp_cmd_buffer_destroy(struct lvp_cmd_buffer *cmd_buffer) { - lvp_cmd_buffer_free_all_cmds(cmd_buffer); + vk_free_queue(&cmd_buffer->queue); list_del(&cmd_buffer->pool_link); - vk_object_base_finish(&cmd_buffer->base); + vk_command_buffer_finish(&cmd_buffer->vk); vk_free(&cmd_buffer->pool->alloc, cmd_buffer); } @@ -191,7 +192,7 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateCommandPool( pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pool == NULL) - return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL); @@ -266,168 +267,246 @@ VKAPI_ATTR void VKAPI_CALL lvp_TrimCommandPool( } } -static struct lvp_cmd_buffer_entry *cmd_buf_entry_alloc_size(struct lvp_cmd_buffer *cmd_buffer, - uint32_t extra_size, - enum lvp_cmds type) +VKAPI_ATTR void VKAPI_CALL 
lvp_CmdDrawMultiEXT( + VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawInfoEXT *pVertexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride) { - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = sizeof(*cmd) + extra_size; - cmd = vk_alloc(&cmd_buffer->pool->alloc, - cmd_size, - 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); + + struct vk_cmd_queue_entry *cmd = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!cmd) - return NULL; + return; - cmd->cmd_type = type; - return cmd; -} + cmd->type = VK_CMD_DRAW_MULTI_EXT; + list_addtail(&cmd->cmd_link, &cmd_buffer->queue.cmds); -static struct lvp_cmd_buffer_entry *cmd_buf_entry_alloc(struct lvp_cmd_buffer *cmd_buffer, - enum lvp_cmds type) -{ - return cmd_buf_entry_alloc_size(cmd_buffer, 0, type); -} - -static void cmd_buf_queue(struct lvp_cmd_buffer *cmd_buffer, - struct lvp_cmd_buffer_entry *cmd) -{ - switch (cmd->cmd_type) { - case LVP_CMD_BIND_DESCRIPTOR_SETS: - case LVP_CMD_PUSH_DESCRIPTOR_SET: - list_add(&cmd->cmd_link, cmd_buffer->last_emit); - cmd_buffer->last_emit = &cmd->cmd_link; - break; - case LVP_CMD_NEXT_SUBPASS: - case LVP_CMD_DRAW: - case LVP_CMD_DRAW_INDEXED: - case LVP_CMD_DRAW_INDIRECT: - case LVP_CMD_DRAW_INDEXED_INDIRECT: - case LVP_CMD_DISPATCH: - case LVP_CMD_DISPATCH_INDIRECT: - cmd_buffer->last_emit = &cmd->cmd_link; - FALLTHROUGH; - default: - list_addtail(&cmd->cmd_link, &cmd_buffer->cmds); + cmd->u.draw_multi_ext.draw_count = drawCount; + if (pVertexInfo) { + unsigned i = 0; + cmd->u.draw_multi_ext.vertex_info = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*cmd->u.draw_multi_ext.vertex_info) * drawCount, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) + memcpy(&cmd->u.draw_multi_ext.vertex_info[i], draw, sizeof(*cmd->u.draw_multi_ext.vertex_info)); } + cmd->u.draw_multi_ext.instance_count = instanceCount; + cmd->u.draw_multi_ext.first_instance = firstInstance; + cmd->u.draw_multi_ext.stride = stride; } -static void -state_setup_attachments(struct lvp_attachment_state *attachments, - struct lvp_render_pass *pass, - const VkClearValue *clear_values) +VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawMultiIndexedEXT( + VkCommandBuffer commandBuffer, + uint32_t drawCount, + const VkMultiDrawIndexedInfoEXT *pIndexInfo, + uint32_t instanceCount, + uint32_t firstInstance, + uint32_t stride, + const int32_t *pVertexOffset) { - for (uint32_t i = 0; i < pass->attachment_count; ++i) { - struct lvp_render_pass_attachment *att = &pass->attachments[i]; - VkImageAspectFlags att_aspects = vk_format_aspects(att->format); - VkImageAspectFlags clear_aspects = 0; - if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) { - /* color attachment */ - if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT; - } - } else { - /* depthstencil attachment */ - if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && - att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT; - if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && - att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE) - clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; - } - if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) && - att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) { - clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT; - } + LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); + + struct vk_cmd_queue_entry *cmd 
= vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!cmd) + return; + + cmd->type = VK_CMD_DRAW_MULTI_INDEXED_EXT; + list_addtail(&cmd->cmd_link, &cmd_buffer->queue.cmds); + + cmd->u.draw_multi_indexed_ext.draw_count = drawCount; + + if (pIndexInfo) { + unsigned i = 0; + cmd->u.draw_multi_indexed_ext.index_info = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*cmd->u.draw_multi_indexed_ext.index_info) * drawCount, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) { + cmd->u.draw_multi_indexed_ext.index_info[i].firstIndex = draw->firstIndex; + cmd->u.draw_multi_indexed_ext.index_info[i].indexCount = draw->indexCount; + if (pVertexOffset == NULL) + cmd->u.draw_multi_indexed_ext.index_info[i].vertexOffset = draw->vertexOffset; } - attachments[i].pending_clear_aspects = clear_aspects; - if (clear_aspects) - attachments[i].clear_value = clear_values[i]; + } + + cmd->u.draw_multi_indexed_ext.instance_count = instanceCount; + cmd->u.draw_multi_indexed_ext.first_instance = firstInstance; + cmd->u.draw_multi_indexed_ext.stride = stride; + + if (pVertexOffset) { + cmd->u.draw_multi_indexed_ext.vertex_offset = vk_zalloc(cmd_buffer->queue.alloc, sizeof(*cmd->u.draw_multi_indexed_ext.vertex_offset), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + memcpy(cmd->u.draw_multi_indexed_ext.vertex_offset, pVertexOffset, sizeof(*cmd->u.draw_multi_indexed_ext.vertex_offset)); } } -VKAPI_ATTR void VKAPI_CALL lvp_CmdBeginRenderPass2( - VkCommandBuffer commandBuffer, - const VkRenderPassBeginInfo* pRenderPassBeginInfo, - const VkSubpassBeginInfo* pSubpassBeginInfo) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_render_pass, pass, pRenderPassBeginInfo->renderPass); - LVP_FROM_HANDLE(lvp_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer); - const struct VkRenderPassAttachmentBeginInfo *attachment_info = - vk_find_struct_const(pRenderPassBeginInfo->pNext, - RENDER_PASS_ATTACHMENT_BEGIN_INFO); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = pass->attachment_count * sizeof(struct lvp_attachment_state); - - if (attachment_info) - cmd_size += attachment_info->attachmentCount * sizeof(struct lvp_image_view *); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_BEGIN_RENDER_PASS); - if (!cmd) - return; - - cmd->u.begin_render_pass.render_pass = pass; - cmd->u.begin_render_pass.framebuffer = framebuffer; - cmd->u.begin_render_pass.render_area = pRenderPassBeginInfo->renderArea; - - cmd->u.begin_render_pass.attachments = (struct lvp_attachment_state *)(cmd + 1); - cmd->u.begin_render_pass.imageless_views = NULL; - if (attachment_info) { - cmd->u.begin_render_pass.imageless_views = (struct lvp_image_view **)(cmd->u.begin_render_pass.attachments + pass->attachment_count); - for (unsigned i = 0; i < attachment_info->attachmentCount; i++) - cmd->u.begin_render_pass.imageless_views[i] = lvp_image_view_from_handle(attachment_info->pAttachments[i]); - } - - state_setup_attachments(cmd->u.begin_render_pass.attachments, pass, pRenderPassBeginInfo->pClearValues); - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdNextSubpass2( - VkCommandBuffer commandBuffer, - const VkSubpassBeginInfo* pSubpassBeginInfo, - const VkSubpassEndInfo* pSubpassEndInfo) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_NEXT_SUBPASS); - if (!cmd) - return; - 
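The two multi-draw entry points above establish the recording pattern used throughout the rest of this conversion: allocate a generic vk_cmd_queue_entry, tag it with its VK_CMD_* type, append it to cmd_buffer->queue.cmds, and deep-copy every caller-owned array into VK_SYSTEM_ALLOCATION_SCOPE_COMMAND memory, because the application is free to release pVertexInfo and friends as soon as the vkCmd*() call returns. A minimal sketch of the shared skeleton, with a hypothetical helper name and the entry-type enum tag assumed from the VK_CMD_* constants used above (the patch inlines this sequence at each entry point instead):

static struct vk_cmd_queue_entry *
lvp_enqueue(struct lvp_cmd_buffer *cmd_buffer, enum vk_cmd_type type)
{
   struct vk_cmd_queue_entry *cmd =
      vk_zalloc(cmd_buffer->queue.alloc, sizeof(*cmd), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!cmd)
      return NULL;   /* out of memory: drop the command, as the patch does */

   cmd->type = type;
   list_addtail(&cmd->cmd_link, &cmd_buffer->queue.cmds);
   return cmd;
}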
- cmd->u.next_subpass.contents = pSubpassBeginInfo->contents; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdBindVertexBuffers( - VkCommandBuffer commandBuffer, - uint32_t firstBinding, - uint32_t bindingCount, - const VkBuffer* pBuffers, - const VkDeviceSize* pOffsets) -{ - lvp_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, - bindingCount, pBuffers, pOffsets, NULL, NULL); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdBindPipeline( +VKAPI_ATTR void VKAPI_CALL lvp_CmdPushDescriptorSetKHR( VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, - VkPipeline _pipeline) + VkPipelineLayout layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites) { LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_pipeline, pipeline, _pipeline); - struct lvp_cmd_buffer_entry *cmd; + struct vk_cmd_push_descriptor_set_khr *pds; - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_BIND_PIPELINE); + struct vk_cmd_queue_entry *cmd = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!cmd) return; - cmd->u.pipeline.bind_point = pipelineBindPoint; - cmd->u.pipeline.pipeline = pipeline; + pds = &cmd->u.push_descriptor_set_khr; - cmd_buf_queue(cmd_buffer, cmd); + cmd->type = VK_CMD_PUSH_DESCRIPTOR_SET_KHR; + list_addtail(&cmd->cmd_link, &cmd_buffer->queue.cmds); + + pds->pipeline_bind_point = pipelineBindPoint; + pds->layout = layout; + pds->set = set; + pds->descriptor_write_count = descriptorWriteCount; + + if (pDescriptorWrites) { + pds->descriptor_writes = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*pds->descriptor_writes) * descriptorWriteCount, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + memcpy(pds->descriptor_writes, + pDescriptorWrites, + sizeof(*pds->descriptor_writes) * descriptorWriteCount); + + for (unsigned i = 0; i < descriptorWriteCount; i++) { + switch (pds->descriptor_writes[i].descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + pds->descriptor_writes[i].pImageInfo = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(VkDescriptorImageInfo) * pds->descriptor_writes[i].descriptorCount, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + memcpy((VkDescriptorImageInfo *)pds->descriptor_writes[i].pImageInfo, + pDescriptorWrites[i].pImageInfo, + sizeof(VkDescriptorImageInfo) * pds->descriptor_writes[i].descriptorCount); + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + pds->descriptor_writes[i].pTexelBufferView = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(VkBufferView) * pds->descriptor_writes[i].descriptorCount, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + memcpy((VkBufferView *)pds->descriptor_writes[i].pTexelBufferView, + pDescriptorWrites[i].pTexelBufferView, + sizeof(VkBufferView) * pds->descriptor_writes[i].descriptorCount); + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + default: + pds->descriptor_writes[i].pBufferInfo = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(VkDescriptorBufferInfo) * pds->descriptor_writes[i].descriptorCount, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + memcpy((VkDescriptorBufferInfo *)pds->descriptor_writes[i].pBufferInfo, + 
pDescriptorWrites[i].pBufferInfo, + sizeof(VkDescriptorBufferInfo) * pds->descriptor_writes[i].descriptorCount); + break; + } + } + } +} + +VKAPI_ATTR void VKAPI_CALL lvp_CmdPushDescriptorSetWithTemplateKHR( + VkCommandBuffer commandBuffer, + VkDescriptorUpdateTemplate descriptorUpdateTemplate, + VkPipelineLayout layout, + uint32_t set, + const void* pData) +{ + LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); + LVP_FROM_HANDLE(lvp_descriptor_update_template, templ, descriptorUpdateTemplate); + size_t info_size = 0; + struct vk_cmd_queue_entry *cmd = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!cmd) + return; + + cmd->type = VK_CMD_PUSH_DESCRIPTOR_SET_WITH_TEMPLATE_KHR; + + list_addtail(&cmd->cmd_link, &cmd_buffer->queue.cmds); + + cmd->u.push_descriptor_set_with_template_khr.descriptor_update_template = descriptorUpdateTemplate; + cmd->u.push_descriptor_set_with_template_khr.layout = layout; + cmd->u.push_descriptor_set_with_template_khr.set = set; + + for (unsigned i = 0; i < templ->entry_count; i++) { + VkDescriptorUpdateTemplateEntry *entry = &templ->entry[i]; + + switch (entry->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + info_size += sizeof(VkDescriptorImageInfo) * entry->descriptorCount; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + info_size += sizeof(VkBufferView) * entry->descriptorCount; + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + default: + info_size += sizeof(VkDescriptorBufferInfo) * entry->descriptorCount; + break; + } + } + + cmd->u.push_descriptor_set_with_template_khr.data = vk_zalloc(cmd_buffer->queue.alloc, info_size, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + uint64_t offset = 0; + for (unsigned i = 0; i < templ->entry_count; i++) { + VkDescriptorUpdateTemplateEntry *entry = &templ->entry[i]; + + unsigned size = 0; + switch (entry->descriptorType) { + case VK_DESCRIPTOR_TYPE_SAMPLER: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: + size = sizeof(VkDescriptorImageInfo); + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + size = sizeof(VkBufferView); + break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + default: + size = sizeof(VkDescriptorBufferInfo); + break; + } + for (unsigned i = 0; i < entry->descriptorCount; i++) { + memcpy((uint8_t*)cmd->u.push_descriptor_set_with_template_khr.data + offset, (const uint8_t*)pData + entry->offset + i * entry->stride, size); + offset += size; + } + } } VKAPI_ATTR void VKAPI_CALL lvp_CmdBindDescriptorSets( @@ -442,1842 +521,31 @@ VKAPI_ATTR void VKAPI_CALL lvp_CmdBindDescriptorSets( { LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); LVP_FROM_HANDLE(lvp_pipeline_layout, layout, _layout); - struct lvp_cmd_buffer_entry *cmd; - struct lvp_descriptor_set **sets; - uint32_t *offsets; - int i; - uint32_t cmd_size = descriptorSetCount * sizeof(struct 
lvp_descriptor_set *) + dynamicOffsetCount * sizeof(uint32_t); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_BIND_DESCRIPTOR_SETS); + struct vk_cmd_queue_entry *cmd = vk_zalloc(cmd_buffer->queue.alloc, + sizeof(*cmd), 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!cmd) return; - cmd->u.descriptor_sets.bind_point = pipelineBindPoint; - cmd->u.descriptor_sets.first = firstSet; - cmd->u.descriptor_sets.count = descriptorSetCount; + cmd->type = VK_CMD_BIND_DESCRIPTOR_SETS; + list_addtail(&cmd->cmd_link, &cmd_buffer->queue.cmds); - for (i = 0; i < layout->num_sets; i++) - cmd->u.descriptor_sets.set_layout[i] = layout->set[i].layout; - sets = (struct lvp_descriptor_set **)(cmd + 1); - for (i = 0; i < descriptorSetCount; i++) { + /* _layout could have been destroyed by when this command executes */ + struct lvp_descriptor_set_layout **set_layout = vk_zalloc(cmd_buffer->queue.alloc, sizeof(*set_layout) * layout->num_sets, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + cmd->driver_data = set_layout; + for (unsigned i = 0; i < layout->num_sets; i++) + set_layout[i] = layout->set[i].layout; - sets[i] = lvp_descriptor_set_from_handle(pDescriptorSets[i]); + cmd->u.bind_descriptor_sets.pipeline_bind_point = pipelineBindPoint; + cmd->u.bind_descriptor_sets.first_set = firstSet; + cmd->u.bind_descriptor_sets.descriptor_set_count = descriptorSetCount; + if (pDescriptorSets) { + cmd->u.bind_descriptor_sets.descriptor_sets = vk_zalloc(cmd_buffer->queue.alloc, sizeof(*cmd->u.bind_descriptor_sets.descriptor_sets) * descriptorSetCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + memcpy(( VkDescriptorSet* )cmd->u.bind_descriptor_sets.descriptor_sets, pDescriptorSets, sizeof(*cmd->u.bind_descriptor_sets.descriptor_sets) * descriptorSetCount); } - cmd->u.descriptor_sets.sets = sets; - - cmd->u.descriptor_sets.dynamic_offset_count = dynamicOffsetCount; - offsets = (uint32_t *)(sets + descriptorSetCount); - for (i = 0; i < dynamicOffsetCount; i++) - offsets[i] = pDynamicOffsets[i]; - cmd->u.descriptor_sets.dynamic_offsets = offsets; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDraw( - VkCommandBuffer commandBuffer, - uint32_t vertexCount, - uint32_t instanceCount, - uint32_t firstVertex, - uint32_t firstInstance) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - uint32_t cmd_size = sizeof(struct pipe_draw_start_count_bias); - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_DRAW); - if (!cmd) - return; - - cmd->u.draw.instance_count = instanceCount; - cmd->u.draw.first_instance = firstInstance; - cmd->u.draw.draw_count = 1; - cmd->u.draw.draws[0].start = firstVertex; - cmd->u.draw.draws[0].count = vertexCount; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawMultiEXT( - VkCommandBuffer commandBuffer, - uint32_t drawCount, - const VkMultiDrawInfoEXT *pVertexInfo, - uint32_t instanceCount, - uint32_t firstInstance, - uint32_t stride) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - uint32_t cmd_size = drawCount * sizeof(struct pipe_draw_start_count_bias); - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_DRAW); - if (!cmd) - return; - - cmd->u.draw.instance_count = instanceCount; - cmd->u.draw.first_instance = firstInstance; - cmd->u.draw.draw_count = drawCount; - if (stride == sizeof(struct pipe_draw_start_count_bias)) - memcpy(cmd->u.draw.draws, pVertexInfo, cmd_size); - else { - unsigned i = 0; - 
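Both push-descriptor paths above repeat the same descriptor-type-to-payload mapping, once to size the allocation and once to copy the data; for example, a template entry with descriptorType VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER and descriptorCount 3 contributes 3 * sizeof(VkDescriptorImageInfo) bytes to info_size. A helper expressing that mapping would deduplicate the two switches; a sketch under that assumption (the helper is not part of this patch):

static size_t
lvp_descriptor_payload_size(VkDescriptorType type)
{
   switch (type) {
   case VK_DESCRIPTOR_TYPE_SAMPLER:
   case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
   case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
   case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
   case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
      return sizeof(VkDescriptorImageInfo);    /* pImageInfo payload */
   case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
   case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
      return sizeof(VkBufferView);             /* pTexelBufferView payload */
   default:
      return sizeof(VkDescriptorBufferInfo);   /* pBufferInfo payload */
   }
}

Note also why lvp_CmdBindDescriptorSets snapshots layout->set[i].layout into cmd->driver_data: as the comment in that hunk says, the VkPipelineLayout may legally be destroyed by the time the command buffer executes, so only the per-set layout pointers captured at record time are safe to dereference at execute time.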
vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) - memcpy(&cmd->u.draw.draws[i], draw, sizeof(struct VkMultiDrawInfoEXT)); + cmd->u.bind_descriptor_sets.dynamic_offset_count = dynamicOffsetCount; + if (pDynamicOffsets) { + cmd->u.bind_descriptor_sets.dynamic_offsets = vk_zalloc(cmd_buffer->queue.alloc, sizeof(*cmd->u.bind_descriptor_sets.dynamic_offsets) * dynamicOffsetCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + memcpy(( uint32_t* )cmd->u.bind_descriptor_sets.dynamic_offsets, pDynamicOffsets, sizeof(*cmd->u.bind_descriptor_sets.dynamic_offsets) * dynamicOffsetCount); } - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdEndRenderPass2( - VkCommandBuffer commandBuffer, - const VkSubpassEndInfo* pSubpassEndInfo) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_END_RENDER_PASS); - if (!cmd) - return; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetViewport( - VkCommandBuffer commandBuffer, - uint32_t firstViewport, - uint32_t viewportCount, - const VkViewport* pViewports) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - int i; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_VIEWPORT); - if (!cmd) - return; - - cmd->u.set_viewport.first_viewport = firstViewport; - cmd->u.set_viewport.viewport_count = viewportCount; - for (i = 0; i < viewportCount; i++) - cmd->u.set_viewport.viewports[i] = pViewports[i]; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetScissor( - VkCommandBuffer commandBuffer, - uint32_t firstScissor, - uint32_t scissorCount, - const VkRect2D* pScissors) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - int i; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_SCISSOR); - if (!cmd) - return; - - cmd->u.set_scissor.first_scissor = firstScissor; - cmd->u.set_scissor.scissor_count = scissorCount; - for (i = 0; i < scissorCount; i++) - cmd->u.set_scissor.scissors[i] = pScissors[i]; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetLineWidth( - VkCommandBuffer commandBuffer, - float lineWidth) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_LINE_WIDTH); - if (!cmd) - return; - - cmd->u.set_line_width.line_width = lineWidth; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDepthBias( - VkCommandBuffer commandBuffer, - float depthBiasConstantFactor, - float depthBiasClamp, - float depthBiasSlopeFactor) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_DEPTH_BIAS); - if (!cmd) - return; - - cmd->u.set_depth_bias.constant_factor = depthBiasConstantFactor; - cmd->u.set_depth_bias.clamp = depthBiasClamp; - cmd->u.set_depth_bias.slope_factor = depthBiasSlopeFactor; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetBlendConstants( - VkCommandBuffer commandBuffer, - const float blendConstants[4]) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_BLEND_CONSTANTS); - if (!cmd) - return; - - memcpy(cmd->u.set_blend_constants.blend_constants, blendConstants, 4 * sizeof(float)); - - 
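Every hand-written dynamic-state setter deleted in this stretch reduces, in the new scheme, to the allocate-tag-append skeleton with no deep copy at all, since these commands carry no pointer arguments; presumably that is why they need no replacement in this file and can be served by the common vk_cmd_queue machinery. For illustration only, the deleted lvp_CmdSetLineWidth re-expressed in the new pattern, with the VK_CMD_SET_LINE_WIDTH tag and u.set_line_width member names assumed by analogy with the entries above:

VKAPI_ATTR void VKAPI_CALL lvp_CmdSetLineWidth(
    VkCommandBuffer commandBuffer,
    float lineWidth)
{
   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);

   struct vk_cmd_queue_entry *cmd = vk_zalloc(cmd_buffer->queue.alloc,
                                              sizeof(*cmd), 8,
                                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!cmd)
      return;

   cmd->type = VK_CMD_SET_LINE_WIDTH;
   list_addtail(&cmd->cmd_link, &cmd_buffer->queue.cmds);

   /* plain value argument: nothing to deep-copy */
   cmd->u.set_line_width.line_width = lineWidth;
}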
cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDepthBounds( - VkCommandBuffer commandBuffer, - float minDepthBounds, - float maxDepthBounds) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_DEPTH_BOUNDS); - if (!cmd) - return; - - cmd->u.set_depth_bounds.min_depth = minDepthBounds; - cmd->u.set_depth_bounds.max_depth = maxDepthBounds; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetStencilCompareMask( - VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t compareMask) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_STENCIL_COMPARE_MASK); - if (!cmd) - return; - - cmd->u.stencil_vals.face_mask = faceMask; - cmd->u.stencil_vals.value = compareMask; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetStencilWriteMask( - VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t writeMask) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_STENCIL_WRITE_MASK); - if (!cmd) - return; - - cmd->u.stencil_vals.face_mask = faceMask; - cmd->u.stencil_vals.value = writeMask; - - cmd_buf_queue(cmd_buffer, cmd); -} - - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetStencilReference( - VkCommandBuffer commandBuffer, - VkStencilFaceFlags faceMask, - uint32_t reference) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_STENCIL_REFERENCE); - if (!cmd) - return; - - cmd->u.stencil_vals.face_mask = faceMask; - cmd->u.stencil_vals.value = reference; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdPushConstants( - VkCommandBuffer commandBuffer, - VkPipelineLayout layout, - VkShaderStageFlags stageFlags, - uint32_t offset, - uint32_t size, - const void* pValues) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, (size - 4), LVP_CMD_PUSH_CONSTANTS); - if (!cmd) - return; - - cmd->u.push_constants.stage = stageFlags; - cmd->u.push_constants.offset = offset; - cmd->u.push_constants.size = size; - memcpy(cmd->u.push_constants.val, pValues, size); - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdBindIndexBuffer( - VkCommandBuffer commandBuffer, - VkBuffer _buffer, - VkDeviceSize offset, - VkIndexType indexType) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_buffer, buffer, _buffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_BIND_INDEX_BUFFER); - if (!cmd) - return; - - cmd->u.index_buffer.buffer = buffer; - cmd->u.index_buffer.offset = offset; - cmd->u.index_buffer.index_type = indexType; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawIndexed( - VkCommandBuffer commandBuffer, - uint32_t indexCount, - uint32_t instanceCount, - uint32_t firstIndex, - int32_t vertexOffset, - uint32_t firstInstance) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - uint32_t cmd_size = sizeof(struct pipe_draw_start_count_bias); - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, 
LVP_CMD_DRAW_INDEXED); - if (!cmd) - return; - - cmd->u.draw_indexed.instance_count = instanceCount; - cmd->u.draw_indexed.first_instance = firstInstance; - cmd->u.draw_indexed.draw_count = 1; - cmd->u.draw_indexed.draws[0].start = firstIndex; - cmd->u.draw_indexed.draws[0].count = indexCount; - cmd->u.draw_indexed.draws[0].index_bias = vertexOffset; - cmd->u.draw_indexed.calc_start = true; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawMultiIndexedEXT( - VkCommandBuffer commandBuffer, - uint32_t drawCount, - const VkMultiDrawIndexedInfoEXT *pIndexInfo, - uint32_t instanceCount, - uint32_t firstInstance, - uint32_t stride, - const int32_t *pVertexOffset) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - uint32_t cmd_size = drawCount * sizeof(struct pipe_draw_start_count_bias); - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_DRAW_INDEXED); - if (!cmd) - return; - - cmd->u.draw_indexed.instance_count = instanceCount; - cmd->u.draw_indexed.first_instance = firstInstance; - cmd->u.draw_indexed.draw_count = drawCount; - cmd->u.draw_indexed.vertex_offset_changes = !pVertexOffset; - if (stride == sizeof(struct pipe_draw_start_count_bias)) - memcpy(cmd->u.draw_indexed.draws, pIndexInfo, cmd_size); - else { - unsigned i = 0; - vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) - memcpy(&cmd->u.draw_indexed.draws[i], draw, sizeof(struct pipe_draw_start_count_bias)); - } - /* only the first member is read if vertex_offset_changes is true */ - if (pVertexOffset) - cmd->u.draw_indexed.draws[0].index_bias = *pVertexOffset; - cmd->u.draw_indexed.calc_start = true; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawIndirect( - VkCommandBuffer commandBuffer, - VkBuffer _buffer, - VkDeviceSize offset, - uint32_t drawCount, - uint32_t stride) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_buffer, buf, _buffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DRAW_INDIRECT); - if (!cmd) - return; - - cmd->u.draw_indirect.offset = offset; - cmd->u.draw_indirect.buffer = buf; - cmd->u.draw_indirect.draw_count = drawCount; - cmd->u.draw_indirect.stride = stride; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawIndexedIndirect( - VkCommandBuffer commandBuffer, - VkBuffer _buffer, - VkDeviceSize offset, - uint32_t drawCount, - uint32_t stride) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_buffer, buf, _buffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DRAW_INDEXED_INDIRECT); - if (!cmd) - return; - - cmd->u.draw_indirect.offset = offset; - cmd->u.draw_indirect.buffer = buf; - cmd->u.draw_indirect.draw_count = drawCount; - cmd->u.draw_indirect.stride = stride; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDispatch( - VkCommandBuffer commandBuffer, - uint32_t x, - uint32_t y, - uint32_t z) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DISPATCH); - if (!cmd) - return; - - cmd->u.dispatch.x = x; - cmd->u.dispatch.y = y; - cmd->u.dispatch.z = z; - cmd->u.dispatch.base_x = 0; - cmd->u.dispatch.base_y = 0; - cmd->u.dispatch.base_z = 0; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdDispatchIndirect( - VkCommandBuffer 
commandBuffer, - VkBuffer _buffer, - VkDeviceSize offset) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DISPATCH_INDIRECT); - if (!cmd) - return; - - cmd->u.dispatch_indirect.buffer = lvp_buffer_from_handle(_buffer); - cmd->u.dispatch_indirect.offset = offset; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdExecuteCommands( - VkCommandBuffer commandBuffer, - uint32_t commandBufferCount, - const VkCommandBuffer* pCmdBuffers) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = commandBufferCount * sizeof(struct lvp_cmd_buffer *); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_EXECUTE_COMMANDS); - if (!cmd) - return; - - cmd->u.execute_commands.command_buffer_count = commandBufferCount; - for (unsigned i = 0; i < commandBufferCount; i++) - cmd->u.execute_commands.cmd_buffers[i] = lvp_cmd_buffer_from_handle(pCmdBuffers[i]); - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdSetEvent(VkCommandBuffer commandBuffer, - VkEvent _event, - VkPipelineStageFlags stageMask) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_event, event, _event); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_EVENT); - if (!cmd) - return; - - cmd->u.event_set.event = event; - cmd->u.event_set.value = true; - cmd->u.event_set.flush = !!(stageMask == VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT); - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdResetEvent(VkCommandBuffer commandBuffer, - VkEvent _event, - VkPipelineStageFlags stageMask) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_event, event, _event); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_EVENT); - if (!cmd) - return; - - cmd->u.event_set.event = event; - cmd->u.event_set.value = false; - cmd->u.event_set.flush = !!(stageMask == VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT); - - cmd_buf_queue(cmd_buffer, cmd); - -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdWaitEvents(VkCommandBuffer commandBuffer, - uint32_t eventCount, - const VkEvent* pEvents, - VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, - uint32_t memoryBarrierCount, - const VkMemoryBarrier* pMemoryBarriers, - uint32_t bufferMemoryBarrierCount, - const VkBufferMemoryBarrier* pBufferMemoryBarriers, - uint32_t imageMemoryBarrierCount, - const VkImageMemoryBarrier* pImageMemoryBarriers) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = 0; - - cmd_size += eventCount * sizeof(struct lvp_event *); - cmd_size += memoryBarrierCount * sizeof(VkMemoryBarrier); - cmd_size += bufferMemoryBarrierCount * sizeof(VkBufferMemoryBarrier); - cmd_size += imageMemoryBarrierCount * sizeof(VkImageMemoryBarrier); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_WAIT_EVENTS); - if (!cmd) - return; - - cmd->u.wait_events.src_stage_mask = srcStageMask; - cmd->u.wait_events.dst_stage_mask = dstStageMask; - cmd->u.wait_events.event_count = eventCount; - cmd->u.wait_events.events = (struct lvp_event **)(cmd + 1); - for (unsigned i = 0; i < eventCount; i++) - cmd->u.wait_events.events[i] = lvp_event_from_handle(pEvents[i]); - cmd->u.wait_events.memory_barrier_count = memoryBarrierCount; - 
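The deleted code in this stretch all follows lavapipe's old variable-length entry scheme: cmd_buf_entry_alloc_size() makes a single vk_alloc of sizeof(*cmd) plus extra_size, and the payload arrays are carved out of the trailing bytes with pointer arithmetic on (cmd + 1), as in the wait-events entry above. Condensed, the layout was:

/* old scheme: header and all payloads in one allocation */
struct lvp_cmd_buffer_entry *cmd =
   vk_alloc(&cmd_buffer->pool->alloc, sizeof(*cmd) + extra_size, 8,
            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
/* payload arrays live in the extra_size bytes after the header */
struct lvp_event **events = (struct lvp_event **)(cmd + 1);

One allocation per command is cheap and trivially freed, but every payload size must be known up front and the entry format stays private to lavapipe; the new code trades that for per-array vk_zalloc copies in a format the shared Vulkan runtime understands.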
cmd->u.wait_events.buffer_memory_barrier_count = bufferMemoryBarrierCount; - cmd->u.wait_events.image_memory_barrier_count = imageMemoryBarrierCount; - - /* TODO finish off this */ - cmd_buf_queue(cmd_buffer, cmd); -} - -/* copy a 2KHR struct to the base struct */ -static inline void -copy_2_struct_to_base(void *base, const void *struct2, size_t struct_size) -{ - size_t offset = align(sizeof(VkStructureType) + sizeof(void*), 8); - memcpy(base, ((uint8_t*)struct2) + offset, struct_size); -} - -/* copy an array of 2KHR structs to an array of base structs */ -#define COPY_STRUCT2_ARRAY(count, base, struct2, struct_type) \ - do { \ - for (unsigned _i = 0; _i < (count); _i++) \ - copy_2_struct_to_base(&base[_i], &struct2[_i], sizeof(struct_type)); \ - } while (0) - -VKAPI_ATTR void VKAPI_CALL lvp_CmdCopyBufferToImage2KHR( - VkCommandBuffer commandBuffer, - const VkCopyBufferToImageInfo2KHR *info) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_buffer, src_buffer, info->srcBuffer); - LVP_FROM_HANDLE(lvp_image, dst_image, info->dstImage); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = info->regionCount * sizeof(VkBufferImageCopy); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_COPY_BUFFER_TO_IMAGE); - if (!cmd) - return; - - cmd->u.buffer_to_img.src = src_buffer; - cmd->u.buffer_to_img.dst = dst_image; - cmd->u.buffer_to_img.dst_layout = info->dstImageLayout; - cmd->u.buffer_to_img.region_count = info->regionCount; - - { - VkBufferImageCopy *regions; - - regions = (VkBufferImageCopy *)(cmd + 1); - COPY_STRUCT2_ARRAY(info->regionCount, regions, info->pRegions, VkBufferImageCopy); - cmd->u.buffer_to_img.regions = regions; - } - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdCopyImageToBuffer2KHR( - VkCommandBuffer commandBuffer, - const VkCopyImageToBufferInfo2KHR *info) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_image, src_image, info->srcImage); - LVP_FROM_HANDLE(lvp_buffer, dst_buffer, info->dstBuffer); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = info->regionCount * sizeof(VkBufferImageCopy); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_COPY_IMAGE_TO_BUFFER); - if (!cmd) - return; - - cmd->u.img_to_buffer.src = src_image; - cmd->u.img_to_buffer.dst = dst_buffer; - cmd->u.img_to_buffer.src_layout = info->srcImageLayout; - cmd->u.img_to_buffer.region_count = info->regionCount; - - { - VkBufferImageCopy *regions; - - regions = (VkBufferImageCopy *)(cmd + 1); - COPY_STRUCT2_ARRAY(info->regionCount, regions, info->pRegions, VkBufferImageCopy); - cmd->u.img_to_buffer.regions = regions; - } - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdCopyImage2KHR( - VkCommandBuffer commandBuffer, - const VkCopyImageInfo2KHR *info) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_image, src_image, info->srcImage); - LVP_FROM_HANDLE(lvp_image, dest_image, info->dstImage); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = info->regionCount * sizeof(VkImageCopy); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_COPY_IMAGE); - if (!cmd) - return; - - cmd->u.copy_image.src = src_image; - cmd->u.copy_image.dst = dest_image; - cmd->u.copy_image.src_layout = info->srcImageLayout; - cmd->u.copy_image.dst_layout = info->dstImageLayout; - cmd->u.copy_image.region_count = info->regionCount; - - { - VkImageCopy *regions; - - regions = (VkImageCopy *)(cmd + 1); - 
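The copy_2_struct_to_base() helper being deleted above relies on a layout property of the Vulkan "2KHR" structs: VkBufferImageCopy2KHR, VkImageCopy2KHR and the rest are their legacy counterparts with an sType/pNext header prepended, so skipping align(sizeof(VkStructureType) + sizeof(void *), 8) bytes lands on the first legacy member. A worked instance for VkBufferImageCopy, assuming an LP64 target where the skip is align(4 + 8, 8) = 16 bytes:

/* layout: sType (4) + padding (4) + pNext (8), then bufferOffset,
 * bufferRowLength, ... exactly as in VkBufferImageCopy */
const VkBufferImageCopy2KHR *r2 = &info->pRegions[0];
VkBufferImageCopy legacy;
memcpy(&legacy, (const uint8_t *)r2 + 16, sizeof(legacy));

This only works while the two layouts stay member-for-member identical past the header, which is part of what the switch to the common queue retires.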
COPY_STRUCT2_ARRAY(info->regionCount, regions, info->pRegions, VkImageCopy); - cmd->u.copy_image.regions = regions; - } - - cmd_buf_queue(cmd_buffer, cmd); -} - - -VKAPI_ATTR void VKAPI_CALL lvp_CmdCopyBuffer2KHR( - VkCommandBuffer commandBuffer, - const VkCopyBufferInfo2KHR *info) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_buffer, src_buffer, info->srcBuffer); - LVP_FROM_HANDLE(lvp_buffer, dest_buffer, info->dstBuffer); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = info->regionCount * sizeof(VkBufferCopy); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_COPY_BUFFER); - if (!cmd) - return; - - cmd->u.copy_buffer.src = src_buffer; - cmd->u.copy_buffer.dst = dest_buffer; - cmd->u.copy_buffer.region_count = info->regionCount; - - { - VkBufferCopy *regions; - - regions = (VkBufferCopy *)(cmd + 1); - COPY_STRUCT2_ARRAY(info->regionCount, regions, info->pRegions, VkBufferCopy); - cmd->u.copy_buffer.regions = regions; - } - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdBlitImage2KHR( - VkCommandBuffer commandBuffer, - const VkBlitImageInfo2KHR *info) -{ - - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_image, src_image, info->srcImage); - LVP_FROM_HANDLE(lvp_image, dest_image, info->dstImage); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = info->regionCount * sizeof(VkImageBlit); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_BLIT_IMAGE); - if (!cmd) - return; - - cmd->u.blit_image.src = src_image; - cmd->u.blit_image.dst = dest_image; - cmd->u.blit_image.src_layout = info->srcImageLayout; - cmd->u.blit_image.dst_layout = info->dstImageLayout; - cmd->u.blit_image.filter = info->filter; - cmd->u.blit_image.region_count = info->regionCount; - - { - VkImageBlit *regions; - - regions = (VkImageBlit *)(cmd + 1); - COPY_STRUCT2_ARRAY(info->regionCount, regions, info->pRegions, VkImageBlit); - cmd->u.blit_image.regions = regions; - } - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdClearAttachments( - VkCommandBuffer commandBuffer, - uint32_t attachmentCount, - const VkClearAttachment* pAttachments, - uint32_t rectCount, - const VkClearRect* pRects) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = attachmentCount * sizeof(VkClearAttachment) + rectCount * sizeof(VkClearRect); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_CLEAR_ATTACHMENTS); - if (!cmd) - return; - - cmd->u.clear_attachments.attachment_count = attachmentCount; - cmd->u.clear_attachments.attachments = (VkClearAttachment *)(cmd + 1); - for (unsigned i = 0; i < attachmentCount; i++) - cmd->u.clear_attachments.attachments[i] = pAttachments[i]; - cmd->u.clear_attachments.rect_count = rectCount; - cmd->u.clear_attachments.rects = (VkClearRect *)(cmd->u.clear_attachments.attachments + attachmentCount); - for (unsigned i = 0; i < rectCount; i++) - cmd->u.clear_attachments.rects[i] = pRects[i]; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdFillBuffer( - VkCommandBuffer commandBuffer, - VkBuffer dstBuffer, - VkDeviceSize dstOffset, - VkDeviceSize fillSize, - uint32_t data) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_buffer, dst_buffer, dstBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_FILL_BUFFER); - if (!cmd) - return; - - cmd->u.fill_buffer.buffer 
= dst_buffer; - cmd->u.fill_buffer.offset = dstOffset; - cmd->u.fill_buffer.fill_size = fillSize; - cmd->u.fill_buffer.data = data; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdUpdateBuffer( - VkCommandBuffer commandBuffer, - VkBuffer dstBuffer, - VkDeviceSize dstOffset, - VkDeviceSize dataSize, - const void* pData) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_buffer, dst_buffer, dstBuffer); - struct lvp_cmd_buffer_entry *cmd; - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, dataSize, LVP_CMD_UPDATE_BUFFER); - if (!cmd) - return; - - cmd->u.update_buffer.buffer = dst_buffer; - cmd->u.update_buffer.offset = dstOffset; - cmd->u.update_buffer.data_size = dataSize; - memcpy(cmd->u.update_buffer.data, pData, dataSize); - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdClearColorImage( - VkCommandBuffer commandBuffer, - VkImage image_h, - VkImageLayout imageLayout, - const VkClearColorValue* pColor, - uint32_t rangeCount, - const VkImageSubresourceRange* pRanges) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_image, image, image_h); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = rangeCount * sizeof(VkImageSubresourceRange); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_CLEAR_COLOR_IMAGE); - if (!cmd) - return; - - cmd->u.clear_color_image.image = image; - cmd->u.clear_color_image.layout = imageLayout; - cmd->u.clear_color_image.clear_val = *pColor; - cmd->u.clear_color_image.range_count = rangeCount; - cmd->u.clear_color_image.ranges = (VkImageSubresourceRange *)(cmd + 1); - for (unsigned i = 0; i < rangeCount; i++) - cmd->u.clear_color_image.ranges[i] = pRanges[i]; - - cmd_buf_queue(cmd_buffer, cmd); -} - -VKAPI_ATTR void VKAPI_CALL lvp_CmdClearDepthStencilImage( - VkCommandBuffer commandBuffer, - VkImage image_h, - VkImageLayout imageLayout, - const VkClearDepthStencilValue* pDepthStencil, - uint32_t rangeCount, - const VkImageSubresourceRange* pRanges) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_image, image, image_h); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = rangeCount * sizeof(VkImageSubresourceRange); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_CLEAR_DEPTH_STENCIL_IMAGE); - if (!cmd) - return; - - cmd->u.clear_ds_image.image = image; - cmd->u.clear_ds_image.layout = imageLayout; - cmd->u.clear_ds_image.clear_val = *pDepthStencil; - cmd->u.clear_ds_image.range_count = rangeCount; - cmd->u.clear_ds_image.ranges = (VkImageSubresourceRange *)(cmd + 1); - for (unsigned i = 0; i < rangeCount; i++) - cmd->u.clear_ds_image.ranges[i] = pRanges[i]; - - cmd_buf_queue(cmd_buffer, cmd); -} - - -VKAPI_ATTR void VKAPI_CALL lvp_CmdResolveImage2KHR( - VkCommandBuffer commandBuffer, - const VkResolveImageInfo2KHR *info) -{ - LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer); - LVP_FROM_HANDLE(lvp_image, src_image, info->srcImage); - LVP_FROM_HANDLE(lvp_image, dst_image, info->dstImage); - struct lvp_cmd_buffer_entry *cmd; - uint32_t cmd_size = info->regionCount * sizeof(VkImageResolve); - - cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_RESOLVE_IMAGE); - if (!cmd) - return; - - cmd->u.resolve_image.src = src_image; - cmd->u.resolve_image.dst = dst_image; - cmd->u.resolve_image.src_layout = info->srcImageLayout; - cmd->u.resolve_image.dst_layout = info->dstImageLayout; - cmd->u.resolve_image.region_count = info->regionCount; - 
-   cmd->u.resolve_image.regions = (VkImageResolve *)(cmd + 1);
-   COPY_STRUCT2_ARRAY(info->regionCount, cmd->u.resolve_image.regions, info->pRegions, VkImageResolve);
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdResetQueryPool(
-   VkCommandBuffer commandBuffer,
-   VkQueryPool queryPool,
-   uint32_t firstQuery,
-   uint32_t queryCount)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_query_pool, query_pool, queryPool);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_RESET_QUERY_POOL);
-   if (!cmd)
-      return;
-
-   cmd->u.query.pool = query_pool;
-   cmd->u.query.query = firstQuery;
-   cmd->u.query.index = queryCount;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdBeginQueryIndexedEXT(
-   VkCommandBuffer commandBuffer,
-   VkQueryPool queryPool,
-   uint32_t query,
-   VkQueryControlFlags flags,
-   uint32_t index)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_query_pool, query_pool, queryPool);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_BEGIN_QUERY);
-   if (!cmd)
-      return;
-
-   cmd->u.query.pool = query_pool;
-   cmd->u.query.query = query;
-   cmd->u.query.index = index;
-   cmd->u.query.precise = true;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdBeginQuery(
-   VkCommandBuffer commandBuffer,
-   VkQueryPool queryPool,
-   uint32_t query,
-   VkQueryControlFlags flags)
-{
-   lvp_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdEndQueryIndexedEXT(
-   VkCommandBuffer commandBuffer,
-   VkQueryPool queryPool,
-   uint32_t query,
-   uint32_t index)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_query_pool, query_pool, queryPool);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_END_QUERY);
-   if (!cmd)
-      return;
-
-   cmd->u.query.pool = query_pool;
-   cmd->u.query.query = query;
-   cmd->u.query.index = index;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdEndQuery(
-   VkCommandBuffer commandBuffer,
-   VkQueryPool queryPool,
-   uint32_t query)
-{
-   lvp_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdWriteTimestamp(
-   VkCommandBuffer commandBuffer,
-   VkPipelineStageFlagBits pipelineStage,
-   VkQueryPool queryPool,
-   uint32_t query)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_query_pool, query_pool, queryPool);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_WRITE_TIMESTAMP);
-   if (!cmd)
-      return;
-
-   cmd->u.query.pool = query_pool;
-   cmd->u.query.query = query;
-   cmd->u.query.flush = !(pipelineStage == VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdCopyQueryPoolResults(
-   VkCommandBuffer commandBuffer,
-   VkQueryPool queryPool,
-   uint32_t firstQuery,
-   uint32_t queryCount,
-   VkBuffer dstBuffer,
-   VkDeviceSize dstOffset,
-   VkDeviceSize stride,
-   VkQueryResultFlags flags)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_query_pool, query_pool, queryPool);
-   LVP_FROM_HANDLE(lvp_buffer, buffer, dstBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_COPY_QUERY_POOL_RESULTS);
-   if (!cmd)
-      return;
-
-   cmd->u.copy_query_pool_results.pool = query_pool;
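   /* As with every vkCmd* entry point here, the arguments are only
    * snapshotted into the entry; the copy itself runs when the command
    * buffer is replayed at queue submission time. */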
-   cmd->u.copy_query_pool_results.first_query = firstQuery;
-   cmd->u.copy_query_pool_results.query_count = queryCount;
-   cmd->u.copy_query_pool_results.dst = buffer;
-   cmd->u.copy_query_pool_results.dst_offset = dstOffset;
-   cmd->u.copy_query_pool_results.stride = stride;
-   cmd->u.copy_query_pool_results.flags = flags;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdPipelineBarrier(
-   VkCommandBuffer commandBuffer,
-   VkPipelineStageFlags srcStageMask,
-   VkPipelineStageFlags destStageMask,
-   VkBool32 byRegion,
-   uint32_t memoryBarrierCount,
-   const VkMemoryBarrier* pMemoryBarriers,
-   uint32_t bufferMemoryBarrierCount,
-   const VkBufferMemoryBarrier* pBufferMemoryBarriers,
-   uint32_t imageMemoryBarrierCount,
-   const VkImageMemoryBarrier* pImageMemoryBarriers)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   uint32_t cmd_size = 0;
-
-   cmd_size += memoryBarrierCount * sizeof(VkMemoryBarrier);
-   cmd_size += bufferMemoryBarrierCount * sizeof(VkBufferMemoryBarrier);
-   cmd_size += imageMemoryBarrierCount * sizeof(VkImageMemoryBarrier);
-
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_PIPELINE_BARRIER);
-   if (!cmd)
-      return;
-
-   cmd->u.pipeline_barrier.src_stage_mask = srcStageMask;
-   cmd->u.pipeline_barrier.dst_stage_mask = destStageMask;
-   cmd->u.pipeline_barrier.by_region = byRegion;
-   cmd->u.pipeline_barrier.memory_barrier_count = memoryBarrierCount;
-   cmd->u.pipeline_barrier.buffer_memory_barrier_count = bufferMemoryBarrierCount;
-   cmd->u.pipeline_barrier.image_memory_barrier_count = imageMemoryBarrierCount;
-
-   /* TODO finish off this */
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawIndirectCount(
-   VkCommandBuffer commandBuffer,
-   VkBuffer buffer,
-   VkDeviceSize offset,
-   VkBuffer countBuffer,
-   VkDeviceSize countBufferOffset,
-   uint32_t maxDrawCount,
-   uint32_t stride)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_buffer, buf, buffer);
-   LVP_FROM_HANDLE(lvp_buffer, count_buf, countBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DRAW_INDIRECT_COUNT);
-   if (!cmd)
-      return;
-
-   cmd->u.draw_indirect_count.offset = offset;
-   cmd->u.draw_indirect_count.buffer = buf;
-   cmd->u.draw_indirect_count.count_buffer_offset = countBufferOffset;
-   cmd->u.draw_indirect_count.count_buffer = count_buf;
-   cmd->u.draw_indirect_count.max_draw_count = maxDrawCount;
-   cmd->u.draw_indirect_count.stride = stride;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawIndexedIndirectCount(
-   VkCommandBuffer commandBuffer,
-   VkBuffer buffer,
-   VkDeviceSize offset,
-   VkBuffer countBuffer,
-   VkDeviceSize countBufferOffset,
-   uint32_t maxDrawCount,
-   uint32_t stride)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_buffer, buf, buffer);
-   LVP_FROM_HANDLE(lvp_buffer, count_buf, countBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DRAW_INDEXED_INDIRECT_COUNT);
-   if (!cmd)
-      return;
-
-   cmd->u.draw_indirect_count.offset = offset;
-   cmd->u.draw_indirect_count.buffer = buf;
-   cmd->u.draw_indirect_count.count_buffer_offset = countBufferOffset;
-   cmd->u.draw_indirect_count.count_buffer = count_buf;
-   cmd->u.draw_indirect_count.max_draw_count = maxDrawCount;
-   cmd->u.draw_indirect_count.stride = stride;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdPushDescriptorSetKHR(
-   VkCommandBuffer commandBuffer,
-   VkPipelineBindPoint pipelineBindPoint,
-   VkPipelineLayout _layout,
-   uint32_t set,
-   uint32_t descriptorWriteCount,
-   const VkWriteDescriptorSet* pDescriptorWrites)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_pipeline_layout, layout, _layout);
-   struct lvp_cmd_buffer_entry *cmd;
-   int cmd_size = 0;
-
-   cmd_size += descriptorWriteCount * sizeof(struct lvp_write_descriptor);
-
-   int count_descriptors = 0;
-
-   for (unsigned i = 0; i < descriptorWriteCount; i++) {
-      count_descriptors += pDescriptorWrites[i].descriptorCount;
-   }
-   cmd_size += count_descriptors * sizeof(union lvp_descriptor_info);
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_PUSH_DESCRIPTOR_SET);
-   if (!cmd)
-      return;
-
-   cmd->u.push_descriptor_set.bind_point = pipelineBindPoint;
-   cmd->u.push_descriptor_set.layout = layout;
-   cmd->u.push_descriptor_set.set = set;
-   cmd->u.push_descriptor_set.descriptor_write_count = descriptorWriteCount;
-   cmd->u.push_descriptor_set.descriptors = (struct lvp_write_descriptor *)(cmd + 1);
-   cmd->u.push_descriptor_set.infos = (union lvp_descriptor_info *)(cmd->u.push_descriptor_set.descriptors + descriptorWriteCount);
-
-   unsigned descriptor_index = 0;
-
-   for (unsigned i = 0; i < descriptorWriteCount; i++) {
-      struct lvp_write_descriptor *desc = &cmd->u.push_descriptor_set.descriptors[i];
-
-      /* dstSet is ignored */
-      desc->dst_binding = pDescriptorWrites[i].dstBinding;
-      desc->dst_array_element = pDescriptorWrites[i].dstArrayElement;
-      desc->descriptor_count = pDescriptorWrites[i].descriptorCount;
-      desc->descriptor_type = pDescriptorWrites[i].descriptorType;
-
-      for (unsigned j = 0; j < desc->descriptor_count; j++) {
-         union lvp_descriptor_info *info = &cmd->u.push_descriptor_set.infos[descriptor_index + j];
-         switch (desc->descriptor_type) {
-         case VK_DESCRIPTOR_TYPE_SAMPLER:
-            info->sampler = lvp_sampler_from_handle(pDescriptorWrites[i].pImageInfo[j].sampler);
-            break;
-         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-            info->sampler = lvp_sampler_from_handle(pDescriptorWrites[i].pImageInfo[j].sampler);
-            info->iview = lvp_image_view_from_handle(pDescriptorWrites[i].pImageInfo[j].imageView);
-            info->image_layout = pDescriptorWrites[i].pImageInfo[j].imageLayout;
-            break;
-         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
-         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
-            info->iview = lvp_image_view_from_handle(pDescriptorWrites[i].pImageInfo[j].imageView);
-            info->image_layout = pDescriptorWrites[i].pImageInfo[j].imageLayout;
-            break;
-         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
-         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
-            info->buffer_view = lvp_buffer_view_from_handle(pDescriptorWrites[i].pTexelBufferView[j]);
-            break;
-         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-         default:
-            info->buffer = lvp_buffer_from_handle(pDescriptorWrites[i].pBufferInfo[j].buffer);
-            info->offset = pDescriptorWrites[i].pBufferInfo[j].offset;
-            info->range = pDescriptorWrites[i].pBufferInfo[j].range;
-            break;
-         }
-      }
-      descriptor_index += desc->descriptor_count;
-   }
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdPushDescriptorSetWithTemplateKHR(
-   VkCommandBuffer commandBuffer,
-   VkDescriptorUpdateTemplate descriptorUpdateTemplate,
-   VkPipelineLayout _layout,
-   uint32_t set,
-   const void* pData)
-{
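   /* Mirrors lvp_CmdPushDescriptorSetKHR() above, but reads descriptors
    * out of the raw pData blob: each template entry supplies the offset of
    * its first descriptor and a stride between array elements. */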
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   LVP_FROM_HANDLE(lvp_descriptor_update_template, templ, descriptorUpdateTemplate);
-   int cmd_size = 0;
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd_size += templ->entry_count * sizeof(struct lvp_write_descriptor);
-
-   int count_descriptors = 0;
-   for (unsigned i = 0; i < templ->entry_count; i++) {
-      VkDescriptorUpdateTemplateEntry *entry = &templ->entry[i];
-      count_descriptors += entry->descriptorCount;
-   }
-   cmd_size += count_descriptors * sizeof(union lvp_descriptor_info);
-
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_PUSH_DESCRIPTOR_SET);
-   if (!cmd)
-      return;
-
-   cmd->u.push_descriptor_set.bind_point = templ->bind_point;
-   cmd->u.push_descriptor_set.layout = templ->pipeline_layout;
-   cmd->u.push_descriptor_set.set = templ->set;
-   cmd->u.push_descriptor_set.descriptor_write_count = templ->entry_count;
-   cmd->u.push_descriptor_set.descriptors = (struct lvp_write_descriptor *)(cmd + 1);
-   cmd->u.push_descriptor_set.infos = (union lvp_descriptor_info *)(cmd->u.push_descriptor_set.descriptors + templ->entry_count);
-
-   unsigned descriptor_index = 0;
-
-   for (unsigned i = 0; i < templ->entry_count; i++) {
-      struct lvp_write_descriptor *desc = &cmd->u.push_descriptor_set.descriptors[i];
-      struct VkDescriptorUpdateTemplateEntry *entry = &templ->entry[i];
-      const uint8_t *pSrc = ((const uint8_t *) pData) + entry->offset;
-
-      /* dstSet is ignored */
-      desc->dst_binding = entry->dstBinding;
-      desc->dst_array_element = entry->dstArrayElement;
-      desc->descriptor_count = entry->descriptorCount;
-      desc->descriptor_type = entry->descriptorType;
-
-      for (unsigned j = 0; j < desc->descriptor_count; j++) {
-         union lvp_descriptor_info *info = &cmd->u.push_descriptor_set.infos[descriptor_index + j];
-         switch (desc->descriptor_type) {
-         case VK_DESCRIPTOR_TYPE_SAMPLER:
-            info->sampler = lvp_sampler_from_handle(*(VkSampler *)pSrc);
-            break;
-         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
-            VkDescriptorImageInfo *image_info = (VkDescriptorImageInfo *)pSrc;
-            info->sampler = lvp_sampler_from_handle(image_info->sampler);
-            info->iview = lvp_image_view_from_handle(image_info->imageView);
-            info->image_layout = image_info->imageLayout;
-            break;
-         }
-         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
-         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-            VkDescriptorImageInfo *image_info = (VkDescriptorImageInfo *)pSrc;
-            info->iview = lvp_image_view_from_handle(image_info->imageView);
-            info->image_layout = image_info->imageLayout;
-            break;
-         }
-         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
-         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
-            info->buffer_view = lvp_buffer_view_from_handle(*(VkBufferView *)pSrc);
-            break;
-         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-         default: {
-            VkDescriptorBufferInfo *buffer_info = (VkDescriptorBufferInfo *)pSrc;
-            info->buffer = lvp_buffer_from_handle(buffer_info->buffer);
-            info->offset = buffer_info->offset;
-            info->range = buffer_info->range;
-            break;
-         }
-         }
-         pSrc += entry->stride;
-      }
-      descriptor_index += desc->descriptor_count;
-   }
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdBindTransformFeedbackBuffersEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t firstBinding,
-   uint32_t bindingCount,
-   const VkBuffer* pBuffers,
-   const VkDeviceSize* pOffsets,
-   const VkDeviceSize* pSizes)
-{
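   /* The buffer, offset and size arrays share one trailing allocation.
    * A missing pSizes entry (or VK_WHOLE_SIZE) is resolved below to the
    * rest of the bound buffer, i.e. buffer->size - offset. */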
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   uint32_t cmd_size = 0;
-
-   cmd_size += bindingCount * (sizeof(struct lvp_buffer *) + sizeof(VkDeviceSize) * 2);
-
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_BIND_TRANSFORM_FEEDBACK_BUFFERS);
-   if (!cmd)
-      return;
-
-   cmd->u.bind_transform_feedback_buffers.first_binding = firstBinding;
-   cmd->u.bind_transform_feedback_buffers.binding_count = bindingCount;
-   cmd->u.bind_transform_feedback_buffers.buffers = (struct lvp_buffer **)(cmd + 1);
-   cmd->u.bind_transform_feedback_buffers.offsets = (VkDeviceSize *)(cmd->u.bind_transform_feedback_buffers.buffers + bindingCount);
-   cmd->u.bind_transform_feedback_buffers.sizes = (VkDeviceSize *)(cmd->u.bind_transform_feedback_buffers.offsets + bindingCount);
-
-   for (unsigned i = 0; i < bindingCount; i++) {
-      cmd->u.bind_transform_feedback_buffers.buffers[i] = lvp_buffer_from_handle(pBuffers[i]);
-      cmd->u.bind_transform_feedback_buffers.offsets[i] = pOffsets[i];
-      if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
-         cmd->u.bind_transform_feedback_buffers.sizes[i] = pSizes[i];
-      else
-         cmd->u.bind_transform_feedback_buffers.sizes[i] = cmd->u.bind_transform_feedback_buffers.buffers[i]->size - pOffsets[i];
-   }
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdBeginTransformFeedbackEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t firstCounterBuffer,
-   uint32_t counterBufferCount,
-   const VkBuffer* pCounterBuffers,
-   const VkDeviceSize* pCounterBufferOffsets)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   uint32_t cmd_size = 0;
-
-   cmd_size += counterBufferCount * (sizeof(struct lvp_buffer *) + sizeof(VkDeviceSize));
-
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_BEGIN_TRANSFORM_FEEDBACK);
-   if (!cmd)
-      return;
-
-   cmd->u.begin_transform_feedback.first_counter_buffer = firstCounterBuffer;
-   cmd->u.begin_transform_feedback.counter_buffer_count = counterBufferCount;
-   cmd->u.begin_transform_feedback.counter_buffers = (struct lvp_buffer **)(cmd + 1);
-   cmd->u.begin_transform_feedback.counter_buffer_offsets = (VkDeviceSize *)(cmd->u.begin_transform_feedback.counter_buffers + counterBufferCount);
-
-   for (unsigned i = 0; i < counterBufferCount; i++) {
-      if (pCounterBuffers)
-         cmd->u.begin_transform_feedback.counter_buffers[i] = lvp_buffer_from_handle(pCounterBuffers[i]);
-      else
-         cmd->u.begin_transform_feedback.counter_buffers[i] = NULL;
-      if (pCounterBufferOffsets)
-         cmd->u.begin_transform_feedback.counter_buffer_offsets[i] = pCounterBufferOffsets[i];
-      else
-         cmd->u.begin_transform_feedback.counter_buffer_offsets[i] = 0;
-   }
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdEndTransformFeedbackEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t firstCounterBuffer,
-   uint32_t counterBufferCount,
-   const VkBuffer* pCounterBuffers,
-   const VkDeviceSize* pCounterBufferOffsets)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   uint32_t cmd_size = 0;
-
-   cmd_size += counterBufferCount * (sizeof(struct lvp_buffer *) + sizeof(VkDeviceSize));
-
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_END_TRANSFORM_FEEDBACK);
-   if (!cmd)
-      return;
-
-   cmd->u.begin_transform_feedback.first_counter_buffer = firstCounterBuffer;
-   cmd->u.begin_transform_feedback.counter_buffer_count = counterBufferCount;
-   cmd->u.begin_transform_feedback.counter_buffers = (struct lvp_buffer **)(cmd + 1);
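   /* Note that the End variant reuses u.begin_transform_feedback; both
    * commands carry an identical payload layout. */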
-   cmd->u.begin_transform_feedback.counter_buffer_offsets = (VkDeviceSize *)(cmd->u.begin_transform_feedback.counter_buffers + counterBufferCount);
-
-   for (unsigned i = 0; i < counterBufferCount; i++) {
-      if (pCounterBuffers)
-         cmd->u.begin_transform_feedback.counter_buffers[i] = lvp_buffer_from_handle(pCounterBuffers[i]);
-      else
-         cmd->u.begin_transform_feedback.counter_buffers[i] = NULL;
-      if (pCounterBufferOffsets)
-         cmd->u.begin_transform_feedback.counter_buffer_offsets[i] = pCounterBufferOffsets[i];
-      else
-         cmd->u.begin_transform_feedback.counter_buffer_offsets[i] = 0;
-   }
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdDrawIndirectByteCountEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t instanceCount,
-   uint32_t firstInstance,
-   VkBuffer counterBuffer,
-   VkDeviceSize counterBufferOffset,
-   uint32_t counterOffset,
-   uint32_t vertexStride)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DRAW_INDIRECT_BYTE_COUNT);
-   if (!cmd)
-      return;
-
-   cmd->u.draw_indirect_byte_count.instance_count = instanceCount;
-   cmd->u.draw_indirect_byte_count.first_instance = firstInstance;
-   cmd->u.draw_indirect_byte_count.counter_buffer = lvp_buffer_from_handle(counterBuffer);
-   cmd->u.draw_indirect_byte_count.counter_buffer_offset = counterBufferOffset;
-   cmd->u.draw_indirect_byte_count.counter_offset = counterOffset;
-   cmd->u.draw_indirect_byte_count.vertex_stride = vertexStride;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDeviceMask(
-   VkCommandBuffer commandBuffer,
-   uint32_t deviceMask)
-{
-   /* No-op */
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdDispatchBase(
-   VkCommandBuffer commandBuffer,
-   uint32_t base_x,
-   uint32_t base_y,
-   uint32_t base_z,
-   uint32_t x,
-   uint32_t y,
-   uint32_t z)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_DISPATCH);
-   if (!cmd)
-      return;
-
-   cmd->u.dispatch.x = x;
-   cmd->u.dispatch.y = y;
-   cmd->u.dispatch.z = z;
-   cmd->u.dispatch.base_x = base_x;
-   cmd->u.dispatch.base_y = base_y;
-   cmd->u.dispatch.base_z = base_z;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdBeginConditionalRenderingEXT(
-   VkCommandBuffer commandBuffer,
-   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_BEGIN_CONDITIONAL_RENDERING);
-   if (!cmd)
-      return;
-
-   cmd->u.begin_conditional_rendering.buffer = lvp_buffer_from_handle(pConditionalRenderingBegin->buffer);
-   cmd->u.begin_conditional_rendering.offset = pConditionalRenderingBegin->offset;
-   cmd->u.begin_conditional_rendering.inverted = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdEndConditionalRenderingEXT(
-   VkCommandBuffer commandBuffer)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_END_CONDITIONAL_RENDERING);
-   if (!cmd)
-      return;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetCullModeEXT(
-   VkCommandBuffer commandBuffer,
-   VkCullModeFlags cullMode)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_CULL_MODE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_cull_mode.cull_mode = cullMode;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetVertexInputEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t vertexBindingDescriptionCount,
-   const VkVertexInputBindingDescription2EXT* pVertexBindingDescriptions,
-   uint32_t vertexAttributeDescriptionCount,
-   const VkVertexInputAttributeDescription2EXT* pVertexAttributeDescriptions)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   size_t binding_size = vertexBindingDescriptionCount * sizeof(VkVertexInputBindingDescription2EXT);
-   size_t attr_size = vertexAttributeDescriptionCount * sizeof(VkVertexInputAttributeDescription2EXT);
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, binding_size + attr_size, LVP_CMD_SET_VERTEX_INPUT);
-   if (!cmd)
-      return;
-
-   cmd->u.set_vertex_input.binding_count = vertexBindingDescriptionCount;
-   cmd->u.set_vertex_input.attr_count = vertexAttributeDescriptionCount;
-   memcpy(cmd->u.set_vertex_input.data, pVertexBindingDescriptions, binding_size);
-   memcpy(cmd->u.set_vertex_input.data + binding_size, pVertexAttributeDescriptions, attr_size);
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetFrontFaceEXT(
-   VkCommandBuffer commandBuffer,
-   VkFrontFace frontFace)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_FRONT_FACE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_front_face.front_face = frontFace;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetLineStippleEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t lineStippleFactor,
-   uint16_t lineStipplePattern)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_LINE_STIPPLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_line_stipple.line_stipple_factor = lineStippleFactor;
-   cmd->u.set_line_stipple.line_stipple_pattern = lineStipplePattern;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetPrimitiveTopologyEXT(
-   VkCommandBuffer commandBuffer,
-   VkPrimitiveTopology primitiveTopology)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_PRIMITIVE_TOPOLOGY);
-   if (!cmd)
-      return;
-
-   cmd->u.set_primitive_topology.prim = primitiveTopology;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetViewportWithCountEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t viewportCount,
-   const VkViewport* pViewports)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   int i;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_VIEWPORT);
-   if (!cmd)
-      return;
-
-   cmd->u.set_viewport.first_viewport = UINT32_MAX;
-   cmd->u.set_viewport.viewport_count = viewportCount;
-   for (i = 0; i < viewportCount; i++)
-      cmd->u.set_viewport.viewports[i] = pViewports[i];
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetScissorWithCountEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t scissorCount,
-   const VkRect2D* pScissors)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   int i;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_SCISSOR);
-   if (!cmd)
-      return;
-
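   /* UINT32_MAX in first_scissor presumably marks this as the "WithCount"
    * variant for the replay code, matching the first_viewport sentinel
    * used by lvp_CmdSetViewportWithCountEXT() above. */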
-   cmd->u.set_scissor.first_scissor = UINT32_MAX;
-   cmd->u.set_scissor.scissor_count = scissorCount;
-   for (i = 0; i < scissorCount; i++)
-      cmd->u.set_scissor.scissors[i] = pScissors[i];
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdBindVertexBuffers2EXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t firstBinding,
-   uint32_t bindingCount,
-   const VkBuffer* pBuffers,
-   const VkDeviceSize* pOffsets,
-   const VkDeviceSize* pSizes,
-   const VkDeviceSize* pStrides)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-   struct lvp_buffer **buffers;
-   VkDeviceSize *offsets;
-   VkDeviceSize *sizes;
-   VkDeviceSize *strides;
-   int i;
-   uint32_t array_count = pStrides ? 3 : 2;
-   uint32_t cmd_size = bindingCount * sizeof(struct lvp_buffer *) + bindingCount * array_count * sizeof(VkDeviceSize);
-
-   cmd = cmd_buf_entry_alloc_size(cmd_buffer, cmd_size, LVP_CMD_BIND_VERTEX_BUFFERS);
-   if (!cmd)
-      return;
-
-   cmd->u.vertex_buffers.first = firstBinding;
-   cmd->u.vertex_buffers.binding_count = bindingCount;
-
-   buffers = (struct lvp_buffer **)(cmd + 1);
-   offsets = (VkDeviceSize *)(buffers + bindingCount);
-   sizes = (VkDeviceSize *)(offsets + bindingCount);
-   strides = (VkDeviceSize *)(sizes + bindingCount);
-   for (i = 0; i < bindingCount; i++) {
-      buffers[i] = lvp_buffer_from_handle(pBuffers[i]);
-      offsets[i] = pOffsets[i];
-      if (pSizes)
-         sizes[i] = pSizes[i];
-      else
-         sizes[i] = 0;
-
-      if (pStrides)
-         strides[i] = pStrides[i];
-   }
-   cmd->u.vertex_buffers.buffers = buffers;
-   cmd->u.vertex_buffers.offsets = offsets;
-   cmd->u.vertex_buffers.sizes = sizes;
-   cmd->u.vertex_buffers.strides = pStrides ? strides : NULL;
-
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDepthTestEnableEXT(
-   VkCommandBuffer commandBuffer,
-   VkBool32 depthTestEnable)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_DEPTH_TEST_ENABLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_depth_test_enable.depth_test_enable = depthTestEnable;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDepthWriteEnableEXT(
-   VkCommandBuffer commandBuffer,
-   VkBool32 depthWriteEnable)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_DEPTH_WRITE_ENABLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_depth_write_enable.depth_write_enable = depthWriteEnable;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDepthCompareOpEXT(
-   VkCommandBuffer commandBuffer,
-   VkCompareOp depthCompareOp)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_DEPTH_COMPARE_OP);
-   if (!cmd)
-      return;
-
-   cmd->u.set_depth_compare_op.depth_op = depthCompareOp;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDepthBoundsTestEnableEXT(
-   VkCommandBuffer commandBuffer,
-   VkBool32 depthBoundsTestEnable)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_DEPTH_BOUNDS_TEST_ENABLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_depth_bounds_test_enable.depth_bounds_test_enable = depthBoundsTestEnable;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetStencilTestEnableEXT(
-   VkCommandBuffer commandBuffer,
-   VkBool32 stencilTestEnable)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_STENCIL_TEST_ENABLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_stencil_test_enable.stencil_test_enable = stencilTestEnable;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetStencilOpEXT(
-   VkCommandBuffer commandBuffer,
-   VkStencilFaceFlags faceMask,
-   VkStencilOp failOp,
-   VkStencilOp passOp,
-   VkStencilOp depthFailOp,
-   VkCompareOp compareOp)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_STENCIL_OP);
-   if (!cmd)
-      return;
-
-   cmd->u.set_stencil_op.face_mask = faceMask;
-   cmd->u.set_stencil_op.fail_op = failOp;
-   cmd->u.set_stencil_op.pass_op = passOp;
-   cmd->u.set_stencil_op.depth_fail_op = depthFailOp;
-   cmd->u.set_stencil_op.compare_op = compareOp;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetDepthBiasEnableEXT(
-   VkCommandBuffer commandBuffer,
-   VkBool32 depthBiasEnable)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_DEPTH_BIAS_ENABLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_depth_bias_enable.enable = depthBiasEnable == VK_TRUE;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetLogicOpEXT(
-   VkCommandBuffer commandBuffer,
-   VkLogicOp logicOp)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_LOGIC_OP);
-   if (!cmd)
-      return;
-
-   cmd->u.set_logic_op.op = logicOp;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetPatchControlPointsEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t patchControlPoints)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_PATCH_CONTROL_POINTS);
-   if (!cmd)
-      return;
-
-   cmd->u.set_patch_control_points.vertices_per_patch = patchControlPoints;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetPrimitiveRestartEnableEXT(
-   VkCommandBuffer commandBuffer,
-   VkBool32 primitiveRestartEnable)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_PRIMITIVE_RESTART_ENABLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_primitive_restart_enable.enable = primitiveRestartEnable == VK_TRUE;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetRasterizerDiscardEnableEXT(
-   VkCommandBuffer commandBuffer,
-   VkBool32 rasterizerDiscardEnable)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_RASTERIZER_DISCARD_ENABLE);
-   if (!cmd)
-      return;
-
-   cmd->u.set_rasterizer_discard_enable.enable = rasterizerDiscardEnable == VK_TRUE;
-   cmd_buf_queue(cmd_buffer, cmd);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_CmdSetColorWriteEnableEXT(
-   VkCommandBuffer commandBuffer,
-   uint32_t attachmentCount,
-   const VkBool32* pColorWriteEnables)
-{
-   LVP_FROM_HANDLE(lvp_cmd_buffer, cmd_buffer, commandBuffer);
-   struct lvp_cmd_buffer_entry *cmd;
-
-   cmd = cmd_buf_entry_alloc(cmd_buffer, LVP_CMD_SET_COLOR_WRITE_ENABLE);
-   if (!cmd)
-      return;
-
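   /* disable_mask is a bitmask over color attachments: bit i set means
    * color writes for attachment i are disabled. */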
-   cmd->u.set_color_write_enable.disable_mask = 0;
-   for (unsigned i = 0; i < attachmentCount; i++) {
-      /* this is inverted because cmdbufs are zero-initialized, meaning only 'true'
-       * can be detected with a bool, and the default is to enable color writes
-       */
-      if (pColorWriteEnables[i] != VK_TRUE)
-         cmd->u.set_color_write_enable.disable_mask |= BITFIELD_BIT(i);
-   }
-   cmd_buf_queue(cmd_buffer, cmd);
 }
diff --git a/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_descriptor_set.c b/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
index 4714947a88..9c057f37af 100644
--- a/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
+++ b/mesa 3D driver/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
@@ -65,7 +65,7 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateDescriptorSetLayout(
    set_layout = vk_zalloc2(&device->vk.alloc, pAllocator, size, 8,
                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!set_layout)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    vk_object_base_init(&device->vk, &set_layout->base,
                        VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT);
@@ -86,7 +86,7 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateDescriptorSetLayout(
    if (result != VK_SUCCESS) {
       vk_object_base_finish(&set_layout->base);
       vk_free2(&device->vk.alloc, pAllocator, set_layout);
-      return vk_error(device->instance, result);
+      return vk_error(device, result);
    }
 
    uint32_t dynamic_offset_count = 0;
@@ -218,7 +218,7 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreatePipelineLayout(
    layout = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*layout), 8,
                       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (layout == NULL)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    vk_object_base_init(&device->vk, &layout->base,
                        VK_OBJECT_TYPE_PIPELINE_LAYOUT);
@@ -271,7 +271,7 @@ lvp_descriptor_set_create(struct lvp_device *device,
    set = vk_alloc(&device->vk.alloc /* XXX: Use the pool */, size, 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!set)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    /* A descriptor set may not be 100% filled. Clear the set so we can can
    * later detect holes in it.
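/* The hunks in this file all make the same mechanical change: vk_error()
 * is passed the lvp_device itself rather than reaching through
 * device->instance. */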
@@ -489,7 +489,7 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateDescriptorPool(
    pool = vk_zalloc2(&device->vk.alloc, pAllocator, size, 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!pool)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    vk_object_base_init(&device->vk, &pool->base,
                        VK_OBJECT_TYPE_DESCRIPTOR_POOL);
@@ -559,120 +559,9 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateDescriptorUpdateTemplate(VkDevice _devi
    templ = vk_alloc2(&device->vk.alloc, pAllocator, size, 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!templ)
-      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    vk_object_base_init(&device->vk, &templ->base,
                        VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE);
 
-   templ->type = pCreateInfo->templateType;
-   templ->bind_point = pCreateInfo->pipelineBindPoint;
-   templ->set = pCreateInfo->set;
-   /* This parameter is ignored if templateType is not VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR */
-   if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR)
-      templ->pipeline_layout = lvp_pipeline_layout_from_handle(pCreateInfo->pipelineLayout);
-   else
-      templ->pipeline_layout = NULL;
-   templ->entry_count = entry_count;
-
-   VkDescriptorUpdateTemplateEntry *entries = (VkDescriptorUpdateTemplateEntry *)(templ + 1);
-   for (unsigned i = 0; i < entry_count; i++) {
-      entries[i] = pCreateInfo->pDescriptorUpdateEntries[i];
-   }
-
-   *pDescriptorUpdateTemplate = lvp_descriptor_update_template_to_handle(templ);
-   return VK_SUCCESS;
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_DestroyDescriptorUpdateTemplate(VkDevice _device,
-                                                               VkDescriptorUpdateTemplate descriptorUpdateTemplate,
-                                                               const VkAllocationCallbacks *pAllocator)
-{
-   LVP_FROM_HANDLE(lvp_device, device, _device);
-   LVP_FROM_HANDLE(lvp_descriptor_update_template, templ, descriptorUpdateTemplate);
-
-   if (!templ)
-      return;
-
-   vk_object_base_finish(&templ->base);
-   vk_free2(&device->vk.alloc, pAllocator, templ);
-}
-
-VKAPI_ATTR void VKAPI_CALL lvp_UpdateDescriptorSetWithTemplate(VkDevice _device,
-                                                               VkDescriptorSet descriptorSet,
-                                                               VkDescriptorUpdateTemplate descriptorUpdateTemplate,
-                                                               const void *pData)
-{
-   LVP_FROM_HANDLE(lvp_descriptor_set, set, descriptorSet);
-   LVP_FROM_HANDLE(lvp_descriptor_update_template, templ, descriptorUpdateTemplate);
-   uint32_t i, j;
-
-   for (i = 0; i < templ->entry_count; ++i) {
-      VkDescriptorUpdateTemplateEntry *entry = &templ->entry[i];
-      const uint8_t *pSrc = ((const uint8_t *) pData) + entry->offset;
-      const struct lvp_descriptor_set_binding_layout *bind_layout =
-         &set->layout->binding[entry->dstBinding];
-      struct lvp_descriptor *desc =
-         &set->descriptors[bind_layout->descriptor_index];
-      for (j = 0; j < entry->descriptorCount; ++j) {
-         unsigned idx = j + entry->dstArrayElement;
-         switch (entry->descriptorType) {
-         case VK_DESCRIPTOR_TYPE_SAMPLER: {
-            LVP_FROM_HANDLE(lvp_sampler, sampler,
-                            *(VkSampler *)pSrc);
-            desc[idx] = (struct lvp_descriptor) {
-               .type = VK_DESCRIPTOR_TYPE_SAMPLER,
-               .info.sampler = sampler,
-            };
-            break;
-         }
-         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
-            VkDescriptorImageInfo *info = (VkDescriptorImageInfo *)pSrc;
-            desc[idx] = (struct lvp_descriptor) {
-               .type = entry->descriptorType,
-               .info.iview = lvp_image_view_from_handle(info->imageView),
-               .info.sampler = lvp_sampler_from_handle(info->sampler),
-            };
-            break;
-         }
-         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
-         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-            LVP_FROM_HANDLE(lvp_image_view, iview,
-                            ((VkDescriptorImageInfo *)pSrc)->imageView);
-            desc[idx] = (struct lvp_descriptor) {
-               .type = entry->descriptorType,
-               .info.iview = iview,
-            };
-            break;
-         }
-         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
-         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
-            LVP_FROM_HANDLE(lvp_buffer_view, bview,
-                            *(VkBufferView *)pSrc);
-            desc[idx] = (struct lvp_descriptor) {
-               .type = entry->descriptorType,
-               .info.buffer_view = bview,
-            };
-            break;
-         }
-
-         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
-            VkDescriptorBufferInfo *info = (VkDescriptorBufferInfo *)pSrc;
-            desc[idx] = (struct lvp_descriptor) {
-               .type = entry->descriptorType,
-               .info.offset = info->offset,
-               .info.buffer = lvp_buffer_from_handle(info->buffer),
-               .info.range = info->range,
-            };
-            break;
-         }
-         default:
-            break;
-         }
-         pSrc += entry->stride;
-      }
-   }
-}
+   templ->type = pCreateInfo->
\ No newline at end of file